From 8ba7d3db2a87a8b49dc9e412180412939aa2e84f Mon Sep 17 00:00:00 2001 From: Lewis Panos Date: Wed, 20 Nov 2024 10:33:22 -0500 Subject: [PATCH 01/84] Fix legalization of stablehlo.dot_general to TTIR matmul (#1311) Add batch matmul silicon test --- .../StableHLOToTTIRPatterns.cpp | 75 ++++++++++++++++--- .../dot_general_2d.mlir} | 0 .../dot_general/dot_general_3d.mlir | 10 +++ .../dot_general_op_2d.mlir} | 4 +- .../dot_general_op_batch_matmul.mlir | 21 ++++++ 5 files changed, 96 insertions(+), 14 deletions(-) rename test/ttmlir/Conversion/StableHLOToTTIR/{dot_general_op.mlir => dot_general/dot_general_2d.mlir} (100%) create mode 100644 test/ttmlir/Conversion/StableHLOToTTIR/dot_general/dot_general_3d.mlir rename test/ttmlir/Silicon/StableHLO/{dot_general_op.mlir => dot_general/dot_general_op_2d.mlir} (82%) create mode 100644 test/ttmlir/Silicon/StableHLO/dot_general/dot_general_op_batch_matmul.mlir diff --git a/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp b/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp index 1ec8556cff..9120edc117 100644 --- a/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp +++ b/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp @@ -279,30 +279,81 @@ class StableHLOToTTIRDotGeneralOpConversionPattern ::mlir::stablehlo::DotDimensionNumbersAttr dimensions = adaptor.getDotDimensionNumbers(); - if (dimensions.getLhsContractingDimensions().empty() || - dimensions.getRhsContractingDimensions().empty()) { - return rewriter.notifyMatchFailure(srcOp, - "Contracting dimension is missing."); + if (dimensions.getLhsContractingDimensions().size() != 1 || + dimensions.getRhsContractingDimensions().size() != 1) { + return rewriter.notifyMatchFailure( + srcOp, + "LHS and RHS must have exactly 1 contracting dimension each. " + "Received LHS contracting dims: " + + std::to_string(dimensions.getLhsContractingDimensions().size()) + + ", RHS contracting dims: " + + std::to_string(dimensions.getRhsContractingDimensions().size())); + } + + // Use negative indexing to determine if this is a valid matmul since math + // is done over the final two dimensions. + int64_t lhsContractingDim = dimensions.getLhsContractingDimensions()[0] - + srcOp.getLhs().getType().getRank(); + int64_t rhsContractingDim = dimensions.getRhsContractingDimensions()[0] - + srcOp.getRhs().getType().getRank(); + + if (lhsContractingDim != -1) { + return rewriter.notifyMatchFailure( + srcOp, "Only support contracting dimensions that correspond to valid " + "matmuls. LHS contracting dimension must be " + + std::to_string(srcOp.getLhs().getType().getRank() - 1) + + ". Got " + std::to_string(lhsContractingDim)); } - if (dimensions.getLhsContractingDimensions()[0] != 1) { + if (rhsContractingDim != -2) { return rewriter.notifyMatchFailure( - srcOp, "Only non-transposed matmul is currently supported in TTIR."); + srcOp, "Only support contracting dimensions that correspond to valid " + "matmuls. RHS contracting dimension must be " + + std::to_string(srcOp.getRhs().getType().getRank() - 2) + + ". 
Got " + std::to_string(rhsContractingDim)); } - if (dimensions.getRhsContractingDimensions()[0] != 0) { + if (dimensions.getLhsBatchingDimensions() != + dimensions.getRhsBatchingDimensions()) { return rewriter.notifyMatchFailure( - srcOp, "Only non-transposed matmul is currently supported in TTIR."); + srcOp, "LHS and RHS must have same batching dimensions."); } - if (!dimensions.getLhsBatchingDimensions().empty()) { + // For the RHS, all dimensions which are not the row and column dimensions + // must be 1 OR they must be equal to the corresponding dimension in the + // LHS. If the RHS has less dimensions than the LHS we will assume that the + // missing dimensions are 1. + + auto lhsShape = srcOp.getLhs().getType().getShape().vec(); + auto rhsShape = srcOp.getRhs().getType().getShape().vec(); + + if (rhsShape.size() > lhsShape.size()) { return rewriter.notifyMatchFailure( - srcOp, "Only non-transposed matmul is currently supported in TTIR."); + srcOp, "RHS must not be a higher rank than LHS."); + } + + while (rhsShape.size() < lhsShape.size()) { + rhsShape.insert(rhsShape.begin(), 1); + } + + // Need only to check dims to the left of dim -2 on the RHS + bool allOnes = true; + bool mismatchedDims = false; + for (int32_t i = rhsShape.size() - 3; i >= 0; i--) { + if (rhsShape[i] != 1) { + allOnes = false; + } + + if (rhsShape[i] != lhsShape[i]) { + mismatchedDims = true; + } } - if (!dimensions.getRhsBatchingDimensions().empty()) { + if (mismatchedDims && !allOnes) { return rewriter.notifyMatchFailure( - srcOp, "Only non-transposed matmul is currently supported in TTIR."); + srcOp, "All dimensions in the RHS that are not the row and column " + "dimensions must be 1 OR they must all be equal to the " + "corresponding dimensions in the LHS."); } return success(); diff --git a/test/ttmlir/Conversion/StableHLOToTTIR/dot_general_op.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/dot_general/dot_general_2d.mlir similarity index 100% rename from test/ttmlir/Conversion/StableHLOToTTIR/dot_general_op.mlir rename to test/ttmlir/Conversion/StableHLOToTTIR/dot_general/dot_general_2d.mlir diff --git a/test/ttmlir/Conversion/StableHLOToTTIR/dot_general/dot_general_3d.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/dot_general/dot_general_3d.mlir new file mode 100644 index 0000000000..52e2d80016 --- /dev/null +++ b/test/ttmlir/Conversion/StableHLOToTTIR/dot_general/dot_general_3d.mlir @@ -0,0 +1,10 @@ +// REQUIRES: stablehlo +// RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | FileCheck %s +module { + func.func @main(%arg0: tensor<8x1x920xbf16>, %arg1: tensor<8x100x32xbf16>, %arg2: tensor<8x32x920xbf16>) -> tensor<8x100x920xbf16> { + %0 = stablehlo.broadcast_in_dim %arg2, dims = [0, 1, 2] : (tensor<8x32x920xbf16>) -> tensor<8x32x920xbf16> + // CHECK: %[[C:.*]] = "ttir.matmul"[[C:.*]] + %1 = stablehlo.dot_general %arg1, %0, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x100x32xbf16>, tensor<8x32x920xbf16>) -> tensor<8x100x920xbf16> + return %1 : tensor<8x100x920xbf16> + } +} diff --git a/test/ttmlir/Silicon/StableHLO/dot_general_op.mlir b/test/ttmlir/Silicon/StableHLO/dot_general/dot_general_op_2d.mlir similarity index 82% rename from test/ttmlir/Silicon/StableHLO/dot_general_op.mlir rename to test/ttmlir/Silicon/StableHLO/dot_general/dot_general_op_2d.mlir index 57a0bdcd8d..179f112b49 100644 --- a/test/ttmlir/Silicon/StableHLO/dot_general_op.mlir +++ b/test/ttmlir/Silicon/StableHLO/dot_general/dot_general_op_2d.mlir @@ -6,8 +6,8 @@ // RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > 
%t.ttnn // RUN: FileCheck --input-file=%t.mlir %s -module @jit_dot_general attributes {} { - func.func public @test_dot_general(%arg0 : tensor<16x32xf32>, %arg1 : tensor<32x8xf32>) -> tensor<16x8xf32> { +module @jit_dot_general_2d attributes {} { + func.func public @test_dot_general_2d(%arg0 : tensor<16x32xf32>, %arg1 : tensor<32x8xf32>) -> tensor<16x8xf32> { // CHECK-LABEL: func.func public @test_dot_general // CHECK: ttnn.empty // CHECK: ttnn.matmul diff --git a/test/ttmlir/Silicon/StableHLO/dot_general/dot_general_op_batch_matmul.mlir b/test/ttmlir/Silicon/StableHLO/dot_general/dot_general_op_batch_matmul.mlir new file mode 100644 index 0000000000..f23ece73ff --- /dev/null +++ b/test/ttmlir/Silicon/StableHLO/dot_general/dot_general_op_batch_matmul.mlir @@ -0,0 +1,21 @@ +// REQUIRES: stablehlo +// RUN: rm -rf %t.ttnn +// RUN: rm -rf %t.mlir +// RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | \ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" > %t.mlir +// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +// RUN: FileCheck --input-file=%t.mlir %s + +module @jit_dot_general_4d attributes {} { + func.func public @test_dot_general_4d(%arg0 : tensor<1x128x16x32xf32>, %arg1 : tensor<1x128x32x8xf32>) -> tensor<1x128x16x8xf32> { + // CHECK-LABEL: func.func public @test_dot_general + // CHECK: ttnn.empty + // CHECK: ttnn.matmul + // CHECK-SAME: tensor<1x128x16x32xf32, + // CHECK-SAME: tensor<1x128x32x8xf32, + // CHECK-SAME: tensor<1x128x16x8xf32, + // CHECK-SAME: -> tensor<1x128x16x8xf32 + %0 = stablehlo.dot_general %arg0, %arg1, batching_dims = [0, 1] x [0, 1], contracting_dims = [3] x [2] : (tensor<1x128x16x32xf32>, tensor<1x128x32x8xf32>) -> tensor<1x128x16x8xf32> + return %0 : tensor<1x128x16x8xf32> + } +} From 351a587c6aed2084be93fafa9cfde9b3b012452d Mon Sep 17 00:00:00 2001 From: Vraj Prajapati Date: Wed, 20 Nov 2024 10:07:26 -0600 Subject: [PATCH 02/84] Add cluster_descriptor.yaml to gitignore (#1331) --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 8663a2ff0e..b206279832 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ ttrt-artifacts/* query_results.json run_results.json ttrt_report.xml +cluster_descriptor.yaml From ff339a1591a355227cf9060db62809734b7f8917 Mon Sep 17 00:00:00 2001 From: Vladimir Milosevic <157983820+vmilosevic@users.noreply.github.com> Date: Wed, 20 Nov 2024 22:40:00 +0100 Subject: [PATCH 03/84] Uplift third_party/tt-metal to 2024-11-20 (#952) * Uplift third_party/tt-metal to 2024-11-20 * Change createMemoryConfig() to not return ShardSpec if INTERLEAVED - Recent tt-metal check will fire this (and use ShardSpec) even for Interleaved tensors. >> Tensor with shape ttnn.Shape([1, 100[128], 3[32], 100[128]]) cannot be sharded because alignment will have rank greater than 2! 
--------- Co-authored-by: Kyle Mabee --- .../include/tt/runtime/ttnn/operations/utils.cpp | 12 +++++++++--- third_party/CMakeLists.txt | 2 +- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.cpp b/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.cpp index 435607b87e..c595fe26bc 100644 --- a/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.cpp +++ b/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.cpp @@ -125,7 +125,10 @@ createMemoryConfig(const ::tt::target::TensorRef *tensorRef) { ::tt::tt_metal::BufferType ttnnBufferType = ::tt::runtime::ttnn::utils::toTTNNBufferType(targetMemorySpace); - return {ttnnMemLayout, ttnnBufferType, shardSpec}; + return {ttnnMemLayout, ttnnBufferType, + ttnnMemLayout == tt_metal::TensorMemoryLayout::INTERLEAVED + ? std::nullopt + : std::make_optional(shardSpec)}; } // Prefer to use this method over the one above @@ -169,8 +172,11 @@ createMemoryConfig(const ::tt::target::MemoryConfigDesc *memcfg, ttnnCoreRangeSet, ttnnShardShape, ::tt::tt_metal::ShardOrientation::ROW_MAJOR, false); - ::ttnn::MemoryConfig memoryConfig = {tensorMemoryLayout, bufferType, - shardSpec}; + ::ttnn::MemoryConfig memoryConfig = { + tensorMemoryLayout, bufferType, + tensorMemoryLayout == tt_metal::TensorMemoryLayout::INTERLEAVED + ? std::nullopt + : std::make_optional(shardSpec)}; return memoryConfig; } diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt index b1c679a507..4e61755666 100644 --- a/third_party/CMakeLists.txt +++ b/third_party/CMakeLists.txt @@ -1,6 +1,6 @@ include(ExternalProject) -set(TT_METAL_VERSION "f16cadfabebd7654baef73e4ac2c3240b12b0d1d") +set(TT_METAL_VERSION "89fc3ba835854773eaec4274da10044718dee429") if ("$ENV{ARCH_NAME}" STREQUAL "grayskull") set(ARCH_NAME "grayskull") From 548b06326bf1e48c90e55b92523675ab471676bd Mon Sep 17 00:00:00 2001 From: Muhammad Asif Manzoor Date: Wed, 20 Nov 2024 20:54:55 -0500 Subject: [PATCH 04/84] Minor fixes for stablehlo.gather op (#1337) * Use updated operands during stableHLO to TTIR conversion * Add check to ensure the input argument data type is bfloat16 * Add stablehlo runtime test for gather op --- .../StableHLOToTTIRPatterns.cpp | 10 ++--- .../TTIRToTTIRDecomposition.cpp | 8 ++++ .../Conversion/StableHLOToTTIR/gather_op.mlir | 17 +++++++ .../TTNN/embedding/gather_to_embedding.mlir | 24 +++++----- .../gather_to_embedding_negative.mlir | 22 +++++++++ test/ttmlir/Silicon/StableHLO/gather_op.mlir | 45 +++++++++++++++++++ 6 files changed, 109 insertions(+), 17 deletions(-) create mode 100644 test/ttmlir/Silicon/StableHLO/gather_op.mlir diff --git a/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp b/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp index 9120edc117..28bf4f71de 100644 --- a/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp +++ b/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp @@ -850,7 +850,7 @@ class StableHLOToTTIRBroadcastInDimOpConversionPattern llvm::SmallVector broadcastedShape; auto srcType = - getTypeConverter()->convertType(srcOp.getOperand().getType()); + getTypeConverter()->convertType(adaptor.getOperand().getType()); auto inputShape = mlir::cast(srcType).getShape(); auto outputShape = mlir::cast(srcType).getShape(); @@ -996,8 +996,8 @@ class StableHLOToTTIRConcatOpConversionPattern "ConcatOp dimension is too large."); } - auto rankedTensorType = - mlir::dyn_cast(srcOp.getOperand(0).getType()); + auto 
rankedTensorType = mlir::dyn_cast( + adaptor.getOperands()[0].getType()); if (static_cast(adaptor.getDimension()) >= rankedTensorType.getRank()) { return rewriter.notifyMatchFailure(srcOp, @@ -1185,8 +1185,8 @@ class StableHLOToTTIRGatherOpConversionPattern auto dimensionNumbers = srcOp.getDimensionNumbers(); rewriter.replaceOpWithNewOp( - srcOp, outputType, srcOp.getOperands()[0], - srcOp.getOperands()[1], // Start indices + srcOp, outputType, adaptor.getOperands()[0], + adaptor.getOperands()[1], // Start indices Value(outputTensor), dimensionNumbers.getOffsetDims(), dimensionNumbers.getCollapsedSliceDims(), dimensionNumbers.getOperandBatchingDims(), diff --git a/lib/Conversion/TTIRToTTIRDecomposition/TTIRToTTIRDecomposition.cpp b/lib/Conversion/TTIRToTTIRDecomposition/TTIRToTTIRDecomposition.cpp index 9b8c634adb..91fa520dd8 100644 --- a/lib/Conversion/TTIRToTTIRDecomposition/TTIRToTTIRDecomposition.cpp +++ b/lib/Conversion/TTIRToTTIRDecomposition/TTIRToTTIRDecomposition.cpp @@ -16,6 +16,7 @@ #include "mlir/Transforms/DialectConversion.h" #include +#include using namespace mlir; using namespace mlir::tt; @@ -407,6 +408,13 @@ struct GatherToEmbeddingConversionPattern // collapsed slice dims of the gather op auto collapsedSliceDims = op.getCollapsedSliceDims(); + RankedTensorType operandType = + mlir::cast(op->getOperand(0).getType()); + if (!operandType.getElementType().isBF16()) { + return rewriter.notifyMatchFailure( + op, "only supports bfloat16 input tensor."); + } + if (shape.size() > 1) { auto hiddenDim = shape[shape.size() - 1]; // check if sliceSizes has more than one element diff --git a/test/ttmlir/Conversion/StableHLOToTTIR/gather_op.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/gather_op.mlir index ba29d123e8..e80bb75886 100644 --- a/test/ttmlir/Conversion/StableHLOToTTIR/gather_op.mlir +++ b/test/ttmlir/Conversion/StableHLOToTTIR/gather_op.mlir @@ -8,6 +8,7 @@ module @jit_gather attributes {} { // CHECK: %[[C:.*]] = "ttir.gather"[[C:.*]] return %0 : tensor<1x32x1024xf32> } + func.func public @test_gather_1(%operand: tensor<448x384xf32>, %start_indices: tensor<1x2x1xi32>) -> tensor<1x2x384xf32> { %0 = "stablehlo.gather"(%operand, %start_indices) <{dimension_numbers = #stablehlo.gather, indices_are_sorted = false, slice_sizes = array}> : (tensor<448x384xf32>, tensor<1x2x1xi32>) -> tensor<1x2x384xf32> // CHECK: %[[C:.*]] = tensor.empty[[C:.*]] @@ -22,4 +23,20 @@ module @jit_gather attributes {} { return %0 : tensor<1x2x384xf32> } + func.func public @test_gather_3(%arg0: tensor<32128x512xbf16>, %arg1: tensor<1x15xi64>) -> tensor<1x15x512xbf16> { + // CHECK: %[[EMPTY:[0-9]+]] = tensor.empty() : tensor<1x15x512xbf16> + // CHECK: %[[VAL:[0-9]+]] = "ttir.gather"(%arg0, %arg1, %[[EMPTY]]) + // CHECK-SAME: collapsed_slice_dims = array, + // CHECK-SAME: index_vector_dim = 2 : si64, + // CHECK-SAME: indices_are_sorted = false, + // CHECK-SAME: offset_dims = array, + // CHECK-SAME: operand_batching_dims = array, + // CHECK-SAME: slice_sizes = array, + // CHECK-SAME: start_index_map = array, + // CHECK-SAME: start_indices_batching_dims = array + // CHECK-SAME: (tensor<32128x512xbf16>, tensor<1x15xi32>, tensor<1x15x512xbf16>) -> tensor<1x15x512xbf16> + %0 = "stablehlo.gather"(%arg0, %arg1) <{dimension_numbers = #stablehlo.gather, indices_are_sorted = false, slice_sizes = array}> : (tensor<32128x512xbf16>, tensor<1x15xi64>) -> tensor<1x15x512xbf16> + // CEHCK: return %[[VAL]] : tensor<1x15x512xbf16> + return %0 : tensor<1x15x512xbf16> + } } diff --git 
a/test/ttmlir/Dialect/TTNN/embedding/gather_to_embedding.mlir b/test/ttmlir/Dialect/TTNN/embedding/gather_to_embedding.mlir index dfbf99008d..6404ee6e94 100644 --- a/test/ttmlir/Dialect/TTNN/embedding/gather_to_embedding.mlir +++ b/test/ttmlir/Dialect/TTNN/embedding/gather_to_embedding.mlir @@ -1,9 +1,9 @@ // RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline %s | FileCheck %s #any_device = #tt.operand_constraint module attributes {} { - func.func @gather_0(%operand: tensor<32000x1024xf32>, %start_indices: tensor<1x32xi32>) -> tensor<1x32x1024xf32> { + func.func @gather_0(%operand: tensor<32000x1024xbf16>, %start_indices: tensor<1x32xi32>) -> tensor<1x32x1024xbf16> { // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] - %0 = tensor.empty() : tensor<1x32x1024xf32> + %0 = tensor.empty() : tensor<1x32x1024xbf16> // CHECK: %[[C:.*]] = "ttnn.embedding"[[C:.*]] %1 = "ttir.gather"(%operand, %start_indices, %0) { offset_dims = array, @@ -15,13 +15,13 @@ module attributes {} { slice_sizes = array, indices_are_sorted = false, operand_constraints = [#any_device, #any_device, #any_device] - } : (tensor<32000x1024xf32>, tensor<1x32xi32>, tensor<1x32x1024xf32>) -> tensor<1x32x1024xf32> - return %1 : tensor<1x32x1024xf32> + } : (tensor<32000x1024xbf16>, tensor<1x32xi32>, tensor<1x32x1024xbf16>) -> tensor<1x32x1024xbf16> + return %1 : tensor<1x32x1024xbf16> } - func.func @gather_1(%operand: tensor<448x384xf32>, %start_indices: tensor<1x2x1xi32>) -> tensor<1x2x384xf32> { + func.func @gather_1(%operand: tensor<448x384xbf16>, %start_indices: tensor<1x2x1xi32>) -> tensor<1x2x384xbf16> { // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] - %0 = tensor.empty() : tensor<1x2x384xf32> + %0 = tensor.empty() : tensor<1x2x384xbf16> // CHECK: %[[C:.*]] = "ttnn.embedding"[[C:.*]] %1 = "ttir.gather"(%operand, %start_indices, %0) <{ offset_dims = array, @@ -33,13 +33,13 @@ module attributes {} { slice_sizes = array, indices_are_sorted = false, operand_constraints = [#any_device, #any_device, #any_device] - }> : (tensor<448x384xf32>, tensor<1x2x1xi32>, tensor<1x2x384xf32>) -> tensor<1x2x384xf32> - return %1 : tensor<1x2x384xf32> + }> : (tensor<448x384xbf16>, tensor<1x2x1xi32>, tensor<1x2x384xbf16>) -> tensor<1x2x384xbf16> + return %1 : tensor<1x2x384xbf16> } - func.func @gather_2(%operand: tensor<51864x384xf32>, %start_indices: tensor<1x2xi32>) -> tensor<1x2x384xf32> { + func.func @gather_2(%operand: tensor<51864x384xbf16>, %start_indices: tensor<1x2xi32>) -> tensor<1x2x384xbf16> { // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] - %0 = tensor.empty() : tensor<1x2x384xf32> + %0 = tensor.empty() : tensor<1x2x384xbf16> // CHECK: %[[C:.*]] = "ttnn.embedding"[[C:.*]] %1 = "ttir.gather"(%operand, %start_indices, %0) <{ offset_dims = array, @@ -51,7 +51,7 @@ module attributes {} { slice_sizes = array, indices_are_sorted = false, operand_constraints = [#any_device, #any_device, #any_device] - }> : (tensor<51864x384xf32>, tensor<1x2xi32>, tensor<1x2x384xf32>) -> tensor<1x2x384xf32> - return %1 : tensor<1x2x384xf32> + }> : (tensor<51864x384xbf16>, tensor<1x2xi32>, tensor<1x2x384xbf16>) -> tensor<1x2x384xbf16> + return %1 : tensor<1x2x384xbf16> } } diff --git a/test/ttmlir/Dialect/TTNN/embedding/gather_to_embedding_negative.mlir b/test/ttmlir/Dialect/TTNN/embedding/gather_to_embedding_negative.mlir index 2a06bf92b6..44ffea73ef 100644 --- a/test/ttmlir/Dialect/TTNN/embedding/gather_to_embedding_negative.mlir +++ b/test/ttmlir/Dialect/TTNN/embedding/gather_to_embedding_negative.mlir @@ -110,3 +110,25 @@ module attributes {} { return %1 : tensor<1x2x384xf32> } 
} + +// Verify that the parsing fails for data type other than bfloat16. +// ----- +#any_device = #tt.operand_constraint +module attributes {} { + func.func @gather_0(%operand: tensor<32000x1024xf32>, %start_indices: tensor<1x32xi32>) -> tensor<1x32x1024xf32> { + %0 = tensor.empty() : tensor<1x32x1024xf32> + // CHECK: error: failed to legalize operation 'ttir.gather' that was explicitly marked illegal + %1 = "ttir.gather"(%operand, %start_indices, %0) { + offset_dims = array, + collapsed_slice_dims = array, + operand_batching_dims = array, + start_indices_batching_dims = array, + start_index_map = array, + index_vector_dim = 1 : si64, + slice_sizes = array, + indices_are_sorted = false, + operand_constraints = [#any_device, #any_device, #any_device] + } : (tensor<32000x1024xf32>, tensor<1x32xi32>, tensor<1x32x1024xf32>) -> tensor<1x32x1024xf32> + return %1 : tensor<1x32x1024xf32> + } +} diff --git a/test/ttmlir/Silicon/StableHLO/gather_op.mlir b/test/ttmlir/Silicon/StableHLO/gather_op.mlir new file mode 100644 index 0000000000..9a4a90b1b6 --- /dev/null +++ b/test/ttmlir/Silicon/StableHLO/gather_op.mlir @@ -0,0 +1,45 @@ +// REQUIRES: stablehlo +// RUN: rm -rf %t.ttnn +// RUN: rm -rf %t.mlir +// RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | \ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" > %t.mlir +// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +// RU1N: FileCheck --input-file=%t.mlir %s + +module @jit_gather attributes {} { + func.func public @test_gather_0(%operand: tensor<32000x1024xbf16>, %start_indices: tensor<1x32xi32>) -> tensor<1x32x1024xbf16> { + // CHECK-LABEL: func.func public @test_gather_0 + // CHECK: ttnn.empty + // CHECK: ttnn.embedding + // CHECK-SAME: tensor<1x32xi32, + // CHECK-SAME: tensor<1x32x1024xbf16 + // CHECK-SAME: tensor<32000x1024xbf16, + // CHECK-SAME: -> tensor<1x32x1024xbf16 + %0 = "stablehlo.gather"(%operand, %start_indices) <{dimension_numbers = #stablehlo.gather, indices_are_sorted = false, slice_sizes = array}> : (tensor<32000x1024xbf16>, tensor<1x32xi32>) -> tensor<1x32x1024xbf16> + return %0 : tensor<1x32x1024xbf16> + } + + func.func public @test_gather_1(%operand: tensor<51864x384xbf16>, %start_indices: tensor<1x2xi32>) -> tensor<1x2x384xbf16> { + // CHECK-LABEL: func.func public @test_gather_1 + // CHECK: ttnn.empty + // CHECK: ttnn.embedding + // CHECK-SAME: tensor<1x2xi32, + // CHECK-SAME: tensor<1x2x384xbf16 + // CHECK-SAME: tensor<51864x384xbf16, + // CHECK-SAME: -> tensor<1x2x384xbf16 + %0 = "stablehlo.gather"(%operand, %start_indices) <{dimension_numbers = #stablehlo.gather, indices_are_sorted = false, slice_sizes = array}> : (tensor<51864x384xbf16>, tensor<1x2xi32>) -> tensor<1x2x384xbf16> + return %0 : tensor<1x2x384xbf16> + } + + func.func public @test_gather_2(%operand: tensor<32128x512xbf16>, %start_indices: tensor<1x15xi64>) -> tensor<1x15x512xbf16> { + // CHECK-LABEL: func.func public @test_gather_2 + // CHECK: ttnn.empty + // CHECK: ttnn.embedding + // CHECK-SAME: tensor<1x16xi32, + // CHECK-SAME: tensor<1x15x512xbf16 + // CHECK-SAME: tensor<32128x512xbf16, + // CHECK-SAME: -> tensor<1x15x512xbf16 + %0 = "stablehlo.gather"(%operand, %start_indices) <{dimension_numbers = #stablehlo.gather, indices_are_sorted = false, slice_sizes = array}> : (tensor<32128x512xbf16>, tensor<1x15xi64>) -> tensor<1x15x512xbf16> + return %0 : tensor<1x15x512xbf16> + } +} From 9d731b7484155611b878c17e69cf383d2c59fdae Mon Sep 17 00:00:00 2001 From: Vladimir Milosevic 
<157983820+vmilosevic@users.noreply.github.com> Date: Thu, 21 Nov 2024 10:01:21 +0100 Subject: [PATCH 05/84] Uplift third_party/tt-metal to 2024-11-21 (#1361) --- third_party/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt index 4e61755666..682f559024 100644 --- a/third_party/CMakeLists.txt +++ b/third_party/CMakeLists.txt @@ -1,6 +1,6 @@ include(ExternalProject) -set(TT_METAL_VERSION "89fc3ba835854773eaec4274da10044718dee429") +set(TT_METAL_VERSION "b057e090e19c2f18e209817b8de538209765db6d") if ("$ENV{ARCH_NAME}" STREQUAL "grayskull") set(ARCH_NAME "grayskull") From e78bb00a12e9f329e5233a882eb5587c66262815 Mon Sep 17 00:00:00 2001 From: Radenko Pavlovic <133032400+rpavlovicTT@users.noreply.github.com> Date: Thu, 21 Nov 2024 14:42:32 +0100 Subject: [PATCH 06/84] Select op support (#1338) Select op support * Add select op to TTIR * Decompose select op to TTIR slices and concat optionally * Unit tests --- include/ttmlir/Dialect/TTIR/IR/TTIROps.td | 27 ++++ .../TTIRToTTIRDecomposition.cpp | 115 +++++++++++++++++ .../TTIRToTTIRDecompositionPass.cpp | 1 + lib/Dialect/TTIR/IR/TTIROps.cpp | 94 ++++++++++++++ .../select_decomposition_tests.mlir | 26 ++++ .../TTIR/select/select_tests_negative.mlir | 116 ++++++++++++++++++ .../TTIR/select/select_tests_positive.mlir | 44 +++++++ 7 files changed, 423 insertions(+) create mode 100644 test/ttmlir/Dialect/TTIR/decompositions/select_decomposition_tests.mlir create mode 100644 test/ttmlir/Dialect/TTIR/select/select_tests_negative.mlir create mode 100644 test/ttmlir/Dialect/TTIR/select/select_tests_positive.mlir diff --git a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td index 489fd2faa9..8782f63ae1 100644 --- a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td +++ b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td @@ -925,6 +925,33 @@ def TTIR_SliceOp: TTIR_DPSOp<"slice"> { let hasVerifier = 1; } +def TTIR_SelectOp: TTIR_DPSOp<"select"> { + let summary = "Select op."; + let description = [{ + Extracts a sub-tensor (slice) from the input tensor along a specified dimension in few steps defined by the + `begin`, `length`, and `stride` attributes. + The `begin` specifies the start index for the selected dimension of the tensor. + The `length` specifies the number of elements to extract from the input tensor along the selected dimension. + The `stride` specifies the step size for the start index. The default value is 0. 0 means no stride. 
+ }]; + + let arguments = (ins AnyRankedTensor:$input, + AnyRankedTensor:$output, + SI32Attr:$dim, + SI32Attr:$begin, + SI32Attr:$length, + DefaultValuedOptionalAttr:$stride, + TT_OperandConstraintArrayAttr:$operand_constraints); + + let results = (outs AnyRankedTensor:$result); + + let extraClassDeclaration = [{ + MutableOperandRange getDpsInitsMutable() { return getOutputMutable(); } + }]; + + let hasVerifier = 1; +} + // ANCHOR: decomposing_an_op_index_ttir def TTIR_IndexOp: TTIR_DPSOp<"index"> { let summary = "Index op."; diff --git a/lib/Conversion/TTIRToTTIRDecomposition/TTIRToTTIRDecomposition.cpp b/lib/Conversion/TTIRToTTIRDecomposition/TTIRToTTIRDecomposition.cpp index 91fa520dd8..9c5afd41e6 100644 --- a/lib/Conversion/TTIRToTTIRDecomposition/TTIRToTTIRDecomposition.cpp +++ b/lib/Conversion/TTIRToTTIRDecomposition/TTIRToTTIRDecomposition.cpp @@ -783,6 +783,120 @@ class GetDimensionSizeToConstantConversionPattern } }; +// SelectOp is converted to a series of SliceOp and potentially a ConcatOp if +// the sliced dimension is sliced multiple times. For example, if the input +// tensor is +// [[[1, 2, 3], +// [4, 5, 6], +// [7, 8, 9], +// [10, 11, 12], +// [13, 14, 15], +// [16, 17, 18]], +// [[19, 20, 21], +// [22, 23, 24], +// [25, 26, 27], +// [28, 29, 30], +// [31, 32, 33], +// [34, 35, 36]]], +// shape = [2, 6, 3] +// and the SelectOp is dim=1, begin=0, length=2, stride=4, the output tensor +// will be +// [[[1, 2, 3], +// [4, 5, 6], +// [13, 14, 15], +// [16, 17, 18]], +// [[19, 20, 21], +// [22, 23, 24], +// [31, 32, 33], +// [34, 35, 36]]], +// shape = [2, 4, 3] +// In this case 2 slices are created and concatenated to form the output tensor. +// First slice has begins=[0, 0, 0], ends=[2, 2, 3], steps=[1, 1, 1], and the +// second slice has begins=[0, 4, 0], ends=[2, 6, 3], steps=[1, 1, 1]. +struct SelectToSliceConversionPattern + : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(ttir::SelectOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + auto inputType = mlir::cast(adaptor.getInput().getType()); + auto outputType = mlir::cast(op.getType()); + + auto inputShape = inputType.getShape(); + + int32_t dim = + op.getDim() < 0 ? inputType.getRank() + op.getDim() : op.getDim(); + + int32_t begin = op.getBegin(); + int32_t length = op.getLength(); + int32_t stride = op.getStride(); + + int32_t inputDimSize = inputType.getShape()[dim]; + int32_t numSlices = (inputDimSize - begin + stride - 1) / stride; + + llvm::SmallVector begins, ends, steps; + for (int32_t i = 0; i < inputType.getRank(); ++i) { + // Always slicing with step 1. + steps.push_back(1); + if (i == dim) { + // Push placeholder values for now which will be updated later. + begins.push_back(0); + ends.push_back(0); + continue; + } + + // For non-sliced dimensions, begin=0, end=dimSize, step=1. + begins.push_back(0); + ends.push_back(inputType.getDimSize(i)); + } + + // Create a slice for each slice of the input tensor. The slices are then + // concatenated. The slices are created by updating the begin and end values + // for the sliced dimension. + llvm::SmallVector slices; + for (int32_t i = 0; i < numSlices; ++i) { + int32_t newBegin = begin + i * stride; + int32_t newEnd = std::min(newBegin + length, inputDimSize); + + // Make a copy of the input shape and update the dim size. 
+ llvm::SmallVector resultShape(inputShape); + resultShape[dim] = newEnd - newBegin; + auto resultType = + RankedTensorType::get(resultShape, inputType.getElementType()); + + auto sliceDpsResult = rewriter.create( + op.getLoc(), resultShape, inputType.getElementType()); + + begins[dim] = newBegin; + ends[dim] = newEnd; + + auto newOp = rewriter.create( + op.getLoc(), resultType, adaptor.getInput(), sliceDpsResult, + rewriter.getI32ArrayAttr(begins), rewriter.getI32ArrayAttr(ends), + rewriter.getI32ArrayAttr(steps), adaptor.getOperandConstraints()); + slices.push_back(newOp->getResult(0)); + } + + assert(!slices.empty()); + if (slices.size() > 1) { + auto concatDpsResult = rewriter.create( + op.getLoc(), outputType.getShape(), outputType.getElementType()); + auto concatOp = rewriter.create( + op.getLoc(), outputType, slices, concatDpsResult, + rewriter.getSI32IntegerAttr(dim), adaptor.getOperandConstraints()); + + rewriter.replaceOp(op, concatOp.getResult()); + } else { + rewriter.replaceOp(op, slices[0]); + } + + return success(); + } +}; + void populateTTIRToTTIRDecompositionPatterns(MLIRContext *ctx, RewritePatternSet &patterns, TypeConverter &typeConverter) { @@ -791,6 +905,7 @@ void populateTTIRToTTIRDecompositionPatterns(MLIRContext *ctx, patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); } } // namespace mlir::tt diff --git a/lib/Conversion/TTIRToTTIRDecomposition/TTIRToTTIRDecompositionPass.cpp b/lib/Conversion/TTIRToTTIRDecomposition/TTIRToTTIRDecompositionPass.cpp index 76cbae96e2..d91084f59d 100644 --- a/lib/Conversion/TTIRToTTIRDecomposition/TTIRToTTIRDecompositionPass.cpp +++ b/lib/Conversion/TTIRToTTIRDecomposition/TTIRToTTIRDecompositionPass.cpp @@ -51,6 +51,7 @@ struct TTIRToTTIRDecompositionPass target.addIllegalOp(); target.addIllegalOp(); target.addIllegalOp(); + target.addIllegalOp(); TypeConverter typeConverter; // All types map 1:1. 
diff --git a/lib/Dialect/TTIR/IR/TTIROps.cpp b/lib/Dialect/TTIR/IR/TTIROps.cpp index ec415e090b..5946cb2fe3 100644 --- a/lib/Dialect/TTIR/IR/TTIROps.cpp +++ b/lib/Dialect/TTIR/IR/TTIROps.cpp @@ -596,6 +596,100 @@ ::mlir::LogicalResult mlir::tt::ttir::IndexOp::verify() { } // ANCHOR_END: decomposing_an_op_index_ttir_verify +//===----------------------------------------------------------------------===// +// SelectOp +//===----------------------------------------------------------------------===// + +// SelectOp verification +::mlir::LogicalResult mlir::tt::ttir::SelectOp::verify() { + ::mlir::RankedTensorType inputType = getInput().getType(); + ::mlir::RankedTensorType outputType = getOutput().getType(); + + if (inputType.getRank() != outputType.getRank()) { + return emitOpError("Input and output tensors must have the same rank."); + } + + if (inputType.getElementType() != outputType.getElementType()) { + return emitOpError("Input and output tensors must have the same element " + "type."); + } + + int32_t dim = getDim(); + int32_t origDim = dim; + if (dim < 0) { + dim += inputType.getRank(); + } + + if (dim < 0 || dim >= inputType.getRank()) { + return emitOpError() << "Invalid dimension " << origDim + << " for select op with input tensor rank " + << inputType.getRank(); + } + + int32_t dimSize = inputType.getDimSize(dim); + + int32_t stride = getStride(); + if (stride == 0) { + stride = dimSize; + } + + if (stride < 0) { + return emitOpError() << "Invalid stride " << stride << " for dimension " + << dim << ", stride must be non-negative"; + } + + if (stride > dimSize) { + return emitOpError() << "Invalid stride " << stride << " for dimension " + << dim << " with size " << dimSize + << ". stride must be less than or equal to the " + "dimension size"; + } + + int32_t begin = getBegin(); + int32_t length = getLength(); + if (begin < 0 || begin >= dimSize) { + return emitOpError() << "Invalid begin index " << begin << " for dimension " + << dim << " with size " << dimSize + << ". begin must be " + "in the range [0, dimSize)"; + } + + if (length < 1 || length > stride) { + return emitOpError() << "Invalid length " << length << " for begin index " + << begin << " and stride " << stride + << " for dimension " << dim << " with size " << dimSize + << ". stride must be greater than or equal to length"; + } + + if (begin + length > dimSize) { + return emitOpError() << "Invalid length " << length << " for begin index " + << begin << " and dimension " << dim << " with size " + << dimSize + << ". begin + length must be less than or " + "equal to the dimension size"; + } + + // Get the number of slices as the number of times the stride fits in the + // dimension size starting from the begin index. + int32_t numSlices = (dimSize - begin + stride - 1) / stride; + int32_t totalLength = 0; + for (int32_t i = 0; i < numSlices; i++) { + int32_t newBegin = begin + i * stride; + int32_t newEnd = std::min(newBegin + length, dimSize); + totalLength += newEnd - newBegin; + } + + if (totalLength != outputType.getDimSize(dim)) { + return emitOpError() << "Sum of all slices must be equal to the output " + "dimension size for the given dimension. 
Expected " + "output dimension size: " + << outputType.getDimSize(dim) << ", but got " + << totalLength; + } + + return success(); +} + //===----------------------------------------------------------------------===// // SqueezeOp //===----------------------------------------------------------------------===// diff --git a/test/ttmlir/Dialect/TTIR/decompositions/select_decomposition_tests.mlir b/test/ttmlir/Dialect/TTIR/decompositions/select_decomposition_tests.mlir new file mode 100644 index 0000000000..8365bbddd3 --- /dev/null +++ b/test/ttmlir/Dialect/TTIR/decompositions/select_decomposition_tests.mlir @@ -0,0 +1,26 @@ +// RUN: ttmlir-opt --ttir-to-ttir-decomposition %s | FileCheck %s + +#any_device_tile = #tt.operand_constraint +module attributes {} { + func.func @select_identity(%arg0: tensor<4x4xf32>) -> tensor<4x4xf32> { + %0 = tensor.empty() : tensor<4x4xf32> + // CHECK: %{{[0-9]+}} = "ttir.slice" + %1 = "ttir.select"(%arg0, %0) <{dim = 1: si32, begin = 0: si32, length = 4: si32, stride = 4: si32, operand_constraints = [#any_device_tile, #any_device_tile]}> : + (tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4xf32> + return %1 : tensor<4x4xf32> + } + + func.func @select_multi_slice(%arg0: tensor<4x2x64x128xf32>) -> tensor<4x2x64x32xf32> { + %0 = tensor.empty() : tensor<4x2x64x32xf32> + + // CHECK: %{{[0-9]+}} = "ttir.slice" + // CHECK: %{{[0-9]+}} = "ttir.slice" + // CHECK: %{{[0-9]+}} = "ttir.slice" + // CHECK: %{{[0-9]+}} = "ttir.slice" + // CHECK: %{{[0-9]+}} = "ttir.concat" + %1 = "ttir.select"(%arg0, %0) <{dim = -1: si32, begin = 0: si32, length = 4: si32, stride = 16: si32, operand_constraints = [#any_device_tile, #any_device_tile]}> : + (tensor<4x2x64x128xf32>, tensor<4x2x64x32xf32>) -> tensor<4x2x64x32xf32> + + return %1 : tensor<4x2x64x32xf32> + } +} diff --git a/test/ttmlir/Dialect/TTIR/select/select_tests_negative.mlir b/test/ttmlir/Dialect/TTIR/select/select_tests_negative.mlir new file mode 100644 index 0000000000..f505bfcb73 --- /dev/null +++ b/test/ttmlir/Dialect/TTIR/select/select_tests_negative.mlir @@ -0,0 +1,116 @@ +// RUN: not ttmlir-opt --split-input-file %s 2>&1 | FileCheck %s + +#any_device_tile = #tt.operand_constraint +module attributes {} { + func.func @select_negative_invalid_dim(%arg0: tensor<4x4xf32>) -> tensor<4x4xf32> { + %0 = tensor.empty() : tensor<4x4xf32> + // CHECK: {{.*error.*Invalid dimension}} + %1 = "ttir.select"(%arg0, %0) <{dim = -3: si32, begin = 0: si32, length = 4: si32, stride = 4: si32, operand_constraints = [#any_device_tile, #any_device_tile]}> : + (tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4xf32> + return %1 : tensor<4x4xf32> + } +} + +// ----- + +#any_device_tile = #tt.operand_constraint +module attributes {} { + func.func @select_negative_invalid_stride(%arg0: tensor<4x4xf32>) -> tensor<4x4xf32> { + %0 = tensor.empty() : tensor<4x4xf32> + // CHECK: {{.*error.*Invalid stride.*}} + %1 = "ttir.select"(%arg0, %0) <{dim = 1: si32, begin = 0: si32, length = 4: si32, stride = 7: si32, operand_constraints = [#any_device_tile, #any_device_tile]}> : + (tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4xf32> + return %1 : tensor<4x4xf32> + } +} + +// ----- + +#any_device_tile = #tt.operand_constraint +module attributes {} { + func.func @select_negative_invalid_stride_2(%arg0: tensor<4x4xf32>) -> tensor<4x4xf32> { + %0 = tensor.empty() : tensor<4x4xf32> + // CHECK: {{.*error.*Invalid stride.*}} + %1 = "ttir.select"(%arg0, %0) <{dim = 1: si32, begin = 0: si32, length = 4: si32, stride = -1: si32, operand_constraints = [#any_device_tile, 
#any_device_tile]}> : + (tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4xf32> + return %1 : tensor<4x4xf32> + } +} + +// ----- + +#any_device_tile = #tt.operand_constraint +module attributes {} { + func.func @select_negative_invalid_begin(%arg0: tensor<4x4xf32>) -> tensor<4x4xf32> { + %0 = tensor.empty() : tensor<4x4xf32> + // CHECK: {{.*error.*Invalid begin index.*}} + %1 = "ttir.select"(%arg0, %0) <{dim = 1: si32, begin = -3: si32, length = 4: si32, stride = 1: si32, operand_constraints = [#any_device_tile, #any_device_tile]}> : + (tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4xf32> + return %1 : tensor<4x4xf32> + } +} + +// ----- + +#any_device_tile = #tt.operand_constraint +module attributes {} { + func.func @select_negative_invalid_begin_2(%arg0: tensor<4x4xf32>) -> tensor<4x4xf32> { + %0 = tensor.empty() : tensor<4x4xf32> + // CHECK: {{.*error.*Invalid begin index.*}} + %1 = "ttir.select"(%arg0, %0) <{dim = 1: si32, begin = 4: si32, length = 4: si32, stride = 1: si32, operand_constraints = [#any_device_tile, #any_device_tile]}> : + (tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4xf32> + return %1 : tensor<4x4xf32> + } +} + +// ----- + +#any_device_tile = #tt.operand_constraint +module attributes {} { + func.func @select_negative_invalid_length(%arg0: tensor<4x4xf32>) -> tensor<4x4xf32> { + %0 = tensor.empty() : tensor<4x4xf32> + // CHECK: {{.*error.*Invalid length.*}} + %1 = "ttir.select"(%arg0, %0) <{dim = 1: si32, begin = 0: si32, length = 5: si32, stride = 1: si32, operand_constraints = [#any_device_tile, #any_device_tile]}> : + (tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4xf32> + return %1 : tensor<4x4xf32> + } +} + +// ----- + +#any_device_tile = #tt.operand_constraint +module attributes {} { + func.func @select_negative_invalid_length_2(%arg0: tensor<4x4xf32>) -> tensor<4x4xf32> { + %0 = tensor.empty() : tensor<4x4xf32> + // CHECK: {{.*error.*Invalid length.*}} + %1 = "ttir.select"(%arg0, %0) <{dim = 1: si32, begin = 0: si32, length = 0: si32, stride = 1: si32, operand_constraints = [#any_device_tile, #any_device_tile]}> : + (tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4xf32> + return %1 : tensor<4x4xf32> + } +} + +// ----- + +#any_device_tile = #tt.operand_constraint +module attributes {} { + func.func @select_negative_invalid_length_3(%arg0: tensor<4x4xf32>) -> tensor<4x4xf32> { + %0 = tensor.empty() : tensor<4x4xf32> + // CHECK: {{.*error.*Invalid length.*}} + %1 = "ttir.select"(%arg0, %0) <{dim = 1: si32, begin = 0: si32, length = 2: si32, stride = 1: si32, operand_constraints = [#any_device_tile, #any_device_tile]}> : + (tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4xf32> + return %1 : tensor<4x4xf32> + } +} + +// ----- + +#any_device_tile = #tt.operand_constraint +module attributes {} { + func.func @select_negative_invalid_total_size(%arg0: tensor<4x2x64x48xf32>) -> tensor<4x2x4x48xf32> { + %0 = tensor.empty() : tensor<4x2x4x48xf32> + // CHECK: {{.*error.*Sum of all slices.*}} + %1 = "ttir.select"( %arg0, %0) <{dim = 2: si32, begin = 0: si32, length = 4: si32, stride = 4: si32, operand_constraints = [#any_device_tile, #any_device_tile]}> : + (tensor<4x2x64x48xf32>, tensor<4x2x4x48xf32>) -> tensor<4x2x4x48xf32> + return %1 : tensor<4x2x4x48xf32> + } +} diff --git a/test/ttmlir/Dialect/TTIR/select/select_tests_positive.mlir b/test/ttmlir/Dialect/TTIR/select/select_tests_positive.mlir new file mode 100644 index 0000000000..b613c85bf8 --- /dev/null +++ b/test/ttmlir/Dialect/TTIR/select/select_tests_positive.mlir @@ -0,0 +1,44 @@ +// RUN: ttmlir-opt %s | FileCheck %s + 
+#any_device_tile = #tt.operand_constraint +module attributes {} { + func.func @select_identity(%arg0: tensor<4x4xf32>) -> tensor<4x4xf32> { + %0 = tensor.empty() : tensor<4x4xf32> + // CHECK: %{{[0-9]+}} = "ttir.select" + %1 = "ttir.select"(%arg0, %0) <{dim = 1: si32, begin = 0: si32, length = 4: si32, stride = 4: si32, operand_constraints = [#any_device_tile, #any_device_tile]}> : + (tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4xf32> + return %1 : tensor<4x4xf32> + } + + func.func @select_half(%arg0: tensor<4x4xf32>) -> tensor<4x2xf32> { + %0 = tensor.empty() : tensor<4x2xf32> + // CHECK: %{{[0-9]+}} = "ttir.select" + %1 = "ttir.select"(%arg0, %0) <{dim = 1: si32, begin = 0: si32, length = 2: si32, stride = 4: si32, operand_constraints = [#any_device_tile, #any_device_tile]}> : + (tensor<4x4xf32>, tensor<4x2xf32>) -> tensor<4x2xf32> + return %1 : tensor<4x2xf32> + } + + func.func @select_single(%arg0: tensor<4x4xf32>) -> tensor<4x1xf32> { + %0 = tensor.empty() : tensor<4x1xf32> + // CHECK: %{{[0-9]+}} = "ttir.select" + %1 = "ttir.select"(%arg0, %0) <{dim = 1: si32, begin = 3: si32, length = 1: si32, stride = 1: si32, operand_constraints = [#any_device_tile, #any_device_tile]}> : + (tensor<4x4xf32>, tensor<4x1xf32>) -> tensor<4x1xf32> + return %1 : tensor<4x1xf32> + } + + func.func @select_half_2_no_stride(%arg0: tensor<4x4xf32>) -> tensor<4x2xf32> { + %0 = tensor.empty() : tensor<4x2xf32> + // CHECK: %{{[0-9]+}} = "ttir.select" + %1 = "ttir.select"(%arg0, %0) <{dim = 1: si32, begin = 2: si32, length = 2: si32, operand_constraints = [#any_device_tile, #any_device_tile]}> : + (tensor<4x4xf32>, tensor<4x2xf32>) -> tensor<4x2xf32> + return %1 : tensor<4x2xf32> + } + + func.func @select_neg_dim(%arg0: tensor<10x3x128x64xf32>) -> tensor<10x3x8x64xf32> { + %0 = tensor.empty() : tensor<10x3x8x64xf32> + // CHECK: %{{[0-9]+}} = "ttir.select" + %1 = "ttir.select"(%arg0, %0) <{dim = -2: si32, begin = 0: si32, length = 2: si32, stride = 32: si32, operand_constraints = [#any_device_tile, #any_device_tile]}> : + (tensor<10x3x128x64xf32>, tensor<10x3x8x64xf32>) -> tensor<10x3x8x64xf32> + return %1 : tensor<10x3x8x64xf32> + } +} From c7557926ff2f19839a6fa884979a4ae9e32e710f Mon Sep 17 00:00:00 2001 From: Kristijan Mitrovic Date: Thu, 21 Nov 2024 15:27:18 +0100 Subject: [PATCH 07/84] Small improvements to gh nightly uplift yaml (#1324) * Small improvements to gh nightly uplift yaml * Code review changes --- .github/workflows/nightly-uplift.yml | 31 +++++++++++++++++----------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/.github/workflows/nightly-uplift.yml b/.github/workflows/nightly-uplift.yml index a0f6eb5345..bace8c9ff7 100644 --- a/.github/workflows/nightly-uplift.yml +++ b/.github/workflows/nightly-uplift.yml @@ -5,7 +5,7 @@ name: Nighty Uplift on: schedule: - - cron: '0 8 * * *' # Runs at 08:00 UTC every day + - cron: '0 6 * * *' # Runs at 06:00 UTC every day workflow_dispatch: # Manual trigger jobs: @@ -13,25 +13,32 @@ jobs: runs-on: ubuntu-latest env: - SUBMODULE_PATH: third_party/tt-metal - TT_METAL_VERSION: origin/main + TT_METAL_SUBMODULE_PATH: third_party/tt-metal steps: - uses: actions/checkout@v4 + with: + submodules: recursive + fetch-depth: 0 + ref: main - - name: Set env variable + - name: Set env variable for today's date run: | echo "TODAY=$(date +'%Y-%m-%d')" >> $GITHUB_ENV - - name: Update tt-metal reference + - name: Fetch latest SHA of tt-metal submodule + id: get_sha + run: | + LATEST_TT_METAL_VERSION=$(gh api repos/tenstorrent/tt-mlir/commits/main --jq '.sha') 
+ echo "LATEST_TT_METAL_VERSION=$LATEST_TT_METAL_VERSION" >> $GITHUB_ENV + + - name: Update tt-metal reference in third_party/CMakeLists.txt env: GH_TOKEN: ${{ github.token }} run: | - # Fetch the latest SHA using GitHub CLI - LATEST_SHA=$(gh api repos/tenstorrent/tt-metal/commits/main --jq '.sha') - # Update the third_party/CMakeLists.txt file with the new SHA - sed -i "s/set(TT_METAL_VERSION \".*\")/set(TT_METAL_VERSION \"${LATEST_SHA}\")/" third_party/CMakeLists.txt + echo "Updating tt-mlir to SHA: ${{ env.LATEST_TT_METAL_VERSION }}" + sed -i "s/set(TT_METAL_VERSION \".*\")/set(TT_METAL_VERSION \"${{ env.LATEST_TT_METAL_VERSION }}\")/" third_party/CMakeLists.txt - name: Create Pull Request uses: peter-evans/create-pull-request@v7 @@ -41,9 +48,9 @@ jobs: committer: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> author: ${{ github.actor }} <${{ github.actor_id }}+${{ github.actor }}@users.noreply.github.com> base: main - commit-message: "Uplift ${{ env.SUBMODULE_PATH }} to ${{ env.SUBMODULE_VERSION }} ${{ env.TODAY }}" - title: "Uplift ${{ env.SUBMODULE_PATH }} to ${{ env.SUBMODULE_VERSION }} ${{ env.TODAY }}" - body: "This PR uplifts the ${{ env.SUBMODULE_PATH }} to the ${{ env.SUBMODULE_VERSION }}" + commit-message: "Uplift ${{ env.TT_METAL_SUBMODULE_PATH }} to ${{ env.LATEST_TT_METAL_VERSION }} ${{ env.TODAY }}" + title: "Uplift ${{ env.TT_METAL_SUBMODULE_PATH }} to ${{ env.LATEST_TT_METAL_VERSION }} ${{ env.TODAY }}" + body: "This PR uplifts the ${{ env.TT_METAL_SUBMODULE_PATH }} to the ${{ env.LATEST_TT_METAL_VERSION }}" labels: uplift delete-branch: true token: ${{ secrets.GH_TOKEN }} From a47dcc7e76fbf0e4e76cb59f9303acfa51a544eb Mon Sep 17 00:00:00 2001 From: Sasa Vuckovic <134393361+svuckovicTT@users.noreply.github.com> Date: Thu, 21 Nov 2024 15:50:03 +0100 Subject: [PATCH 08/84] Fix unregistered dialects error message (ttmlir-translate) (#1341) --- docs/src/ttmlir-translate.md | 6 ++-- lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp | 33 +++++++++++++++++++ .../TTNNToEmitC/TTNNToEmitCPass.cpp | 22 ++++++++----- test/ttmlir/Silicon/TTNN/emitc/two_fns.mlir | 16 +++++++++ tools/ttnn-standalone/README.md | 2 +- tools/ttnn-standalone/ttnn-standalone.cpp | 6 ++-- 6 files changed, 69 insertions(+), 16 deletions(-) create mode 100644 test/ttmlir/Silicon/TTNN/emitc/two_fns.mlir diff --git a/docs/src/ttmlir-translate.md b/docs/src/ttmlir-translate.md index c82f7ee8f0..ba9c69b3c5 100644 --- a/docs/src/ttmlir-translate.md +++ b/docs/src/ttmlir-translate.md @@ -5,15 +5,15 @@ The `ttmlir-translate` translation utility. 
Unlike `ttmlir-opt` tool which is us ```bash # First, let's run `ttmlir-opt` to convert to proper dialect -./build/bin/ttmlir-opt --ttir-load-system-desc --ttir-layout --convert-ttir-to-ttnn --convert-ttnn-to-emitc test/ttmlir/Dialect/TTNN/simple_multiply.mlir -o c.mlir +./build/bin/ttmlir-opt --ttir-to-emitc-pipeline test/ttmlir/Dialect/TTNN/simple_multiply.mlir -o c.mlir # Now run `ttmlir-translate` to produce C++ code -./build/bin/ttmlir-translate -mlir-to-cpp c.mlir -allow-unregistered-dialect +./build/bin/ttmlir-translate --mlir-to-cpp c.mlir ``` Bonus: These two commands can be piped, to avoid writing a `mlir` file to disk, like so: ```bash -./build/bin/ttmlir-opt --ttir-load-system-desc --ttir-layout --convert-ttir-to-ttnn --convert-ttnn-to-emitc test/ttmlir/Dialect/TTNN/simple_multiply.mlir | ./build/bin/ttmlir-translate -mlir-to-cpp -allow-unregistered-dialect +./build/bin/ttmlir-opt --ttir-to-emitc-pipeline test/ttmlir/Dialect/TTNN/simple_multiply.mlir | ./build/bin/ttmlir-translate -mlir-to-cpp ``` ## Generate flatbuffer file from MLIR diff --git a/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp b/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp index 9b7cf7fe84..92862cd9da 100644 --- a/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp +++ b/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp @@ -618,6 +618,35 @@ class DeallocateOpConversionPattern } }; +// Module Op conversion pattern +// +// This conversion pattern removes attributes from the ModuleOp. Previously, +// ttmlir-translate would complain when translating to C++ if there were any +// attributes from "unregistered" dialects. +// +class ModuleOpConversionPattern + : public TTNNToEmitCBaseOpConversionPattern { + +public: + ModuleOpConversionPattern(const TypeConverter &typeConverter, + MLIRContext *context, PatternBenefit benefit = 1) + : TTNNToEmitCBaseOpConversionPattern(typeConverter, + context, benefit) {} + + LogicalResult + matchAndRewrite(mlir::ModuleOp srcOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + rewriter.modifyOpInPlace(srcOp, [&]() { + for (const NamedAttribute &attr : srcOp->getAttrs()) { + srcOp->removeAttr(attr.getName()); + } + }); + + return success(); + } +}; + } // namespace namespace mlir::tt { @@ -720,6 +749,10 @@ void populateTTNNToEmitCPatterns(mlir::MLIRContext *ctx, // patterns.add>(typeConverter, ctx); + + // Module op + // + patterns.add(typeConverter, ctx); } } // namespace mlir::tt diff --git a/lib/Conversion/TTNNToEmitC/TTNNToEmitCPass.cpp b/lib/Conversion/TTNNToEmitC/TTNNToEmitCPass.cpp index 71a7c52b60..bd0c9044fc 100644 --- a/lib/Conversion/TTNNToEmitC/TTNNToEmitCPass.cpp +++ b/lib/Conversion/TTNNToEmitC/TTNNToEmitCPass.cpp @@ -4,6 +4,11 @@ #include "ttmlir/Conversion/TTNNToEmitC/TTNNToEmitC.h" +#include "ttmlir/Dialect/TTNN/IR/TTNN.h" +#include "ttmlir/Dialect/TTNN/IR/TTNNOps.h" +#include "ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.h" +#include "ttmlir/Dialect/TTNN/IR/TTNNOpsTypes.h" + #include "mlir/Dialect/EmitC/IR/EmitC.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/Func/Transforms/FuncConversions.h" @@ -12,11 +17,6 @@ #include "mlir/Support/LogicalResult.h" #include "mlir/Transforms/DialectConversion.h" -#include "ttmlir/Dialect/TTNN/IR/TTNN.h" -#include "ttmlir/Dialect/TTNN/IR/TTNNOps.h" -#include "ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.h" -#include "ttmlir/Dialect/TTNN/IR/TTNNOpsTypes.h" - using namespace mlir; using namespace mlir::tt; @@ -48,14 +48,20 @@ struct ConvertTTNNToEmitCPass void runOnOperation() override { mlir::ConversionTarget 
target(getContext()); + // EmitC is legal, TTNN is illegal + // target.addLegalDialect(); target.addIllegalDialect(); - target.addLegalOp(); + + // mlir::ModuleOp is legal only if no attributes are present on it + // + target.addDynamicallyLegalOp( + [&](mlir::ModuleOp op) { return op->getAttrs().empty(); }); // Add header imports to front of module // { - auto module = getOperation(); + mlir::ModuleOp module = getOperation(); OpBuilder builder(module); if (module.getBodyRegion().empty()) { @@ -107,7 +113,7 @@ struct ConvertTTNNToEmitCPass return; } } - }; + } }; } // namespace diff --git a/test/ttmlir/Silicon/TTNN/emitc/two_fns.mlir b/test/ttmlir/Silicon/TTNN/emitc/two_fns.mlir new file mode 100644 index 0000000000..3f304969c8 --- /dev/null +++ b/test/ttmlir/Silicon/TTNN/emitc/two_fns.mlir @@ -0,0 +1,16 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir +// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn + +#any_device = #tt.operand_constraint + +func.func @add(%arg0: tensor<32x32xbf16>, %arg1: tensor<32x32xbf16>) -> tensor<32x32xbf16> { + %0 = tensor.empty() : tensor<32x32xbf16> + %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<32x32xbf16>, tensor<32x32xbf16>, tensor<32x32xbf16>) -> tensor<32x32xbf16> + return %1 : tensor<32x32xbf16> +} + +func.func @subtract(%arg0: tensor<32x32xbf16>, %arg1: tensor<32x32xbf16>) -> tensor<32x32xbf16> { + %0 = tensor.empty() : tensor<32x32xbf16> + %1 = "ttir.subtract"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<32x32xbf16>, tensor<32x32xbf16>, tensor<32x32xbf16>) -> tensor<32x32xbf16> + return %1 : tensor<32x32xbf16> +} diff --git a/tools/ttnn-standalone/README.md b/tools/ttnn-standalone/README.md index 816cfe1cf5..619e52d1c3 100644 --- a/tools/ttnn-standalone/README.md +++ b/tools/ttnn-standalone/README.md @@ -14,7 +14,7 @@ Third party ML models (PyTorch, Jax, ONNX, ...) can be compiled to a set of TTNN ```bash # Compile a model to C++ code -./build/bin/ttmlir-opt --ttir-load-system-desc --ttir-implicit-device --ttir-layout --convert-ttir-to-ttnn --ttnn-decompose-layouts --ttnn-deallocate --convert-ttnn-to-emitc test/ttmlir/Silicon/TTNN/emitc/simple_add.mlir | ./build/bin/ttmlir-translate --mlir-to-cpp -allow-unregistered-dialect +./build/bin/ttmlir-opt --ttir-to-emitc-pipeline test/ttmlir/Silicon/TTNN/emitc/simple_add.mlir | ./build/bin/ttmlir-translate --mlir-to-cpp # Copy paste the generated function into `ttnn-standalone.cpp`. 
diff --git a/tools/ttnn-standalone/ttnn-standalone.cpp b/tools/ttnn-standalone/ttnn-standalone.cpp index dff9afff43..0dee60f134 100644 --- a/tools/ttnn-standalone/ttnn-standalone.cpp +++ b/tools/ttnn-standalone/ttnn-standalone.cpp @@ -5,11 +5,9 @@ #include "ttnn-precompiled.hpp" // To generate forward function, run: -// ./build/bin/ttmlir-opt --ttir-load-system-desc --ttir-implicit-device -// --ttir-layout --convert-ttir-to-ttnn --ttnn-decompose-layouts -// --ttnn-deallocate --convert-ttnn-to-emitc +// ./build/bin/ttmlir-opt --ttir-to-emitc-pipeline // test/ttmlir/Silicon/TTNN/emitc/simple_add.mlir | ./build/bin/ttmlir-translate -// --mlir-to-cpp -allow-unregistered-dialect +// --mlir-to-cpp ttnn::Tensor forward(ttnn::Tensor v1, ttnn::Tensor v2) { ttnn::Device *v3 = ttnn::DeviceGetter::getInstance(); From 4ceefdc03420a1c841598a6a087d761791d0cc66 Mon Sep 17 00:00:00 2001 From: Meenakshi Ramanathan Date: Thu, 21 Nov 2024 21:53:42 +0530 Subject: [PATCH 09/84] Fix: input and output must have same shape for matching shapes (#1357) --- lib/Dialect/TTNN/IR/TTNNOps.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Dialect/TTNN/IR/TTNNOps.cpp b/lib/Dialect/TTNN/IR/TTNNOps.cpp index 8550b8796d..4abd74d62e 100644 --- a/lib/Dialect/TTNN/IR/TTNNOps.cpp +++ b/lib/Dialect/TTNN/IR/TTNNOps.cpp @@ -42,7 +42,7 @@ ::mlir::LogicalResult mlir::tt::ttnn::ClampOp::verify() { const RankedTensorType outputTensorType = mlir::cast(outputs.front().getType()); - if (inputTensorType != outputTensorType) { + if (inputTensorType.getShape() != outputTensorType.getShape()) { return emitOpError("input and output must have same shape."); } From 3ab87adc0c9e9c1eb6d6aa6eda8eb3bd78fbd458 Mon Sep 17 00:00:00 2001 From: Kristijan Mitrovic Date: Fri, 22 Nov 2024 14:17:08 +0100 Subject: [PATCH 10/84] Added gh token (#1381) --- .github/workflows/nightly-uplift.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/nightly-uplift.yml b/.github/workflows/nightly-uplift.yml index bace8c9ff7..332044a027 100644 --- a/.github/workflows/nightly-uplift.yml +++ b/.github/workflows/nightly-uplift.yml @@ -16,7 +16,6 @@ jobs: TT_METAL_SUBMODULE_PATH: third_party/tt-metal steps: - - uses: actions/checkout@v4 with: submodules: recursive @@ -28,14 +27,13 @@ jobs: echo "TODAY=$(date +'%Y-%m-%d')" >> $GITHUB_ENV - name: Fetch latest SHA of tt-metal submodule - id: get_sha + env: + GH_TOKEN: ${{ github.token }} run: | LATEST_TT_METAL_VERSION=$(gh api repos/tenstorrent/tt-mlir/commits/main --jq '.sha') echo "LATEST_TT_METAL_VERSION=$LATEST_TT_METAL_VERSION" >> $GITHUB_ENV - name: Update tt-metal reference in third_party/CMakeLists.txt - env: - GH_TOKEN: ${{ github.token }} run: | echo "Updating tt-mlir to SHA: ${{ env.LATEST_TT_METAL_VERSION }}" sed -i "s/set(TT_METAL_VERSION \".*\")/set(TT_METAL_VERSION \"${{ env.LATEST_TT_METAL_VERSION }}\")/" third_party/CMakeLists.txt From 164d183b32148a38ad217e082efb482a5a1f974b Mon Sep 17 00:00:00 2001 From: Vincent Wells Date: Fri, 22 Nov 2024 08:48:37 -0600 Subject: [PATCH 11/84] Remove some unneeded copies in ModuleTT (#1366) --- python/TTModule.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/python/TTModule.cpp b/python/TTModule.cpp index f631b01169..7417866a52 100644 --- a/python/TTModule.cpp +++ b/python/TTModule.cpp @@ -276,29 +276,29 @@ void populateTTModule(py::module &m) { }) .def_static( "get", - [](MlirContext ctx, std::vector cpuDescs, - std::vector chipDescs, - std::vector chipDescIndices, - 
std::vector chipCapabilities, - std::vector chipCoords, - std::vector chipChannels) { + [](MlirContext ctx, const std::vector &cpuDescs, + const std::vector &chipDescs, + const std::vector &chipDescIndices, + const std::vector &chipCapabilities, + const std::vector &chipCoords, + const std::vector &chipChannels) { std::vector chipDescsUnwrapped; - for (auto chipDesc : chipDescs) { + for (const auto &chipDesc : chipDescs) { chipDescsUnwrapped.push_back( mlir::cast(unwrap(chipDesc))); } std::vector chipCapabilitiesUnwrapped; - for (auto chipCapability : chipCapabilities) { + for (const auto &chipCapability : chipCapabilities) { chipCapabilitiesUnwrapped.push_back( mlir::cast(unwrap(chipCapability))); } std::vector chipCoordsUnwrapped; - for (auto chipCoord : chipCoords) { + for (const auto &chipCoord : chipCoords) { chipCoordsUnwrapped.push_back( mlir::cast(unwrap(chipCoord))); } std::vector chipChannelsUnwrapped; - for (auto chipChannel : chipChannels) { + for (const auto &chipChannel : chipChannels) { chipChannelsUnwrapped.push_back( mlir::cast(unwrap(chipChannel))); } From e2c3fe40e677ca43dc3c84d00080100c68898fda Mon Sep 17 00:00:00 2001 From: Sterling Taylor <166402033+staylorTT@users.noreply.github.com> Date: Fri, 22 Nov 2024 08:57:06 -0600 Subject: [PATCH 12/84] adding pagination to find item ids (#1238) * adding pagination to find item ids * removing trailing whitespace --- .github/workflows/issue-last-updated.yml | 99 +++++++++++++++++++++--- 1 file changed, 88 insertions(+), 11 deletions(-) diff --git a/.github/workflows/issue-last-updated.yml b/.github/workflows/issue-last-updated.yml index 61a235aff2..a2cf766dd5 100644 --- a/.github/workflows/issue-last-updated.yml +++ b/.github/workflows/issue-last-updated.yml @@ -21,6 +21,7 @@ jobs: echo "project_id=PVT_kwDOA9MHEM4AjeTl" >> $GITHUB_ENV echo "field_id=PVTF_lADOA9MHEM4AjeTlzgiiU18" >> $GITHUB_ENV + - name: Get Issue ID id: get_issue_id run: | @@ -31,18 +32,94 @@ jobs: - name: Get Item ID for Issue - id: get_item_by_issue_id + id: get_item_id_by_issue_id run: | - ITEM_ID=$(curl -X POST -H "Authorization: Bearer $GITHUB_TOKEN" \ - -H "Content-Type: application/json" \ - -d '{ - "query": "query($projectId: ID!) { node(id: $projectId) { ... on ProjectV2 { items(first: 100) { nodes { id content { ... on Issue { id } } } } } } }", - "variables": { - "projectId": "'"${{ env.project_id }}"'" - } - }' \ - https://api.github.com/graphql | jq -r '.data.node.items.nodes[] | select(.content.id=="'"${{ env.issue_id }}"'") | .id') - echo "ITEM_ID=$ITEM_ID" >> $GITHUB_ENV + # Initialize variables + CURSOR=null + ITEM_ID="" + + + # Define the GraphQL query as a string + QUERY='query($projectId: ID!, $cursor: String) { + node(id: $projectId) { + ... on ProjectV2 { + items(first: 100, after: $cursor) { + nodes { + id + content { + ... 
on Issue { + id + } + } + } + pageInfo { + hasNextPage + endCursor + } + } + } + } + }' + + + while : ; do + # Construct JSON payload using jq for proper formatting + JSON_PAYLOAD=$(jq -n \ + --arg query "$QUERY" \ + --arg projectId "$PROJECT_ID" \ + --arg cursor "$CURSOR" \ + '{ query: $query, variables: { projectId: $projectId, cursor: $cursor }}') + + + # Make the GraphQL request + RESPONSE=$(curl -s -X POST -H "Authorization: Bearer $GITHUB_TOKEN" \ + -H "Content-Type: application/json" \ + -d "$JSON_PAYLOAD" \ + https://api.github.com/graphql) + + + # Debug: print entire response + echo "RESPONSE: $RESPONSE" + + + # Check if the response contains `items` data + ITEMS_DATA=$(echo "$RESPONSE" | jq -r '.data.node.items.nodes' 2>/dev/null) + if [[ "$ITEMS_DATA" == "null" ]]; then + echo "Error: Items data not found. Please check your PROJECT_ID and GITHUB_TOKEN permissions." + break + fi + + + # Parse the item ID if it matches the ISSUE_NODE_ID + ITEM_ID=$(echo "$RESPONSE" | jq -r --arg ISSUE_NODE_ID "$ISSUE_NODE_ID" \ + '.data.node.items.nodes[] | select(.content.id==$ISSUE_NODE_ID) | .id') + + + # If ITEM_ID is found, output it and stop the loop + if [[ -n "$ITEM_ID" && "$ITEM_ID" != "null" ]]; then + echo "Found ITEM_ID: $ITEM_ID" + echo "ITEM_ID=$ITEM_ID" >> $GITHUB_ENV # Save ITEM_ID to environment for future steps + break + fi + + + # Extract pagination information + HAS_NEXT_PAGE=$(echo "$RESPONSE" | jq -r '.data.node.items.pageInfo.hasNextPage') + CURSOR=$(echo "$RESPONSE" | jq -r '.data.node.items.pageInfo.endCursor') + + + # If no more pages, exit loop + if [[ "$HAS_NEXT_PAGE" != "true" ]]; then + echo "Issue not found in project items." + break + fi + done + + + - name: Use Found ITEM_ID + if: env.ITEM_ID # Only runs if ITEM_ID was set + run: echo "The ITEM_ID is ${{ env.ITEM_ID }}" + - name: Update Project Field run: | From a7308cabd25a737ca238329699e453089cce2543 Mon Sep 17 00:00:00 2001 From: Aleksandar Zecevic Date: Fri, 22 Nov 2024 19:21:07 +0100 Subject: [PATCH 13/84] Added LinearOp support (#1233) --- include/ttmlir/Dialect/TTIR/IR/TTIROps.td | 28 +++ include/ttmlir/Dialect/TTNN/IR/TTNNOps.td | 28 +++ include/ttmlir/Target/TTNN/program.fbs | 8 + lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp | 17 +- lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp | 3 +- lib/Dialect/TTIR/IR/TTIROps.cpp | 152 ++++++++++++ lib/Dialect/TTNN/IR/TTNNOps.cpp | 152 ++++++++++++ lib/Target/TTNN/TTNNToFlatbuffer.cpp | 18 ++ runtime/lib/ttnn/operations/matmul/matmul.cpp | 38 ++- runtime/lib/ttnn/operations/matmul/matmul.h | 1 + runtime/lib/ttnn/program.cpp | 3 + .../TTIR/linear/linear_tests_negative.mlir | 194 ++++++++++++++++ .../TTNN/linear/linear_tests_positive.mlir | 216 ++++++++++++++++++ .../Dialect/TTNN/linear/simple_linear.mlir | 31 +++ .../TTNN/perf_unit/test_perf_linear.mlir | 20 ++ test/ttmlir/Silicon/TTNN/simple_linear.mlir | 33 +++ 16 files changed, 934 insertions(+), 8 deletions(-) create mode 100644 test/ttmlir/Dialect/TTIR/linear/linear_tests_negative.mlir create mode 100644 test/ttmlir/Dialect/TTNN/linear/linear_tests_positive.mlir create mode 100644 test/ttmlir/Dialect/TTNN/linear/simple_linear.mlir create mode 100644 test/ttmlir/Silicon/TTNN/perf_unit/test_perf_linear.mlir create mode 100644 test/ttmlir/Silicon/TTNN/simple_linear.mlir diff --git a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td index 8782f63ae1..5bfb77064f 100644 --- a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td +++ b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td @@ -1091,6 +1091,34 @@ def 
TTIR_FillOp : TTIR_DPSOp<"fill", [AllShapesMatch<["value", "result"]>]> { }]; } +def TTIR_LinearOp : TTIR_DPSOp<"linear"> { + let summary = "Linear transformation of inputs."; + let description = [{ + Produces the matmul of tensors `a` and `b` with optional addition with `bias`. + + Example: + %a = tensor.empty() : () -> tensor<10x64x32xbf16> + %b = tensor.empty() : () -> tensor<32x128xbf16> + %bias = tensor.empty() : () -> tensor<128xbf16> + %output = tensor.empty() : () -> tensor<10x64x128xbf16> + %0 = "ttir.linear"(%a, %b, %bias, %output) : (tensor<10x64x32xbf16>, tensor<32x128xbf16>, tensor<128xbf16>, tensor<10x64x128xbf16>) -> tensor<10x64x128xbf16> + }]; + + let arguments = (ins AnyRankedTensor:$a, + AnyRankedTensor:$b, + Optional:$bias, + AnyRankedTensor:$output, + TT_OperandConstraintArrayAttr:$operand_constraints); + + let results = (outs AnyRankedTensor:$result); + + let extraClassDeclaration = [{ + MutableOperandRange getDpsInitsMutable() { return getOutputMutable(); } + }]; + + let hasVerifier = 1; +} + // ANCHOR: adding_an_op_matmul_ttir def TTIR_MatmulOp : TTIR_DPSOp<"matmul"> { let summary = "Matrix multiply operation."; diff --git a/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td b/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td index 910ed7dfd9..4147cc6d08 100644 --- a/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td +++ b/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td @@ -636,6 +636,34 @@ def TTNN_SliceOp: TTNN_NamedDPSOp<"slice"> { let hasVerifier = 1; } +def TTNN_LinearOp : TTNN_NamedDPSOp<"linear"> { + let summary = "Linear transformation of inputs."; + + let description = [{ + Produces the matmul of tensors `a` and `b` with optional addition with `bias`. + + Example: + // %a = [[1., 2.]], [2., 1.]] + // %b = [[0., 1.], [1., 0.]] + // %bias = [[1.]] + "ttnn.linear"(%a, %b, %bias, %result) : (tensor<2x2xf16>, tensor<2x2xf16>, tensor<1xf16>, tensor<2x2xf16>) -> tensor<2x2xf16> + // %result = [[3., 2.], [2., 3.]] + }]; + + let arguments = (ins AnyRankedTensor:$a, + AnyRankedTensor:$b, + Optional:$bias, + AnyRankedTensor:$output); + let results = (outs AnyRankedTensor:$result); + + let extraClassDeclaration = [{ + MutableOperandRange getDpsInitsMutable() { return getOutputMutable(); } + }]; + + let hasVerifier = 1; +} + + // ANCHOR: adding_an_op_matmul_ttnn def TTNN_MatmulOp : TTNN_NamedDPSOp<"matmul"> { let arguments = (ins AnyRankedTensor:$a, diff --git a/include/ttmlir/Target/TTNN/program.fbs b/include/ttmlir/Target/TTNN/program.fbs index ec493e6496..0be274b4b6 100644 --- a/include/ttmlir/Target/TTNN/program.fbs +++ b/include/ttmlir/Target/TTNN/program.fbs @@ -178,6 +178,13 @@ table SliceOp { step: [int64]; } +table LinearOp { + in0: tt.target.TensorRef; + in1: tt.target.TensorRef; + bias: tt.target.TensorRef; + out: tt.target.TensorRef; +} + // ANCHOR: adding_an_op_matmul_fbs table MatmulOp { in0: tt.target.TensorRef; @@ -249,6 +256,7 @@ union OpType { EmptyOp, FullOp, EltwiseOp, + LinearOp, MatmulOp, ReductionOp, EmbeddingOp, diff --git a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp index 12e29a9609..52995b64c6 100644 --- a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp +++ b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp @@ -579,7 +579,19 @@ class ConstantOpConversionPattern } }; -} // namespace +class LinearOpConversionPattern : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(ttir::LinearOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + 
rewriter.replaceOpWithNewOp( + op, this->getTypeConverter()->convertType(op.getType()), adaptor.getA(), + adaptor.getB(), adaptor.getBias(), adaptor.getOutput()); + return success(); + } +}; // ANCHOR: adding_an_op_matmul_op_rewriter class MatmulOpConversionPattern : public OpConversionPattern { @@ -908,6 +920,8 @@ class AllGatherOpConversionPattern } }; +} // namespace + namespace mlir::tt { void populateTTIRToTTNNPatterns(MLIRContext *ctx, RewritePatternSet &patterns, @@ -969,6 +983,7 @@ void populateTTIRToTTNNPatterns(MLIRContext *ctx, RewritePatternSet &patterns, SqueezeOpConversionPattern, UnsqueezeOpConversionPattern, ConstantOpConversionPattern, + LinearOpConversionPattern, MatmulOpConversionPattern, Conv2dOpConversionPattern, MaxPool2dOpConversionPattern, diff --git a/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp b/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp index 92862cd9da..6c83200f39 100644 --- a/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp +++ b/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp @@ -725,7 +725,8 @@ void populateTTNNToEmitCPatterns(mlir::MLIRContext *ctx, // Matmul ops // - patterns.add>(typeConverter, ctx); + patterns.add, + DefaultOpConversionPattern>(typeConverter, ctx); // Reduction ops // diff --git a/lib/Dialect/TTIR/IR/TTIROps.cpp b/lib/Dialect/TTIR/IR/TTIROps.cpp index 5946cb2fe3..bf734df953 100644 --- a/lib/Dialect/TTIR/IR/TTIROps.cpp +++ b/lib/Dialect/TTIR/IR/TTIROps.cpp @@ -895,6 +895,158 @@ mlir::tt::ttir::ToLayoutOp::compoundComponents() { isMemoryLayoutChange}; } +//===----------------------------------------------------------------------===// +// LinearOp +//===----------------------------------------------------------------------===// + +// LinearOp verification +::mlir::LogicalResult mlir::tt::ttir::LinearOp::verify() { + ::mlir::RankedTensorType inputAType = getA().getType(); + ::mlir::RankedTensorType inputBType = getB().getType(); + std::optional<::mlir::RankedTensorType> biasType = + getBias() ? std::make_optional(getBias().getType()) : std::nullopt; + ::mlir::RankedTensorType outputType = getOutput().getType(); + + llvm::ArrayRef outputShape = outputType.getShape(); + llvm::SmallVector inputAShape(inputAType.getShape()); + llvm::SmallVector inputBShape(inputBType.getShape()); + + // Verify that the input A is at least 1D tensor. + if (inputAType.getRank() < 1) { + return emitOpError("Input A must be at least a 1D tensor"); + } + + // Verify that the input B is at least 1D tensor. + if (inputBType.getRank() < 1) { + return emitOpError("Input B must be at least a 1D tensor"); + } + + // If input A is a vector (1D tensor), 1 is prepended to its dimension for the + // purpose of the matrix multiplication. After the matrix multiplication, the + // prepended dimension is removed. + if (inputAType.getRank() == 1) { + inputAShape.insert(inputAShape.begin(), 1); + } + + // If input B is a vector (1D tensor), a 1 is appended to its dimension for + // the purpose of the matrix-vector product and removed afterwards. + if (inputBType.getRank() == 1) { + inputBShape.push_back(1); + } + + // Verify that the input A and input B has matching inner dimensions. 
+ if (inputAShape[inputAShape.size() - 1] != + inputBShape[inputBShape.size() - 2]) { + return emitOpError( + "Input A[-1](" + std::to_string(inputAShape[inputAShape.size() - 1]) + + ") and B[-2](" + std::to_string(inputBShape[inputBShape.size() - 2]) + + ") must have matching inner dimensions"); + } + + llvm::SmallVector expectedOutputShape; + // Verify that the batch dimensions are broadcast compatible and construct the + // expected output shape. + if (inputAShape.size() > 2 || inputBShape.size() > 2) { + llvm::SmallVector inputABatchDims, inputBBatchDims; + + if (inputAShape.size() > 2) { + inputABatchDims.insert(inputABatchDims.begin(), inputAShape.begin(), + inputAShape.end() - 2); + } + + if (inputBShape.size() > 2) { + inputBBatchDims.insert(inputBBatchDims.begin(), inputBShape.begin(), + inputBShape.end() - 2); + } + + // Verify that the batch dimensions of input A and B are broadcast + // compatible. + llvm::SmallVector broadcastedShape; + if (!OpTrait::util::getBroadcastedShape(inputABatchDims, inputBBatchDims, + broadcastedShape)) { + + return emitOpError("Batch dimensions of input A(" + + ttmlir::utils::join(inputABatchDims, ",") + + ") and B(" + + ttmlir::utils::join(inputBBatchDims, ",") + + ") are not broadcast compatible"); + } + + // Insert the broadcasted batch dimensions in the expected output shape. + expectedOutputShape.insert(expectedOutputShape.begin(), + broadcastedShape.begin(), + broadcastedShape.end()); + } + + // Insert the input A and B inner dimensions in expected output shape. + // Consider the case where input A and B are vectors. In that case, + // the dimension 1 is ommited from the output shape. + if (inputAType.getRank() > 1) { + expectedOutputShape.push_back(inputAShape[inputAShape.size() - 2]); + } + + if (inputBType.getRank() > 1) { + expectedOutputShape.push_back(inputBShape[inputBShape.size() - 1]); + } + + if (biasType) { + // Verify that the input bias is at least 1D tensor. + if (biasType.value().getRank() < 1) { + return emitOpError("Bias must be at least a 1D tensor"); + } + + llvm::SmallVector biasShape(biasType.value().getShape()); + + // Verify that the dimensions of the matmul of A and B are broadcast + // compatible with input bias. + llvm::SmallVector matmulShape = expectedOutputShape; + if (!OpTrait::util::getBroadcastedShape(matmulShape, biasShape, + expectedOutputShape)) { + return emitOpError("Bias shape(" + ttmlir::utils::join(biasShape, ",") + + ") is not broadcast compatible with the matmul output " + "shape(" + + ttmlir::utils::join(matmulShape, ",") + ")"); + } + } + + // Check the case of a vector-vector product. At this moment we don't support + // scalars in IR, hence check that the output is at least 1D tensor of size 1. + if (expectedOutputShape.size() == 0) { + if (outputType.getRank() < 1) { + return emitOpError("Scalar output is not supported, output must be at " + "least a 1D tensor"); + } + + if (outputType.getRank() > 1 || outputType.getShape()[0] != 1) { + return emitOpError("Scalar output must be a 1D tensor of size 1"); + } + + return success(); + } + + // Verify that the output shape dimension count is correct. + if (outputShape.size() != expectedOutputShape.size()) { + return emitOpError("Output shape rank(" + + std::to_string(outputShape.size()) + + ") must match the expected output shape rank(" + + std::to_string(expectedOutputShape.size()) + ")"); + } + + // Verify each dim of the output shape. 
+ for (size_t i = 0; i < outputShape.size(); i++) { + if (outputShape[i] != expectedOutputShape[i]) { + return emitOpError( + "Output shape dimension[" + std::to_string(i) + "](" + + std::to_string(outputShape[i]) + + ") doesn't match the expected output shape dimension[" + + std::to_string(i) + "](" + std::to_string(expectedOutputShape[i]) + + ")"); + } + } + + return success(); +} + //===----------------------------------------------------------------------===// // MatmulOp //===----------------------------------------------------------------------===// diff --git a/lib/Dialect/TTNN/IR/TTNNOps.cpp b/lib/Dialect/TTNN/IR/TTNNOps.cpp index 4abd74d62e..c4f0d73941 100644 --- a/lib/Dialect/TTNN/IR/TTNNOps.cpp +++ b/lib/Dialect/TTNN/IR/TTNNOps.cpp @@ -592,6 +592,158 @@ ::mlir::LogicalResult mlir::tt::ttnn::ToMemoryConfigOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// LinearOp +//===----------------------------------------------------------------------===// + +// LinearOp verification +::mlir::LogicalResult mlir::tt::ttnn::LinearOp::verify() { + ::mlir::RankedTensorType inputAType = getA().getType(); + ::mlir::RankedTensorType inputBType = getB().getType(); + std::optional<::mlir::RankedTensorType> biasType = + getBias() ? std::make_optional(getBias().getType()) : std::nullopt; + ::mlir::RankedTensorType outputType = getOutput().getType(); + + llvm::ArrayRef outputShape = outputType.getShape(); + llvm::SmallVector inputAShape(inputAType.getShape()); + llvm::SmallVector inputBShape(inputBType.getShape()); + + // Verify that the input A is at least 1D tensor. + if (inputAType.getRank() < 1) { + return emitOpError("Input A must be at least a 1D tensor"); + } + + // Verify that the input B is at least 1D tensor. + if (inputBType.getRank() < 1) { + return emitOpError("Input B must be at least a 1D tensor"); + } + + // If input A is a vector (1D tensor), 1 is prepended to its dimension for the + // purpose of the matrix multiplication. After the matrix multiplication, the + // prepended dimension is removed. + if (inputAType.getRank() == 1) { + inputAShape.insert(inputAShape.begin(), 1); + } + + // If input B is a vector (1D tensor), a 1 is appended to its dimension for + // the purpose of the matrix-vector product and removed afterwards. + if (inputBType.getRank() == 1) { + inputBShape.push_back(1); + } + + // Verify that the input A and input B has matching inner dimensions. + if (inputAShape[inputAShape.size() - 1] != + inputBShape[inputBShape.size() - 2]) { + return emitOpError( + "Input A[-1](" + std::to_string(inputAShape[inputAShape.size() - 1]) + + ") and B[-2](" + std::to_string(inputBShape[inputBShape.size() - 2]) + + ") must have matching inner dimensions"); + } + + llvm::SmallVector expectedOutputShape; + // Verify that the batch dimensions are broadcast compatible and construct the + // expected output shape. + if (inputAShape.size() > 2 || inputBShape.size() > 2) { + llvm::SmallVector inputABatchDims, inputBBatchDims; + + if (inputAShape.size() > 2) { + inputABatchDims.insert(inputABatchDims.begin(), inputAShape.begin(), + inputAShape.end() - 2); + } + + if (inputBShape.size() > 2) { + inputBBatchDims.insert(inputBBatchDims.begin(), inputBShape.begin(), + inputBShape.end() - 2); + } + + // Verify that the batch dimensions of input A and B are broadcast + // compatible. 
+ llvm::SmallVector broadcastedShape; + if (!OpTrait::util::getBroadcastedShape(inputABatchDims, inputBBatchDims, + broadcastedShape)) { + + return emitOpError("Batch dimensions of input A(" + + ttmlir::utils::join(inputABatchDims, ",") + + ") and B(" + + ttmlir::utils::join(inputBBatchDims, ",") + + ") are not broadcast compatible"); + } + + // Insert the broadcasted batch dimensions in the expected output shape. + expectedOutputShape.insert(expectedOutputShape.begin(), + broadcastedShape.begin(), + broadcastedShape.end()); + } + + // Insert the input A and B inner dimensions in expected output shape. + // Consider the case where input A and B are vectors. In that case, + // the dimension 1 is ommited from the output shape. + if (inputAType.getRank() > 1) { + expectedOutputShape.push_back(inputAShape[inputAShape.size() - 2]); + } + + if (inputBType.getRank() > 1) { + expectedOutputShape.push_back(inputBShape[inputBShape.size() - 1]); + } + + if (biasType) { + // Verify that the input bias is at least 1D tensor. + if (biasType.value().getRank() < 1) { + return emitOpError("Bias must be at least a 1D tensor"); + } + + llvm::SmallVector biasShape(biasType.value().getShape()); + + // Verify that the dimensions of the matmul of A and B are broadcast + // compatible with input bias. + llvm::SmallVector matmulShape = expectedOutputShape; + if (!OpTrait::util::getBroadcastedShape(matmulShape, biasShape, + expectedOutputShape)) { + return emitOpError("Bias shape(" + ttmlir::utils::join(biasShape, ",") + + ") is not broadcast compatible with the matmul output " + "shape(" + + ttmlir::utils::join(matmulShape, ",") + ")"); + } + } + + // Check the case of a vector-vector product. At this moment we don't support + // scalars in IR, hence check that the output is at least 1D tensor of size 1. + if (expectedOutputShape.size() == 0) { + if (outputType.getRank() < 1) { + return emitOpError("Scalar output is not supported, output must be at " + "least a 1D tensor"); + } + + if (outputType.getRank() > 1 || outputType.getShape()[0] != 1) { + return emitOpError("Scalar output must be a 1D tensor of size 1"); + } + + return success(); + } + + // Verify that the output shape dimension count is correct. + if (outputShape.size() != expectedOutputShape.size()) { + return emitOpError("Output shape rank(" + + std::to_string(outputShape.size()) + + ") must match the expected output shape rank(" + + std::to_string(expectedOutputShape.size()) + ")"); + } + + // Verify each dim of the output shape. 
+ for (size_t i = 0; i < outputShape.size(); i++) { + if (outputShape[i] != expectedOutputShape[i]) { + return emitOpError( + "Output shape dimension[" + std::to_string(i) + "](" + + std::to_string(outputShape[i]) + + ") doesn't match the expected output shape dimension[" + + std::to_string(i) + "](" + std::to_string(expectedOutputShape[i]) + + ")"); + } + } + + return success(); +} + //===----------------------------------------------------------------------===// // MatmulOp //===----------------------------------------------------------------------===// diff --git a/lib/Target/TTNN/TTNNToFlatbuffer.cpp b/lib/Target/TTNN/TTNNToFlatbuffer.cpp index 30b83014d4..8971963f2a 100644 --- a/lib/Target/TTNN/TTNNToFlatbuffer.cpp +++ b/lib/Target/TTNN/TTNNToFlatbuffer.cpp @@ -333,6 +333,21 @@ createOp(FlatbufferObjectCache &cache, FullOp op) { kHostAllocatedSize)); } +::flatbuffers::Offset<::tt::target::ttnn::LinearOp> +createOp(FlatbufferObjectCache &cache, LinearOp op) { + auto in0 = + cache.at<::tt::target::TensorRef>(getOperandThroughDPSOps(op.getA())); + auto in1 = + cache.at<::tt::target::TensorRef>(getOperandThroughDPSOps(op.getB())); + auto bias = op.getODSOperands(2).empty() + ? flatbuffers::Offset<::tt::target::TensorRef>() + : cache.at<::tt::target::TensorRef>( + getOperandThroughDPSOps(op.getBias())); + auto output = cache.at<::tt::target::TensorRef>( + getOperandThroughDPSOps(op.getResult())); + return ::tt::target::ttnn::CreateLinearOp(*cache.fbb, in0, in1, bias, output); +} + // ANCHOR: adding_an_op_matmul_serialize_to_binary ::flatbuffers::Offset<::tt::target::ttnn::MatmulOp> createOp(FlatbufferObjectCache &cache, MatmulOp op) { @@ -801,6 +816,9 @@ emitTTNNOperation(FlatbufferObjectCache &cache, Operation *op, return createOperation(cache, createEltwiseOp(cache, leakyReluOp), debugString); } + if (auto linearOp = dyn_cast(op); linearOp) { + return createOperation(cache, createOp(cache, linearOp), debugString); + } if (auto matmulOp = dyn_cast(op); matmulOp) { return createOperation(cache, createOp(cache, matmulOp), debugString); } diff --git a/runtime/lib/ttnn/operations/matmul/matmul.cpp b/runtime/lib/ttnn/operations/matmul/matmul.cpp index abe71f9707..a25102d9af 100644 --- a/runtime/lib/ttnn/operations/matmul/matmul.cpp +++ b/runtime/lib/ttnn/operations/matmul/matmul.cpp @@ -8,8 +8,8 @@ #include "tt/runtime/ttnn/operations/utils.h" #include -// ANCHOR: adding_an_op_matmul_runtime_operations namespace tt::runtime::ttnn::operations::matmul { +// ANCHOR: adding_an_op_matmul_runtime_operations void run(const ::tt::target::ttnn::MatmulOp *op, ProgramContext &context) { ProgramTensorPool &tensorPool = context.getTensorPool(); const ::ttnn::Tensor &lhs = tensorPool.at(op->in0()->global_id()); @@ -20,10 +20,6 @@ void run(const ::tt::target::ttnn::MatmulOp *op, ProgramContext &context) { ::tt::tt_metal::MemoryConfig outputMemoryConfig = utils::createMemoryConfig(op->out()); - std::optional< - ::ttnn::operations::matmul::MatmulMultiCoreReuseMultiCast1DProgramConfig> - programConfig = std::nullopt; - const std::optional memoryConfig = std::make_optional(outputMemoryConfig); @@ -37,5 +33,35 @@ void run(const ::tt::target::ttnn::MatmulOp *op, ProgramContext &context) { tensorPool.insert_or_assign(op->out()->global_id(), out); } -} // namespace tt::runtime::ttnn::operations::matmul // ANCHOR_END: adding_an_op_matmul_runtime_operations + +void run(const ::tt::target::ttnn::LinearOp *op, ProgramContext &context) { + ProgramTensorPool &tensorPool = context.getTensorPool(); + const ::ttnn::Tensor 
&lhs = tensorPool.at(op->in0()->global_id()); + const ::ttnn::Tensor &rhs = tensorPool.at(op->in1()->global_id()); + std::optional<::ttnn::Tensor> bias = + op->bias() ? std::make_optional(tensorPool.at(op->bias()->global_id())) + : std::nullopt; + + DEBUG_ASSERT(lhs.is_allocated()); + DEBUG_ASSERT(rhs.is_allocated()); + DEBUG_ASSERT(!bias || bias->is_allocated()); + + ::ttnn::DataType outputDataType = utils::getDataType(op->out()); + ::tt::tt_metal::MemoryConfig outputMemoryConfig = + utils::createMemoryConfig(op->out()); + + const std::optional memoryConfig = + std::make_optional(outputMemoryConfig); + + const std::optional dtype = + std::make_optional(outputDataType); + + ::ttnn::Tensor out = ::ttnn::linear( + lhs, rhs, bias, /*transposeA*/ false, /*transposeB*/ false, memoryConfig, + dtype, /*programConfig*/ std::nullopt, /*activation*/ std::nullopt, + /*computeKernelConfig*/ std::nullopt, /*coreGrid*/ std::nullopt); + + tensorPool.insert_or_assign(op->out()->global_id(), out); +} +} // namespace tt::runtime::ttnn::operations::matmul diff --git a/runtime/lib/ttnn/operations/matmul/matmul.h b/runtime/lib/ttnn/operations/matmul/matmul.h index 5957a54a3c..7b0583786b 100644 --- a/runtime/lib/ttnn/operations/matmul/matmul.h +++ b/runtime/lib/ttnn/operations/matmul/matmul.h @@ -10,6 +10,7 @@ namespace tt::runtime::ttnn::operations::matmul { void run(const ::tt::target::ttnn::MatmulOp *op, ProgramContext &context); +void run(const ::tt::target::ttnn::LinearOp *op, ProgramContext &context); } // namespace tt::runtime::ttnn::operations::matmul #endif diff --git a/runtime/lib/ttnn/program.cpp b/runtime/lib/ttnn/program.cpp index 8cfa013891..fbd58c5939 100644 --- a/runtime/lib/ttnn/program.cpp +++ b/runtime/lib/ttnn/program.cpp @@ -148,6 +148,9 @@ void ProgramExecutor::runOperation(const ::tt::target::ttnn::Operation *op) { case ::tt::target::ttnn::OpType::EltwiseOp: { return runEltwiseOperation(op->type_as_EltwiseOp()); } + case ::tt::target::ttnn::OpType::LinearOp: { + return operations::matmul::run(op->type_as_LinearOp(), context); + } // ANCHOR: adding_an_op_matmul_runtime_program case ::tt::target::ttnn::OpType::MatmulOp: { return operations::matmul::run(op->type_as_MatmulOp(), context); diff --git a/test/ttmlir/Dialect/TTIR/linear/linear_tests_negative.mlir b/test/ttmlir/Dialect/TTIR/linear/linear_tests_negative.mlir new file mode 100644 index 0000000000..522628160c --- /dev/null +++ b/test/ttmlir/Dialect/TTIR/linear/linear_tests_negative.mlir @@ -0,0 +1,194 @@ +// RUN: not ttmlir-opt --split-input-file %s 2>&1 | FileCheck %s +// Negative tests for linear operation + +// Verify that the parsing fails if either of operands is a scalar +#any_device_tile = #tt.operand_constraint +module { + func.func @linear_negative_1d_1d_scalar_a(%arg0: tensor, %arg1: tensor<64xbf16>) -> tensor<1xbf16> { + // CHECK: error: 'ttir.linear' op Input A must be at least a 1D tensor + %0 = tensor.empty() : tensor<1xbf16> + %1 = "ttir.linear"(%arg0, %arg1, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor, tensor<64xbf16>, tensor<1xbf16>) -> tensor<1xbf16> + return %1 : tensor<1xbf16> + } +} + +// ----- +#any_device_tile = #tt.operand_constraint +module { + func.func @linear_negative_1d_1d_scalar_b(%arg0: tensor<128xbf16>, %arg1: tensor) -> tensor<1xbf16> { + // CHECK: error: 'ttir.linear' op Input B must be at least a 1D tensor + %0 = tensor.empty() : tensor<1xbf16> + %1 = "ttir.linear"(%arg0, %arg1, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, 
#any_device_tile]}> : (tensor<128xbf16>, tensor, tensor<1xbf16>) -> tensor<1xbf16> + return %1 : tensor<1xbf16> + } +} + +// ----- +#any_device_tile = #tt.operand_constraint +module { + func.func @linear_negative_1d_1d_scalar_bias(%arg0: tensor<128xbf16>, %arg1: tensor<128xbf16>, %bias: tensor) -> tensor<1xbf16> { + // CHECK: error: 'ttir.linear' op Bias must be at least a 1D tensor + %0 = tensor.empty() : tensor<1xbf16> + %1 = "ttir.linear"(%arg0, %arg1, %bias, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<128xbf16>, tensor<128xbf16>, tensor, tensor<1xbf16>) -> tensor<1xbf16> + return %1 : tensor<1xbf16> + } +} + +// Verifty that the parsing fails if the output is a scalar +// ----- +#any_device_tile = #tt.operand_constraint +module { + func.func @linear_negative_1d_1d_scalar_output(%arg0: tensor<128xbf16>, %arg1: tensor<128xbf16>) -> tensor { + // CHECK: error: 'ttir.linear' op Scalar output is not supported, output must be at least a 1D tensor + %0 = tensor.empty() : tensor + %1 = "ttir.linear"(%arg0, %arg1, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<128xbf16>, tensor<128xbf16>, tensor) -> tensor + return %1 : tensor + } +} + +// ----- +#any_device_tile = #tt.operand_constraint +module { + func.func @linear_negative_1d_1d_output_dimension_mismatch(%arg0: tensor<128xbf16>, %arg1: tensor<128xbf16>) -> tensor<2xbf16> { + // CHECK: error: 'ttir.linear' op Scalar output must be a 1D tensor of size 1 + %0 = tensor.empty() : tensor<2xbf16> + %1 = "ttir.linear"(%arg0, %arg1, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<128xbf16>, tensor<128xbf16>, tensor<2xbf16>) -> tensor<2xbf16> + return %1 : tensor<2xbf16> + } +} + +// Inner dimension mismatch tests +// ----- +#any_device_tile = #tt.operand_constraint +module { + func.func @linear_negative_1d_1d_inner_dimension_mismatch(%arg0: tensor<128xbf16>, %arg1: tensor<64xbf16>) -> tensor<1xbf16> { + // CHECK: error: 'ttir.linear' op Input A[-1](128) and B[-2](64) must have matching inner dimensions + %0 = tensor.empty() : tensor<1xbf16> + %1 = "ttir.linear"(%arg0, %arg1, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<128xbf16>, tensor<64xbf16>, tensor<1xbf16>) -> tensor<1xbf16> + return %1 : tensor<1xbf16> + } +} + +// ----- +#any_device_tile = #tt.operand_constraint +module { +func.func @linear_negative_1d_2d_inner_dimension_mismatch(%arg0: tensor<64xbf16>, %arg1: tensor<128x64xbf16>) -> tensor<64xbf16> { + // CHECK: error: 'ttir.linear' op Input A[-1](64) and B[-2](128) must have matching inner dimensions + %0 = tensor.empty() : tensor<64xbf16> + %1 = "ttir.linear"(%arg0, %arg1, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<64xbf16>, tensor<128x64xbf16>, tensor<64xbf16>) -> tensor<64xbf16> + return %1 : tensor<64xbf16> + } +} + +// ----- +#any_device_tile = #tt.operand_constraint +module { + func.func @linear_negative_2d_1d_inner_dimension_mismatch(%arg0: tensor<64x128xbf16>, %arg1: tensor<64xbf16>) -> tensor<64xbf16> { + // CHECK: error: 'ttir.linear' op Input A[-1](128) and B[-2](64) must have matching inner dimensions + %0 = tensor.empty() : tensor<64xbf16> + %1 = "ttir.linear"(%arg0, %arg1, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<64x128xbf16>, tensor<64xbf16>, tensor<64xbf16>) -> tensor<64xbf16> + return %1 : tensor<64xbf16> + } 
+} + +// ----- +#any_device_tile = #tt.operand_constraint +module { + func.func @linear_negative_2d_2d_inner_dimension_mismatch(%arg0: tensor<64x128xbf16>, %arg1: tensor<64x128xbf16>) -> tensor<64x64xbf16> { + // CHECK: error: 'ttir.linear' op Input A[-1](128) and B[-2](64) must have matching inner dimensions + %0 = tensor.empty() : tensor<64x64xbf16> + %1 = "ttir.linear"(%arg0, %arg1, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<64x128xbf16>, tensor<64x128xbf16>, tensor<64x64xbf16>) -> tensor<64x64xbf16> + return %1 : tensor<64x64xbf16> + } +} + +// ----- +#any_device_tile = #tt.operand_constraint +module { + func.func @linear_negative_nd_nd_inner_dimension_mismatch(%arg0: tensor<7x64x128xbf16>, %arg1: tensor<1x64x128xbf16>) -> tensor<7x64x64xbf16> { + // CHECK: error: 'ttir.linear' op Input A[-1](128) and B[-2](64) must have matching inner dimensions + %0 = tensor.empty() : tensor<7x64x64xbf16> + %1 = "ttir.linear"(%arg0, %arg1, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<7x64x128xbf16>, tensor<1x64x128xbf16>, tensor<7x64x64xbf16>) -> tensor<7x64x64xbf16> + return %1 : tensor<7x64x64xbf16> + } +} + +// Batch dimension mismatch tests +// ----- +#any_device_tile = #tt.operand_constraint +module { + func.func @linear_negative_nd_nd_same_rank_batch_broadcast_incompatible_1(%arg0: tensor<7x64x128xbf16>, %arg1: tensor<2x128x64xbf16>) -> tensor<7x64x64xbf16> { + // CHECK: error: 'ttir.linear' op Batch dimensions of input A(7) and B(2) are not broadcast compatible + %0 = tensor.empty() : tensor<7x64x64xbf16> + %1 = "ttir.linear"(%arg0, %arg1, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<7x64x128xbf16>, tensor<2x128x64xbf16>, tensor<7x64x64xbf16>) -> tensor<7x64x64xbf16> + return %1 : tensor<7x64x64xbf16> + } +} + +// ----- +#any_device_tile = #tt.operand_constraint +module { + func.func @linear_negative_nd_nd_same_rank_batch_broadcast_incompatible_2(%arg0: tensor<2x7x64x128xbf16>, %arg1: tensor<7x1x128x64xbf16>) -> tensor<7x7x64x64xbf16> { + // CHECK: error: 'ttir.linear' op Batch dimensions of input A(2,7) and B(7,1) are not broadcast compatible + %0 = tensor.empty() : tensor<7x64x64xbf16> + %1 = "ttir.linear"(%arg0, %arg1, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<2x7x64x128xbf16>, tensor<7x1x128x64xbf16>, tensor<7x64x64xbf16>) -> tensor<7x7x64x64xbf16> + return %1 : tensor<7x7x64x64xbf16> + } +} + +// ----- +#any_device_tile = #tt.operand_constraint +module { + func.func @linear_negative_nd_nd_different_rank_batch_broadcast_incompatible(%arg0: tensor<12x2x7x64x128xbf16>, %arg1: tensor<7x1x128x64xbf16>) -> tensor<12x7x7x64x64xbf16> { + // CHECK: error: 'ttir.linear' op Batch dimensions of input A(12,2,7) and B(7,1) are not broadcast compatible + %0 = tensor.empty() : tensor<12x7x7x64x64xbf16> + %1 = "ttir.linear"(%arg0, %arg1, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<12x2x7x64x128xbf16>, tensor<7x1x128x64xbf16>, tensor<12x7x7x64x64xbf16>) -> tensor<12x7x7x64x64xbf16> + return %1 : tensor<12x7x7x64x64xbf16> + } +} + +// Bias shape mismatch tests +// ----- +#any_device_tile = #tt.operand_constraint +module { + func.func @linear_negative_matmul_bias_broadcast_incompatible(%arg0: tensor<64x128xbf16>, %arg1: tensor<128x64xbf16>, %bias: tensor<2x64xbf16>) -> tensor<64x64xbf16> { + // CHECK: error: 'ttir.linear' op Bias shape(2,64) is not 
broadcast compatible with the matmul output shape(64,64) + %0 = tensor.empty() : tensor<64x64xbf16> + %1 = "ttir.linear"(%arg0, %arg1, %bias, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<64x128xbf16>, tensor<128x64xbf16>, tensor<2x64xbf16>, tensor<64x64xbf16>) -> tensor<64x64xbf16> + return %1 : tensor<64x64xbf16> + } +} + +// ----- +#any_device_tile = #tt.operand_constraint +module { + func.func @linear_negative_nd_nd_matmul_bias_broadcast_incompatible(%arg0: tensor<3x64x128xbf16>, %arg1: tensor<128x64xbf16>, %bias: tensor<2x64x64xbf16>) -> tensor<3x64x64xbf16> { + // CHECK: error: 'ttir.linear' op Bias shape(2,64,64) is not broadcast compatible with the matmul output shape(3,64,64) + %0 = tensor.empty() : tensor<3x64x64xbf16> + %1 = "ttir.linear"(%arg0, %arg1, %bias, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<3x64x128xbf16>, tensor<128x64xbf16>, tensor<2x64x64xbf16>, tensor<3x64x64xbf16>) -> tensor<3x64x64xbf16> + return %1 : tensor<3x64x64xbf16> + } +} + +// Output shape mismatch tests +// ----- +#any_device_tile = #tt.operand_constraint +module { + func.func @linear_negative_2d_2d_output_shape_mismatch(%arg0: tensor<64x128xbf16>, %arg1: tensor<128x64xbf16>) -> tensor<64xbf16> { + // CHECK: error: 'ttir.linear' op Output shape rank(1) must match the expected output shape rank(2) + %0 = tensor.empty() : tensor<64xbf16> + %1 = "ttir.linear"(%arg0, %arg1, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<64x128xbf16>, tensor<128x64xbf16>, tensor<64xbf16>) -> tensor<64xbf16> + return %1 : tensor<64xbf16> + } +} + +// ----- +#any_device_tile = #tt.operand_constraint +module { + func.func @linear_negative_2d_2d_output_shape_mismatch(%arg0: tensor<64x128xbf16>, %arg1: tensor<128x64xbf16>) -> tensor<64x128xbf16> { + // CHECK: error: 'ttir.linear' op Output shape dimension[1](128) doesn't match the expected output shape dimension[1](64) + %0 = tensor.empty() : tensor<64x128xbf16> + %1 = "ttir.linear"(%arg0, %arg1, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<64x128xbf16>, tensor<128x64xbf16>, tensor<64x128xbf16>) -> tensor<64x128xbf16> + return %1 : tensor<64x128xbf16> + } +} diff --git a/test/ttmlir/Dialect/TTNN/linear/linear_tests_positive.mlir b/test/ttmlir/Dialect/TTNN/linear/linear_tests_positive.mlir new file mode 100644 index 0000000000..0e248623da --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/linear/linear_tests_positive.mlir @@ -0,0 +1,216 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline %s | FileCheck %s +#any_device_tile = #tt.operand_constraint +module { + func.func @linear_1d_1d(%arg0: tensor<128xbf16>, %arg1: tensor<128xbf16>) -> tensor<1xbf16> { + // CHECK: "ttnn.empty" + // CHECK-SAME: tensor<1xbf16 + %0 = tensor.empty() : tensor<1xbf16> + // CHECK: "ttnn.linear" + // CHECK-SAME: tensor<128xbf16 + // CHECK-SAME: tensor<128xbf16 + // CHECK-SAME: tensor<1xbf16 + // CHECK-SAME: tensor<1xbf16 + %1 = "ttir.linear"(%arg0, %arg1, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<128xbf16>, tensor<128xbf16>, tensor<1xbf16>) -> tensor<1xbf16> + return %1 : tensor<1xbf16> + } + + func.func @linear_1d_1d_bias(%arg0: tensor<128xbf16>, %arg1: tensor<128xbf16>, %bias: tensor<1xbf16>) -> tensor<1xbf16> { + // CHECK: "ttnn.empty" + // CHECK-SAME: tensor<1xbf16 + %0 = tensor.empty() : tensor<1xbf16> + // CHECK: 
"ttnn.linear" + // CHECK-SAME: tensor<128xbf16 + // CHECK-SAME: tensor<128xbf16 + // CHECK-SAME: tensor<1xbf16 + // CHECK-SAME: tensor<1xbf16 + // CHECK-SAME: tensor<1xbf16 + %1 = "ttir.linear"(%arg0, %arg1, %bias, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<128xbf16>, tensor<128xbf16>, tensor<1xbf16>, tensor<1xbf16>) -> tensor<1xbf16> + return %1 : tensor<1xbf16> + } + + func.func @linear_1d_1d_bias_broadcast(%arg0: tensor<128xbf16>, %arg1: tensor<128xbf16>, %bias: tensor<128xbf16>) -> tensor<128xbf16> { + // CHECK: "ttnn.empty" + // CHECK-SAME: tensor<128xbf16 + %0 = tensor.empty() : tensor<128xbf16> + // CHECK: "ttnn.linear" + // CHECK-SAME: tensor<128xbf16 + // CHECK-SAME: tensor<128xbf16 + // CHECK-SAME: tensor<128xbf16 + // CHECK-SAME: tensor<128xbf16 + // CHECK-SAME: tensor<128xbf16 + %1 = "ttir.linear"(%arg0, %arg1, %bias, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<128xbf16>, tensor<128xbf16>, tensor<128xbf16>, tensor<128xbf16>) -> tensor<128xbf16> + return %1 : tensor<128xbf16> + } + + func.func @linear_2d_1d(%arg0: tensor<64x128xbf16>, %arg1: tensor<128xbf16>) -> tensor<64xbf16> { + // CHECK: "ttnn.empty" + // CHECK-SAME: tensor<64xbf16 + %0 = tensor.empty() : tensor<64xbf16> + // CHECK: "ttnn.linear" + // CHECK-SAME: tensor<64x128xbf16 + // CHECK-SAME: tensor<128xbf16 + // CHECK-SAME: tensor<64xbf16 + // CHECK-SAME: tensor<64xbf16 + %1 = "ttir.linear"(%arg0, %arg1, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<64x128xbf16>, tensor<128xbf16>, tensor<64xbf16>) -> tensor<64xbf16> + return %1 : tensor<64xbf16> + } + + func.func @linear_2d_2d(%arg0: tensor<64x128xbf16>, %arg1: tensor<128x64xbf16>) -> tensor<64x64xbf16> { + // CHECK: "ttnn.empty" + // CHECK-SAME: tensor<64x64xbf16 + %0 = tensor.empty() : tensor<64x64xbf16> + // CHECK: "ttnn.linear" + // CHECK-SAME: tensor<64x128xbf16 + // CHECK-SAME: tensor<128x64xbf16 + // CHECK-SAME: tensor<64x64xbf16 + // CHECK-SAME: tensor<64x64xbf16 + %1 = "ttir.linear"(%arg0, %arg1, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<64x128xbf16>, tensor<128x64xbf16>, tensor<64x64xbf16>) -> tensor<64x64xbf16> + return %1 : tensor<64x64xbf16> + } + + func.func @linear_2d_2d_bias(%arg0: tensor<64x128xbf16>, %arg1: tensor<128x64xbf16>, %bias: tensor<64x64xbf16>) -> tensor<64x64xbf16> { + // CHECK: "ttnn.empty" + // CHECK-SAME: tensor<64x64xbf16 + %0 = tensor.empty() : tensor<64x64xbf16> + // CHECK: "ttnn.linear" + // CHECK-SAME: tensor<64x128xbf16 + // CHECK-SAME: tensor<128x64xbf16 + // CHECK-SAME: tensor<64x64xbf16 + // CHECK-SAME: tensor<64x64xbf16 + // CHECK-SAME: tensor<64x64xbf16 + %1 = "ttir.linear"(%arg0, %arg1, %bias, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<64x128xbf16>, tensor<128x64xbf16>, tensor<64x64xbf16>, tensor<64x64xbf16>) -> tensor<64x64xbf16> + return %1 : tensor<64x64xbf16> + } + + func.func @linear_1d_nd(%arg0: tensor<128xbf16>, %arg1: tensor<12x7x128x64xbf16>) -> tensor<12x7x64xbf16> { + // CHECK: "ttnn.empty" + // CHECK-SAME: tensor<12x7x64xbf16 + %0 = tensor.empty() : tensor<12x7x64xbf16> + // CHECK: "ttnn.linear" + // CHECK-SAME: tensor<128xbf16 + // CHECK-SAME: tensor<12x7x128x64xbf16 + // CHECK-SAME: tensor<12x7x64xbf16 + // CHECK-SAME: tensor<12x7x64xbf16 + %1 = "ttir.linear"(%arg0, %arg1, %0) <{operand_constraints = 
[#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<128xbf16>, tensor<12x7x128x64xbf16>, tensor<12x7x64xbf16>) -> tensor<12x7x64xbf16> + return %1 : tensor<12x7x64xbf16> + } + + func.func @linear_nd_1d(%arg0: tensor<12x7x128x64xbf16>, %arg1: tensor<64xbf16>) -> tensor<12x7x128xbf16> { + // CHECK: "ttnn.empty" + // CHECK-SAME: tensor<12x7x128xbf16 + %0 = tensor.empty() : tensor<12x7x128xbf16> + // CHECK: "ttnn.linear" + // CHECK-SAME: tensor<12x7x128x64xbf16 + // CHECK-SAME: tensor<64xbf16 + // CHECK-SAME: tensor<12x7x128xbf16 + // CHECK-SAME: tensor<12x7x128xbf16 + %1 = "ttir.linear"(%arg0, %arg1, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<12x7x128x64xbf16>, tensor<64xbf16>, tensor<12x7x128xbf16>) -> tensor<12x7x128xbf16> + return %1 : tensor<12x7x128xbf16> + } + + func.func @linear_2d_nd(%arg0: tensor<64x128xbf16>, %arg1: tensor<12x7x128x64xbf16>) -> tensor<12x7x64x64xbf16> { + // CHECK: "ttnn.empty" + // CHECK-SAME: tensor<12x7x64x64xbf16 + %0 = tensor.empty() : tensor<12x7x64x64xbf16> + // CHECK: "ttnn.linear" + // CHECK-SAME: tensor<64x128xbf16 + // CHECK-SAME: tensor<12x7x128x64xbf16 + // CHECK-SAME: tensor<12x7x64x64xbf16 + // CHECK-SAME: tensor<12x7x64x64xbf16 + %1 = "ttir.linear"(%arg0, %arg1, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<64x128xbf16>, tensor<12x7x128x64xbf16>, tensor<12x7x64x64xbf16>) -> tensor<12x7x64x64xbf16> + return %1 : tensor<12x7x64x64xbf16> + } + + func.func @linear_nd_2d(%arg0: tensor<12x7x128x64xbf16>, %arg1: tensor<64x128xbf16>) -> tensor<12x7x128x128xbf16> { + // CHECK: "ttnn.empty" + // CHECK-SAME: tensor<12x7x128x128xbf16 + %0 = tensor.empty() : tensor<12x7x128x128xbf16> + // CHECK: "ttnn.linear" + // CHECK-SAME: tensor<12x7x128x64xbf16 + // CHECK-SAME: tensor<64x128xbf16 + // CHECK-SAME: tensor<12x7x128x128xbf16 + // CHECK-SAME: tensor<12x7x128x128xbf16 + %1 = "ttir.linear"(%arg0, %arg1, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<12x7x128x64xbf16>, tensor<64x128xbf16>, tensor<12x7x128x128xbf16>) -> tensor<12x7x128x128xbf16> + return %1 : tensor<12x7x128x128xbf16> + } + + // linear nd - nd tests + func.func @linear_nd_nd_same_rank_same_dims(%arg0: tensor<7x64x128xbf16>, %arg1: tensor<7x128x64xbf16>) -> tensor<7x64x64xbf16> { + // CHECK: "ttnn.empty" + // CHECK-SAME: tensor<7x64x64xbf16 + %0 = tensor.empty() : tensor<7x64x64xbf16> + // CHECK: "ttnn.linear" + // CHECK-SAME: tensor<7x64x128xbf16 + // CHECK-SAME: tensor<7x128x64xbf16 + // CHECK-SAME: tensor<7x64x64xbf16 + // CHECK-SAME: tensor<7x64x64xbf16 + %1 = "ttir.linear"(%arg0, %arg1, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<7x64x128xbf16>, tensor<7x128x64xbf16>, tensor<7x64x64xbf16>) -> tensor<7x64x64xbf16> + return %1 : tensor<7x64x64xbf16> + } + + func.func @linear_nd_nd_same_rank_broadcastable_dims_1(%arg0: tensor<7x64x128xbf16>, %arg1: tensor<1x128x64xbf16>) -> tensor<7x64x64xbf16> { + // CHECK: "ttnn.empty" + // CHECK-SAME: tensor<7x64x64xbf16 + %0 = tensor.empty() : tensor<7x64x64xbf16> + // CHECK: "ttnn.linear" + // CHECK-SAME: tensor<7x64x128xbf16 + // CHECK-SAME: tensor<1x128x64xbf16 + // CHECK-SAME: tensor<7x64x64xbf16 + // CHECK-SAME: tensor<7x64x64xbf16 + %1 = "ttir.linear"(%arg0, %arg1, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<7x64x128xbf16>, tensor<1x128x64xbf16>, tensor<7x64x64xbf16>) -> tensor<7x64x64xbf16> + 
return %1 : tensor<7x64x64xbf16> + } + + func.func @linear_nd_nd_same_rank_broadcastable_dims_2(%arg0: tensor<1x7x64x128xbf16>, %arg1: tensor<7x1x128x64xbf16>) -> tensor<7x7x64x64xbf16> { + // CHECK: "ttnn.empty" + // CHECK-SAME: tensor<7x7x64x64xbf16 + %0 = tensor.empty() : tensor<7x7x64x64xbf16> + // CHECK: "ttnn.linear" + // CHECK-SAME: tensor<1x7x64x128xbf16 + // CHECK-SAME: tensor<7x1x128x64xbf16 + // CHECK-SAME: tensor<7x7x64x64xbf16 + // CHECK-SAME: tensor<7x7x64x64xbf16 + %1 = "ttir.linear"(%arg0, %arg1, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<1x7x64x128xbf16>, tensor<7x1x128x64xbf16>, tensor<7x7x64x64xbf16>) -> tensor<7x7x64x64xbf16> + return %1 : tensor<7x7x64x64xbf16> + } + + func.func @linear_nd_nd_different_rank_broadcastable_dims_2(%arg0: tensor<12x1x7x64x128xbf16>, %arg1: tensor<7x1x128x64xbf16>) -> tensor<12x7x7x64x64xbf16> { + // CHECK: "ttnn.empty" + // CHECK-SAME: tensor<12x7x7x64x64xbf16 + %0 = tensor.empty() : tensor<12x7x7x64x64xbf16> + // CHECK: "ttnn.linear" + // CHECK-SAME: tensor<12x1x7x64x128xbf16 + // CHECK-SAME: tensor<7x1x128x64xbf16 + // CHECK-SAME: tensor<12x7x7x64x64xbf16 + // CHECK-SAME: tensor<12x7x7x64x64xbf16 + %1 = "ttir.linear"(%arg0, %arg1, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<12x1x7x64x128xbf16>, tensor<7x1x128x64xbf16>, tensor<12x7x7x64x64xbf16>) -> tensor<12x7x7x64x64xbf16> + return %1 : tensor<12x7x7x64x64xbf16> + } + + func.func @linear_nd_nd_bias_broadcast_bias(%arg0: tensor<14x7x32x32xbf16>, %arg1:tensor<14x1x32x64xbf16>, %bias: tensor<64xbf16>) -> tensor<14x7x32x64xbf16> { + // CHECK: "ttnn.empty" + // CHECK-SAME: tensor<14x7x32x64xbf16 + %0 = tensor.empty() : tensor<14x7x32x64xbf16> + // CHECK: "ttnn.linear" + // CHECK-SAME: tensor<14x7x32x32xbf16 + // CHECK-SAME: tensor<14x1x32x64xbf16 + // CHECK-SAME: tensor<64xbf16 + // CHECK-SAME: tensor<14x7x32x64xbf16 + // CHECK-SAME: tensor<14x7x32x64xbf16 + %1 = "ttir.linear"(%arg0, %arg1, %bias, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<14x7x32x32xbf16>, tensor<14x1x32x64xbf16>, tensor<64xbf16>, tensor<14x7x32x64xbf16>) -> tensor<14x7x32x64xbf16> + return %1 : tensor<14x7x32x64xbf16> + } + + func.func @linear_nd_nd_bias_broadcast_matmul(%arg0: tensor<3x64x128xbf16>, %arg1: tensor<4x3x128x32xbf16>, %bias: tensor<14x4x3x64x32xbf16>) -> tensor<14x4x3x64x32xbf16> { + // CHECK: "ttnn.empty" + // CHECK-SAME: tensor<14x4x3x64x32xbf16 + %0 = tensor.empty() : tensor<14x4x3x64x32xbf16> + // CHECK: "ttnn.linear" + // CHECK-SAME: tensor<3x64x128xbf16 + // CHECK-SAME: tensor<4x3x128x32xbf16 + // CHECK-SAME: tensor<14x4x3x64x32xbf16 + // CHECK-SAME: tensor<14x4x3x64x32xbf16 + %1 = "ttir.linear"(%arg0, %arg1, %bias, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<3x64x128xbf16>, tensor<4x3x128x32xbf16>, tensor<14x4x3x64x32xbf16>, tensor<14x4x3x64x32xbf16>) -> tensor<14x4x3x64x32xbf16> + return %1 : tensor<14x4x3x64x32xbf16> + } +} diff --git a/test/ttmlir/Dialect/TTNN/linear/simple_linear.mlir b/test/ttmlir/Dialect/TTNN/linear/simple_linear.mlir new file mode 100644 index 0000000000..56728eb52b --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/linear/simple_linear.mlir @@ -0,0 +1,31 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline %s | FileCheck %s +#any_device_tile = #tt.operand_constraint + +module { + func.func @simple_linear_without_bias(%arg0: tensor<64x128xbf16>, 
%arg1: tensor<128x64xbf16>) -> tensor<64x64xbf16> { + // CHECK: "ttnn.empty" + // CHECK-SAME: tensor<64x64xbf16 + %0 = tensor.empty() : tensor<64x64xbf16> + // CHECK: "ttnn.linear" + // CHECK-SAME: tensor<64x128xbf16 + // CHECK-SAME: tensor<128x64xbf16 + // CHECK-SAME: tensor<64x64xbf16 + // CHECK-SAME: tensor<64x64xbf16 + %1 = "ttir.linear"(%arg0, %arg1, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<64x128xbf16>, tensor<128x64xbf16>, tensor<64x64xbf16>) -> tensor<64x64xbf16> + return %1 : tensor<64x64xbf16> + } + + func.func @simple_linear_with_bias(%arg0: tensor<64x128xbf16>, %arg1: tensor<128x64xbf16>, %bias: tensor<64x64xbf16>) -> tensor<64x64xbf16> { + // CHECK: "ttnn.empty" + // CHECK-SAME: tensor<64x64xbf16 + %0 = tensor.empty() : tensor<64x64xbf16> + // CHECK: "ttnn.linear" + // CHECK-SAME: tensor<64x128xbf16 + // CHECK-SAME: tensor<128x64xbf16 + // CHECK-SAME: tensor<64x64xbf16 + // CHECK-SAME: tensor<64x64xbf16 + // CHECK-SAME: tensor<64x64xbf16 + %1 = "ttir.linear"(%arg0, %arg1, %bias, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<64x128xbf16>, tensor<128x64xbf16>, tensor<64x64xbf16>, tensor<64x64xbf16>) -> tensor<64x64xbf16> + return %1 : tensor<64x64xbf16> + } +} diff --git a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_linear.mlir b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_linear.mlir new file mode 100644 index 0000000000..6da5d3910e --- /dev/null +++ b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_linear.mlir @@ -0,0 +1,20 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir +// RUN: FileCheck %s --input-file=%t.mlir +// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn + +#any_device_tile = #tt.operand_constraint +module { + func.func @linear(%arg0: tensor<64x128xbf16>, %arg1: tensor<128x64xbf16>, %bias: tensor<64x64xbf16>) -> tensor<64x64xbf16> { + // CHECK: "ttnn.empty" + // CHECK-SAME: tensor<64x64xbf16 + %0 = tensor.empty() : tensor<64x64xbf16> + // CHECK: "ttnn.linear" + // CHECK-SAME: tensor<64x128xbf16 + // CHECK-SAME: tensor<128x64xbf16 + // CHECK-SAME: tensor<64x64xbf16 + // CHECK-SAME: tensor<64x64xbf16 + // CHECK-SAME: tensor<64x64xbf16 + %1 = "ttir.linear"(%arg0, %arg1, %bias, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<64x128xbf16>, tensor<128x64xbf16>, tensor<64x64xbf16>, tensor<64x64xbf16>) -> tensor<64x64xbf16> + return %1 : tensor<64x64xbf16> + } +} diff --git a/test/ttmlir/Silicon/TTNN/simple_linear.mlir b/test/ttmlir/Silicon/TTNN/simple_linear.mlir new file mode 100644 index 0000000000..f53de38cf3 --- /dev/null +++ b/test/ttmlir/Silicon/TTNN/simple_linear.mlir @@ -0,0 +1,33 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir +// RUN: FileCheck %s --input-file=%t.mlir +// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn + +#any_device_tile = #tt.operand_constraint +module { + func.func @simple_linear_without_bias(%arg0: tensor<64x128xbf16>, %arg1: tensor<128x64xbf16>) -> tensor<64x64xbf16> { + // CHECK: "ttnn.empty" + // CHECK-SAME: tensor<64x64xbf16 + %0 = tensor.empty() : tensor<64x64xbf16> + // CHECK: "ttnn.linear" + // CHECK-SAME: tensor<64x128xbf16 + // CHECK-SAME: tensor<128x64xbf16 + // CHECK-SAME: tensor<64x64xbf16 + // CHECK-SAME: tensor<64x64xbf16 + %1 = "ttir.linear"(%arg0, %arg1, %0) <{operand_constraints = [#any_device_tile, 
#any_device_tile, #any_device_tile]}> : (tensor<64x128xbf16>, tensor<128x64xbf16>, tensor<64x64xbf16>) -> tensor<64x64xbf16> + return %1 : tensor<64x64xbf16> + } + + func.func @simple_linear_with_bias(%arg0: tensor<64x128xbf16>, %arg1: tensor<128x64xbf16>, %bias: tensor<64x64xbf16>) -> tensor<64x64xbf16> { + // CHECK: "ttnn.empty" + // CHECK-SAME: tensor<64x64xbf16 + %0 = tensor.empty() : tensor<64x64xbf16> + // CHECK: "ttnn.linear" + // CHECK-SAME: tensor<64x128xbf16 + // CHECK-SAME: tensor<128x64xbf16 + // CHECK-SAME: tensor<64x64xbf16 + // CHECK-SAME: tensor<64x64xbf16 + // CHECK-SAME: tensor<64x64xbf16 + %1 = "ttir.linear"(%arg0, %arg1, %bias, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<64x128xbf16>, tensor<128x64xbf16>, tensor<64x64xbf16>, tensor<64x64xbf16>) -> tensor<64x64xbf16> + return %1 : tensor<64x64xbf16> + } +} From feb127907958b14bb969ffafaf526a9509c858d4 Mon Sep 17 00:00:00 2001 From: Lewis Panos Date: Fri, 22 Nov 2024 14:40:32 -0500 Subject: [PATCH 14/84] Bringup ttir.arange, ttnn.arange. (#1332) Add conversion patterns from stablehlo.iota and stablehlo.dynamic_iota to ttir.arange Add pattern in TTIRToTTIRDecompositionPass to rewrite all ttir.arange ops where the arange_dimension is not the right-most dim. This has the effect of making-explicit the broadcasts and tms that would need to be done after executin ttnn.arange Add special TTNNLayout case for ttir.arange since it is a creation op add runtime support and basic silicon test, stablehlo silicon tests Added decomposition test --- include/ttmlir/Dialect/TTIR/IR/TTIROps.td | 42 ++++++ include/ttmlir/Dialect/TTNN/IR/TTNNOps.td | 26 ++++ include/ttmlir/Target/TTNN/program.fbs | 11 ++ .../StableHLOToTTIRPatterns.cpp | 40 +++++ .../TTIRToTTIRDecomposition.cpp | 138 ++++++++++++++++++ .../TTIRToTTIRDecompositionPass.cpp | 8 + lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp | 44 +++++- lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp | 4 +- lib/Dialect/TTIR/IR/TTIROps.cpp | 31 ++++ lib/Dialect/TTNN/IR/TTNNOps.cpp | 26 ++++ lib/Dialect/TTNN/Transforms/TTNNLayout.cpp | 29 ++++ lib/Target/TTNN/TTNNToFlatbuffer.cpp | 29 ++++ .../lib/ttnn/include/tt/runtime/ttnn/utils.h | 1 + runtime/lib/ttnn/operations/CMakeLists.txt | 1 + .../lib/ttnn/operations/creation/arange.cpp | 46 ++++++ runtime/lib/ttnn/operations/creation/arange.h | 17 +++ runtime/lib/ttnn/program.cpp | 4 + .../StableHLOToTTIR/dynamic_iota_op.mlir | 11 ++ .../Conversion/StableHLOToTTIR/iota_op.mlir | 10 ++ .../Decomposition/arange_decomposition.mlir | 11 ++ .../select_decomposition_tests.mlir | 0 .../TTNN/arange/arange_tests_negative.mlir | 12 ++ .../TTNN/arange/arange_tests_positive.mlir | 11 ++ .../Iota/simple_device_dynamic_iota_dim2.mlir | 15 ++ .../Iota/simple_device_dynamic_iota_dim3.mlir | 16 ++ .../Iota/simple_device_iota_dim2.mlir | 15 ++ .../Iota/simple_device_iota_dim3.mlir | 15 ++ .../arange/simple_device_arange_dim2.mlir | 13 ++ .../arange/simple_device_arange_dim3.mlir | 13 ++ 29 files changed, 636 insertions(+), 3 deletions(-) create mode 100644 runtime/lib/ttnn/operations/creation/arange.cpp create mode 100644 runtime/lib/ttnn/operations/creation/arange.h create mode 100644 test/ttmlir/Conversion/StableHLOToTTIR/dynamic_iota_op.mlir create mode 100644 test/ttmlir/Conversion/StableHLOToTTIR/iota_op.mlir create mode 100644 test/ttmlir/Dialect/TTIR/Decomposition/arange_decomposition.mlir rename test/ttmlir/Dialect/TTIR/{decompositions => Decomposition}/select_decomposition_tests.mlir (100%) create mode 
100644 test/ttmlir/Dialect/TTNN/arange/arange_tests_negative.mlir create mode 100644 test/ttmlir/Dialect/TTNN/arange/arange_tests_positive.mlir create mode 100644 test/ttmlir/Silicon/StableHLO/Iota/simple_device_dynamic_iota_dim2.mlir create mode 100644 test/ttmlir/Silicon/StableHLO/Iota/simple_device_dynamic_iota_dim3.mlir create mode 100644 test/ttmlir/Silicon/StableHLO/Iota/simple_device_iota_dim2.mlir create mode 100644 test/ttmlir/Silicon/StableHLO/Iota/simple_device_iota_dim3.mlir create mode 100644 test/ttmlir/Silicon/TTNN/arange/simple_device_arange_dim2.mlir create mode 100644 test/ttmlir/Silicon/TTNN/arange/simple_device_arange_dim3.mlir diff --git a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td index 5bfb77064f..aeb2de1aed 100644 --- a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td +++ b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td @@ -1048,6 +1048,48 @@ def TTIR_ClampOp : TTIR_DPSOp<"clamp"> { let hasVerifier = 1; } +def TTIR_ArangeOp : TTIR_Op<"arange"> { + let summary = "Arange operation."; + let description = [{ + Tensor arange operation. + + Produces a tensor with values from `start` to `end` (exclusive) with a step size of `step`, along the dimension specified by `arange_dimension`. + + Examples: + %0 = "ttir.arange"() {start = 0 : i64, end = 5 : i64 step = 1 : i64, arange_dimension = 0 : i64} : () -> tensor<5xi64> + // %0: [0, 1, 2, 3, 4] + + %1 = "ttir.arange"() {start = 0 : i64, end = 10 : i64, step = 2 : i64, arange_dimension = 0 : i64} : () -> tensor<5xf32> + // %1: [0.0, 2.0, 4.0, 6.0, 8.0] + + %2 = "ttir.arange"() {start = 0 : i64, end = 5 : i64, step = 1 : i64, arange_dimension = 0 : i64} : () -> tensor<5x3xi64> + // %2: [ + [0, 0, 0], + [1, 1, 1], + [2, 2, 2], + [3, 3, 3], + [4, 4, 4] + ] + + %3 = "ttir.arange"() {start = 0 : i64, end = 3 : i64, step = 1 : i64, arange_dimension = 1 : i64} : () -> tensor<5x3xi64> + // %3: [ + [0, 1, 2], + [0, 1, 2], + [0, 1, 2], + [0, 1, 2], + [0, 1, 2] + ] + }]; + + let arguments = (ins SI64Attr:$start, + SI64Attr:$end, + SI64Attr:$step, + I64Attr:$arange_dimension); + + let results = (outs AnyRankedTensor:$result); + let hasVerifier = 1; +} + def TTIR_ConstantOp : TTIR_Op<"constant", [ConstantLike, AllShapesMatch<["value", "result"]>]> { let summary = "Constant op."; diff --git a/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td b/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td index 4147cc6d08..21eb704cf7 100644 --- a/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td +++ b/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td @@ -787,6 +787,32 @@ def TTNN_EmptyOp : TTNN_Op<"empty", [NoMemoryEffect]> { let hasVerifier = 1; } +def TTNN_ArangeOp : TTNN_Op<"arange"> { + let summary = "Arange operation."; + let description = [{ + Tensor arange operation. + + Produces a (1, 1, 1, N)-shaped tensor with values from `start` to `end` (exclusive) with a step size of `step`. 
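+    Higher-rank arange results are not expressed by this op directly: when
+    lowering from `ttir.arange`, the implied transpose/reshape/broadcast are
+    made explicit first (see the TTIR decomposition pass), so this op always
+    emits the flat (1, 1, 1, N) form.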
+ + Examples: + %0 = "ttnn.arange"() {start = 0 : i64, end = 5 : i64 step = 1 : i64} : () -> tensor<1x1x1x5xi64> + // %0: [[[[0, 1, 2, 3, 4]]]] + + %1 = "ttnn.arange"() {start = 0 : i64, end = 10 : i64, step = 2 : i64} : () -> tensor<1x1x1x5xf32> + // %1: [[[[0.0, 2.0, 4.0, 6.0, 8.0]]]] + }]; + + let arguments = (ins I64Attr:$start, + I64Attr:$end, + I64Attr:$step, + OptionalAttr:$dtype, + Optional:$device, + OptionalAttr:$memory_config); + + let results = (outs AnyRankedTensor:$result); + let hasVerifier = 1; +} + def TTNN_FullOp : TTNN_Op<"full"> { let summary = "Full op."; let description = [{ diff --git a/include/ttmlir/Target/TTNN/program.fbs b/include/ttmlir/Target/TTNN/program.fbs index 0be274b4b6..5f486bac93 100644 --- a/include/ttmlir/Target/TTNN/program.fbs +++ b/include/ttmlir/Target/TTNN/program.fbs @@ -61,6 +61,16 @@ table FullOp { out: tt.target.TensorRef; } +table ArangeOp { + start: float; + end: float; + step: float; + dtype: tt.target.DataType = null; // optional + device: tt.target.DeviceRef; // optional + memcfg: tt.target.MemoryConfigDesc; // optional + out: tt.target.TensorRef; +} + enum EltwiseOpType: uint32 { Add = 0, Multiply = 1, @@ -269,6 +279,7 @@ union OpType { MaxPool2dOp, DeallocateOp, AllGatherOp, + ArangeOp, } table Operation { diff --git a/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp b/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp index 28bf4f71de..8db1b44e69 100644 --- a/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp +++ b/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp @@ -1201,6 +1201,36 @@ class StableHLOToTTIRGatherOpConversionPattern } }; +template +class StableHLOToTTIROpIotaOpConversionPattern + : public OpConversionPattern { + + using OpConversionPattern::OpConversionPattern; + +public: + LogicalResult + matchAndRewrite(SrcIotaOp srcOp, Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + RankedTensorType outputType = mlir::cast( + this->getTypeConverter()->convertType(srcOp.getResult().getType())); + rewriter.replaceOpWithNewOp( + srcOp, outputType, 0, outputType.getDimSize(adaptor.getIotaDimension()), + 1, adaptor.getIotaDimension()); + + // Dynamic Iota has an output_shape attribute but the output shape is + // already known by the result type This is to remove the operand that will + // become dead code + for (auto operand : adaptor.getOperands()) { + if (operand.getDefiningOp()) { + rewriter.eraseOp(operand.getDefiningOp()); + } + } + + return success(); + } +}; + void addElementwiseUnaryOpsConversionPatterns(MLIRContext *ctx, RewritePatternSet &patterns, TypeConverter &typeConverter) { @@ -1365,6 +1395,15 @@ void addGatherOpConversionPattern(MLIRContext *ctx, RewritePatternSet &patterns, patterns.add(typeConverter, ctx); } +void addIotaOpConversionPattern(MLIRContext *ctx, RewritePatternSet &patterns, + TypeConverter &typeConverter) { + patterns.add>( + typeConverter, ctx); + patterns + .add>( + typeConverter, ctx); +} + } // namespace namespace mlir::tt { @@ -1389,6 +1428,7 @@ void populateStableHLOToTTIRPatterns(MLIRContext *ctx, addSliceOpConversionPattern(ctx, patterns, typeConverter); addClampOpConversionPattern(ctx, patterns, typeConverter); addGatherOpConversionPattern(ctx, patterns, typeConverter); + addIotaOpConversionPattern(ctx, patterns, typeConverter); } } // namespace mlir::tt diff --git a/lib/Conversion/TTIRToTTIRDecomposition/TTIRToTTIRDecomposition.cpp b/lib/Conversion/TTIRToTTIRDecomposition/TTIRToTTIRDecomposition.cpp index 9c5afd41e6..ed7eb0be82 
100644 --- a/lib/Conversion/TTIRToTTIRDecomposition/TTIRToTTIRDecomposition.cpp +++ b/lib/Conversion/TTIRToTTIRDecomposition/TTIRToTTIRDecomposition.cpp @@ -897,6 +897,143 @@ struct SelectToSliceConversionPattern } }; +/* + * This pattern rewrites ArangeOp by forcing the arange_dimension to be + * rightmost dimension of the output tensor. This is done by replacing the + * ArangeOp with a new one that has this property, and then transposing out last + * dimension to the dimension specified by the original ArangeOp, and also + * inserting a reshape to match the rank of the intended output and broadcasts + * to repeat the data along the other dimensions. + * + * The ArangeOp that is generated here will be equivalent to how ttnn::ArangeOp + * behaves. The reason this pass is done in TTIR rather than generated when we + * want to lower to TTNN is because in the future we will want to consteval the + * ArangeOp, but have the option to not include repeated data in the constant + * tensor and broadcast at runtime instead. Consteval will be implemented for + * the TTIR dialect only and so this explication of the TMs implicit in ArangeOp + * must be done in TTIR. + */ +struct ArangeForceLastDimensionPattern + : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(ttir::ArangeOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + const RankedTensorType outputType = + mlir::cast(op.getResult().getType()); + + int64_t arangeDimension = adaptor.getArangeDimension(); + int64_t arangeDimensionNegative = arangeDimension - outputType.getRank(); + int64_t start = adaptor.getStart(); + int64_t end = adaptor.getEnd(); + int64_t step = adaptor.getStep(); + + int64_t arangeLength = (end - start) / step; + + ArrayRef ttnnShape = {1, 1, 1, arangeLength}; + if (ttnnShape == outputType.getShape()) { + return success(); + } + + RankedTensorType arangeOutputType = RankedTensorType::get( + SmallVector({1, 1, 1, arangeLength}), + outputType.getElementType(), outputType.getEncoding()); + + Value output = + rewriter + .create( // perform arange on the last dimension to + // match how ttnn behaves + op.getLoc(), arangeOutputType, start, end, step, 3) + .getResult(); + + std::vector outputShape = arangeOutputType.getShape().vec(); + // Must transpose the output so that the data changes along the axis defined + // by arangeDimension + if (arangeDimensionNegative != -1) { + std::vector transposeShape = outputShape; + transposeShape[arangeDimensionNegative + transposeShape.size()] = + arangeLength; + transposeShape[arangeOutputType.getRank() - 1] = 1; + RankedTensorType transposeType = RankedTensorType::get( + transposeShape, arangeOutputType.getElementType(), + arangeOutputType.getEncoding()); + + tensor::EmptyOp dpsOutput = rewriter.create( + op.getLoc(), transposeShape, transposeType.getElementType()); + + output = rewriter.create( + op.getLoc(), transposeType, output, dpsOutput, + arangeDimensionNegative + transposeShape.size(), + arangeOutputType.getRank() - 1, + rewriter.getArrayAttr(SmallVector( + 2, rewriter.getAttr( + OperandConstraint::AnyDeviceTile)))); + + outputShape = transposeShape; + } + + // Must match up the rank of the output with the rank of the intended output + // from the original arange, with the arangeDimension in the correct + // position + if (outputType.getRank() != static_cast(outputShape.size())) { + std::vector reshapeShape; + for (uint32_t i = 0; i < outputType.getRank(); i++) { + i == 
arangeDimension ? reshapeShape.push_back(end) + : reshapeShape.push_back(1); + } + + RankedTensorType reshapeType = RankedTensorType::get( + SmallVector(reshapeShape.begin(), reshapeShape.end()), + outputType.getElementType(), outputType.getEncoding()); + tensor::EmptyOp dpsOutput = rewriter.create( + op.getLoc(), + SmallVector(reshapeShape.begin(), reshapeShape.end()), + reshapeType.getElementType()); + output = rewriter.create( + op.getLoc(), reshapeType, output, dpsOutput, + rewriter.getI32ArrayAttr(reshapeShape), + rewriter.getArrayAttr(SmallVector( + 2, rewriter.getAttr( + OperandConstraint::AnyDeviceTile)))); + + outputShape = + std::vector(reshapeShape.begin(), reshapeShape.end()); + } + + // Must broadcast the rest of the dimensions + SmallVector broadcastDims; + for (uint32_t i = 0; i < outputShape.size(); i++) { + if (i != arangeDimension && outputShape[i] != outputType.getShape()[i]) { + outputShape[i] = outputType.getShape()[i]; + broadcastDims.push_back(rewriter.getI64IntegerAttr(i)); + } + } + if (!broadcastDims.empty()) { + RankedTensorType broadcastType = RankedTensorType::get( + outputShape, outputType.getElementType(), outputType.getEncoding()); + + tensor::EmptyOp dpsOutput = rewriter.create( + op.getLoc(), outputShape, outputType.getElementType()); + + output = rewriter.create( + op.getLoc(), broadcastType, output, dpsOutput, + rewriter.getArrayAttr(broadcastDims), + rewriter.getArrayAttr(SmallVector( + 2, rewriter.getAttr( + OperandConstraint::AnyDeviceTile)))); + + assert(mlir::cast(output.getType()).getShape() == + outputType.getShape() && + "Output shape must match the shape of the input tensor"); + } + rewriter.replaceOp(op, output); + return success(); + } +}; + void populateTTIRToTTIRDecompositionPatterns(MLIRContext *ctx, RewritePatternSet &patterns, TypeConverter &typeConverter) { @@ -906,6 +1043,7 @@ void populateTTIRToTTIRDecompositionPatterns(MLIRContext *ctx, patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); } } // namespace mlir::tt diff --git a/lib/Conversion/TTIRToTTIRDecomposition/TTIRToTTIRDecompositionPass.cpp b/lib/Conversion/TTIRToTTIRDecomposition/TTIRToTTIRDecompositionPass.cpp index d91084f59d..e244eea8fb 100644 --- a/lib/Conversion/TTIRToTTIRDecomposition/TTIRToTTIRDecompositionPass.cpp +++ b/lib/Conversion/TTIRToTTIRDecomposition/TTIRToTTIRDecompositionPass.cpp @@ -53,6 +53,14 @@ struct TTIRToTTIRDecompositionPass target.addIllegalOp(); target.addIllegalOp(); + // These are the ops that must satisfy some conditions after this pass + target.addDynamicallyLegalOp([&](ttir::ArangeOp op) { + auto shape = op.getResult().getType().getShape(); + return (static_cast(op.getArangeDimension()) == 3 && + shape.size() == 4 && shape[0] == 1 && shape[1] == 1 && + shape[2] == 1); + }); + TypeConverter typeConverter; // All types map 1:1. 
typeConverter.addConversion([](Type type) { return type; }); diff --git a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp index 52995b64c6..9dbc9cf978 100644 --- a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp +++ b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp @@ -920,6 +920,47 @@ class AllGatherOpConversionPattern } }; +class ArangeOpConversionPattern : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(ttir::ArangeOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + RankedTensorType outputType = + mlir::cast(op.getResult().getType()); + assert(static_cast(adaptor.getArangeDimension()) == + outputType.getRank() - 1 && + "Arange dimension must be the final dimension of the output tensor " + "to convert to ttnn.arange"); + + // Get ttnn::TTNNLayoutAttr of the result type + // + ttnn::TTNNLayoutAttr layoutAttr = + mlir::cast(outputType.getEncoding()); + + DataTypeAttr dtypeAttr = rewriter.getAttr( + elementTypeToDataType(outputType.getElementType())); + Value device = getOrInsertDevice(rewriter, op); + + ttnn::MemoryConfigAttr memConfigAttr = + rewriter.getAttr( + rewriter.getAttr( + layoutAttr.getMemLayout()), + rewriter.getAttr(layoutAttr.getBufferType()), + rewriter.getAttr( + rewriter.getAttr( + layoutAttr.getMemref().getShape()))); + + rewriter.replaceOpWithNewOp( + op, outputType, adaptor.getStart(), adaptor.getEnd(), adaptor.getStep(), + dtypeAttr, device, memConfigAttr); + + return success(); + } +}; + } // namespace namespace mlir::tt { @@ -988,7 +1029,8 @@ void populateTTIRToTTNNPatterns(MLIRContext *ctx, RewritePatternSet &patterns, Conv2dOpConversionPattern, MaxPool2dOpConversionPattern, SubtractOpConversionPattern, - AllGatherOpConversionPattern + AllGatherOpConversionPattern, + ArangeOpConversionPattern >(typeConverter, ctx); // ANCHOR_END: op_rewriter_pattern_set // clang-format on diff --git a/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp b/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp index 6c83200f39..c5ab71b235 100644 --- a/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp +++ b/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp @@ -668,8 +668,8 @@ void populateTTNNToEmitCPatterns(mlir::MLIRContext *ctx, // Tensor ops // patterns - .add>( - typeConverter, ctx); + .add, + DefaultOpConversionPattern>(typeConverter, ctx); // Eltwise unary ops // diff --git a/lib/Dialect/TTIR/IR/TTIROps.cpp b/lib/Dialect/TTIR/IR/TTIROps.cpp index bf734df953..3cd28626a4 100644 --- a/lib/Dialect/TTIR/IR/TTIROps.cpp +++ b/lib/Dialect/TTIR/IR/TTIROps.cpp @@ -45,6 +45,37 @@ ::mlir::LogicalResult mlir::tt::ttir::ClampOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// ArangeOp +//===----------------------------------------------------------------------===// + +::mlir::LogicalResult mlir::tt::ttir::ArangeOp::verify() { + int64_t start = getStart(); + int64_t end = getEnd(); + int64_t step = getStep(); + + if (step == 0) { + return emitOpError("Step value cannot be zero"); + } + + int64_t numValues = (end - start) / step; + + if (numValues <= 0) { + return emitOpError() << "Invalid range: start=" << start << ", end=" << end + << ", step=" << step; + } + + if (numValues != getType().getDimSize(getArangeDimension())) { + return emitOpError() << "Output tensor shape must be " << numValues + << " at dim " << getArangeDimension() + << " (since start=" << start << ", end=" << end + << ", step=" << step << "), but got " + << 
getType().getDimSize(getArangeDimension()); + } + + return success(); +} + //===----------------------------------------------------------------------===// // ConstantOp //===----------------------------------------------------------------------===// diff --git a/lib/Dialect/TTNN/IR/TTNNOps.cpp b/lib/Dialect/TTNN/IR/TTNNOps.cpp index c4f0d73941..b3201cf67c 100644 --- a/lib/Dialect/TTNN/IR/TTNNOps.cpp +++ b/lib/Dialect/TTNN/IR/TTNNOps.cpp @@ -140,6 +140,32 @@ ::mlir::LogicalResult mlir::tt::ttnn::MaxPool2dOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// ArangeOp +//===----------------------------------------------------------------------===// + +::mlir::LogicalResult mlir::tt::ttnn::ArangeOp::verify() { + + if (getStep() == 0) { + return emitOpError("Step cannot be zero."); + } + + int64_t numValues = (getEnd() - getStart()) / getStep(); + + if (numValues <= 0) { + return emitOpError("Invalid range: start=") + << getStart() << ", end=" << getEnd() << ", step=" << getStep(); + } + + std::vector expectedShape = {1, 1, 1, numValues}; + if (getType().getShape().vec() != expectedShape) { + return emitOpError() << "Output tensor shape must be " << expectedShape + << ", but got " << getType().getShape(); + } + + return success(); +} + //===----------------------------------------------------------------------===// // EmptyOp //===----------------------------------------------------------------------===// diff --git a/lib/Dialect/TTNN/Transforms/TTNNLayout.cpp b/lib/Dialect/TTNN/Transforms/TTNNLayout.cpp index eebfdc13f3..2d4a2ff8f5 100644 --- a/lib/Dialect/TTNN/Transforms/TTNNLayout.cpp +++ b/lib/Dialect/TTNN/Transforms/TTNNLayout.cpp @@ -214,6 +214,28 @@ createToLayoutOp(PatternRewriter &rewriter, Location loc, Value input, .getResult(); } + // If the input tensor is an arange, we want to set the desired layout just + // like the other creation ops. However, a caveat is that in ttnn, arange is + // hardcoded to be ROW_MAJOR. So we must ensure that the layout we assign to + // it is ROW_MAJOR - and to make it tile layout we still must insert + // ToLayoutOp on its output. We can do this by setting the element type to + // ty.getElementType() in case desiredElementType is a TileType. 
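+  // Illustrative sketch only (op spellings abbreviated, shapes hypothetical):
+  // an arange feeding a tile-layout consumer is expected to end up roughly as
+  //   %0 = "ttnn.arange"(...)        // row-major result, as ttnn requires
+  //   %1 = "ttnn.to_layout"(%0) ...  // converts %0 to the desired tile layout
+  // rather than tagging the arange result with a tile layout directly.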
+ ttir::ArangeOp existingArange = input.getDefiningOp(); + if (existingArange) { + TTNNLayoutAttr arangeLayout = rewriter.getAttr( + ty.getShape(), ty.getElementType(), desiredBufferType, + tensorConfig.getGrid(), desiredMemLayout, g_defaultCollapseDims); + input = + rewriter + .replaceOpWithNewOp( + existingArange, + mlir::RankedTensorType::get(ty.getShape(), ty.getElementType(), + arangeLayout), + existingArange.getStart(), existingArange.getEnd(), + existingArange.getStep(), existingArange.getArangeDimension()) + .getResult(); + } + // If the input tensor is not a constant or empty tensor, we need to create a // new tensor with the desired layout which will be used as the output of the // ToLayoutOp @@ -281,6 +303,13 @@ class TTNNLayoutDPSOperandsRewriter continue; } + // If the operand is a BroadcastOp or a ToLayout op do not put a + // ToLayoutOp on its output + if (operand.get().getDefiningOp() || + operand.get().getDefiningOp()) { + continue; + } + // Read operand constrait for current operand OperandConstraint operandConstraint = mlir::cast( diff --git a/lib/Target/TTNN/TTNNToFlatbuffer.cpp b/lib/Target/TTNN/TTNNToFlatbuffer.cpp index 8971963f2a..5677ce94b1 100644 --- a/lib/Target/TTNN/TTNNToFlatbuffer.cpp +++ b/lib/Target/TTNN/TTNNToFlatbuffer.cpp @@ -28,6 +28,7 @@ #include "mlir/Dialect/EmitC/IR/EmitC.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Support/LogicalResult.h" +#include "types_generated.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -333,6 +334,31 @@ createOp(FlatbufferObjectCache &cache, FullOp op) { kHostAllocatedSize)); } +::flatbuffers::Offset<::tt::target::ttnn::ArangeOp> +createOp(FlatbufferObjectCache &cache, ArangeOp op) { + + std::optional<::tt::target::DataType> dtype = + op.getDtype().has_value() + ? std::make_optional(toFlatbuffer(cache, op.getDtype().value())) + : std::nullopt; + auto device = + op.getDevice() ? cache.at<::tt::target::DeviceRef>(op.getDevice()) : 0; + + auto memoryConfigDesc = op.getMemoryConfig().has_value() + ? 
cache.getOrCreate(op.getMemoryConfig().value(), + memoryConfigToFlatbuffer) + : 0; + + auto output = cache.getOrCreate(op.getResult(), tensorValueToFlatbuffer, + kHostAllocatedAddress, kHostAllocatedSize); + + return ::tt::target::ttnn::CreateArangeOp( + *cache.fbb, static_cast(op.getStart()), + static_cast(op.getEnd()), static_cast(op.getStep()), + dtype /* optional */, device /* optional */, + memoryConfigDesc /* optional */, output); +} + ::flatbuffers::Offset<::tt::target::ttnn::LinearOp> createOp(FlatbufferObjectCache &cache, LinearOp op) { auto in0 = @@ -887,6 +913,9 @@ emitTTNNOperation(FlatbufferObjectCache &cache, Operation *op, if (auto geluOp = dyn_cast(op); geluOp) { return createOperation(cache, createEltwiseOp(cache, geluOp), debugString); } + if (auto arangeOp = dyn_cast(op); arangeOp) { + return createOperation(cache, createOp(cache, arangeOp), debugString); + } llvm_unreachable("unhandled op in emitTTNNOperation"); } diff --git a/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.h b/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.h index ca50ad58b3..75b22d1145 100644 --- a/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.h +++ b/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.h @@ -6,6 +6,7 @@ #define TT_RUNTIME_TTNN_UTILS_H #include "flatbuffers/vector.h" +#include "tt_metal/impl/buffers/buffer.hpp" #include "ttmlir/Target/Common/types_generated.h" #include "ttmlir/Target/TTNN/Target.h" #include "ttnn/types.hpp" diff --git a/runtime/lib/ttnn/operations/CMakeLists.txt b/runtime/lib/ttnn/operations/CMakeLists.txt index 4edc4780b9..38115803f0 100644 --- a/runtime/lib/ttnn/operations/CMakeLists.txt +++ b/runtime/lib/ttnn/operations/CMakeLists.txt @@ -5,6 +5,7 @@ set(TTNN_OPS_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/include/tt/runtime/ttnn/operations/eltwise/ternary/utils.cpp ${CMAKE_CURRENT_SOURCE_DIR}/ccl/all_gather.cpp ${CMAKE_CURRENT_SOURCE_DIR}/conv/conv2d.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/creation/arange.cpp ${CMAKE_CURRENT_SOURCE_DIR}/creation/empty.cpp ${CMAKE_CURRENT_SOURCE_DIR}/creation/full.cpp ${CMAKE_CURRENT_SOURCE_DIR}/data_movement/concat.cpp diff --git a/runtime/lib/ttnn/operations/creation/arange.cpp b/runtime/lib/ttnn/operations/creation/arange.cpp new file mode 100644 index 0000000000..446cdf72ad --- /dev/null +++ b/runtime/lib/ttnn/operations/creation/arange.cpp @@ -0,0 +1,46 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "arange.h" +#include "tt/runtime/detail/logger.h" +#include "tt/runtime/ttnn/operations/utils.h" +#include "tt/runtime/ttnn/utils.h" +#include +#include +#include + +namespace tt::runtime::ttnn::operations::creation { +void run(const ::tt::target::ttnn::ArangeOp *op, ProgramContext &context) { + ProgramTensorPool &tensorPool = context.getTensorPool(); + ::ttnn::DataType dtype = + ::ttnn::DataType::BFLOAT16; // Default in arange implementation + std::optional> device = std::nullopt; + ::ttnn::MemoryConfig memoryConfig = + ::ttnn::DRAM_MEMORY_CONFIG; // Default in arange implementation + + if (op->dtype()) { + dtype = ::tt::runtime::ttnn::utils::toTTNNDataType(*(op->dtype())); + } + + if (op->memcfg()) { + memoryConfig = utils::createMemoryConfig(op->memcfg(), op->out()); + } + + if (op->device()) { + // ttnn::arange supports no device (host) and single device + DeviceVariant targetDevice = + context.getTargetDevice(op->device()->global_id()); + + LOG_ASSERT(std::holds_alternative>( + targetDevice), + "ttnn::arange does not support MeshDevice."); + device = std::make_optional( + 
std::get>(targetDevice)); + } + ::ttnn::Tensor out = ::ttnn::arange(op->start(), op->end(), op->step(), dtype, + device, memoryConfig); + + utils::updateTensorPool(tensorPool, out, op->out()->global_id()); +} +} // namespace tt::runtime::ttnn::operations::creation diff --git a/runtime/lib/ttnn/operations/creation/arange.h b/runtime/lib/ttnn/operations/creation/arange.h new file mode 100644 index 0000000000..157ee2dc61 --- /dev/null +++ b/runtime/lib/ttnn/operations/creation/arange.h @@ -0,0 +1,17 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef RUNTIME_LIB_TTNN_OPERATIONS_CREATION_ARANGE_H +#define RUNTIME_LIB_TTNN_OPERATIONS_CREATION_ARANGE_H + +#include "tt/runtime/ttnn/types.h" +#include "ttmlir/Target/TTNN/program_generated.h" + +namespace tt::runtime::ttnn::operations::creation { + +void run(const ::tt::target::ttnn::ArangeOp *op, ProgramContext &context); + +} // namespace tt::runtime::ttnn::operations::creation + +#endif diff --git a/runtime/lib/ttnn/program.cpp b/runtime/lib/ttnn/program.cpp index fbd58c5939..48b0be7ff4 100644 --- a/runtime/lib/ttnn/program.cpp +++ b/runtime/lib/ttnn/program.cpp @@ -4,6 +4,7 @@ #include "operations/ccl/all_gather.h" #include "operations/context/get_device.h" #include "operations/conv/conv2d.h" +#include "operations/creation/arange.h" #include "operations/creation/empty.h" #include "operations/creation/full.h" #include "operations/data_movement/concat.h" @@ -189,6 +190,9 @@ void ProgramExecutor::runOperation(const ::tt::target::ttnn::Operation *op) { case ::tt::target::ttnn::OpType::AllGatherOp: { return operations::ccl::run(op->type_as_AllGatherOp(), context); } + case ::tt::target::ttnn::OpType::ArangeOp: { + return operations::creation::run(op->type_as_ArangeOp(), context); + } default: { LOG_FATAL("Unsupported operation type"); } diff --git a/test/ttmlir/Conversion/StableHLOToTTIR/dynamic_iota_op.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/dynamic_iota_op.mlir new file mode 100644 index 0000000000..43241ac6f0 --- /dev/null +++ b/test/ttmlir/Conversion/StableHLOToTTIR/dynamic_iota_op.mlir @@ -0,0 +1,11 @@ +// REQUIRES: stablehlo +// RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | FileCheck %s +#any_device = #tt.operand_constraint +module @jit_dnamic_iota attributes {} { + func.func public @test_dynamic_iota() -> tensor<1x32x128x128xf32> { + // CHECK: %[[C:.*]] = "ttir.arange"[[C:.*]] + %output_shape = stablehlo.constant dense<[1, 32, 128, 128]> : tensor<4xi64> + %0 = "stablehlo.dynamic_iota"(%output_shape) {iota_dimension = 1: i64} : (tensor<4xi64>) -> tensor<1x32x128x128xf32> + return %0 : tensor<1x32x128x128xf32> + } +} diff --git a/test/ttmlir/Conversion/StableHLOToTTIR/iota_op.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/iota_op.mlir new file mode 100644 index 0000000000..857a621bb0 --- /dev/null +++ b/test/ttmlir/Conversion/StableHLOToTTIR/iota_op.mlir @@ -0,0 +1,10 @@ +// REQUIRES: stablehlo +// RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | FileCheck %s +#any_device = #tt.operand_constraint +module @jit_iota attributes {} { + func.func public @test_iota() -> tensor<1x32x128x128xf32> { + // CHECK: %[[C:.*]] = "ttir.arange"[[C:.*]] + %0 = "stablehlo.iota"() {iota_dimension = 1: i64} : () -> tensor<1x32x128x128xf32> + return %0 : tensor<1x32x128x128xf32> + } +} diff --git a/test/ttmlir/Dialect/TTIR/Decomposition/arange_decomposition.mlir b/test/ttmlir/Dialect/TTIR/Decomposition/arange_decomposition.mlir new file mode 100644 index 0000000000..6f72e56f17 --- /dev/null +++ 
b/test/ttmlir/Dialect/TTIR/Decomposition/arange_decomposition.mlir @@ -0,0 +1,11 @@ +// RUN: ttmlir-opt --ttir-to-ttir-decomposition %s | FileCheck %s +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<1x32x128x128xf32>) -> tensor<1x32x128x128xf32> { + // CHECK: %[[C:.*]] = "ttir.arange"[[C:.*]] + // CHECK: %[[C:.*]] = "ttir.transpose"[[C:.*]] + // CHECK: %[[C:.*]] = "ttir.broadcast"[[C:.*]] + %1 = "ttir.arange"() <{start = 0: si64, end = 32: si64, step = 1: si64, arange_dimension = 1: i64}> : () -> tensor<1x32x128x128xf32> + return %1 : tensor<1x32x128x128xf32> + } +} diff --git a/test/ttmlir/Dialect/TTIR/decompositions/select_decomposition_tests.mlir b/test/ttmlir/Dialect/TTIR/Decomposition/select_decomposition_tests.mlir similarity index 100% rename from test/ttmlir/Dialect/TTIR/decompositions/select_decomposition_tests.mlir rename to test/ttmlir/Dialect/TTIR/Decomposition/select_decomposition_tests.mlir diff --git a/test/ttmlir/Dialect/TTNN/arange/arange_tests_negative.mlir b/test/ttmlir/Dialect/TTNN/arange/arange_tests_negative.mlir new file mode 100644 index 0000000000..dc3f09fbaf --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/arange/arange_tests_negative.mlir @@ -0,0 +1,12 @@ +// RUN: not ttmlir-opt --split-input-file %s 2>&1 | FileCheck %s +// Negative tests for matmul operation +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<1x32x128x128xf32>) -> tensor<1x32x128x128xf32> { + // CHECK: error: 'ttir.arange' op Output tensor shape must be 16 at dim 1 (since start=0, end=32, step=2), but got 32 + %1 = "ttir.arange"() <{start = 0: si64, end = 32: si64, step = 2: si64, arange_dimension = 1: i64}> : () -> tensor<1x32x128x128xf32> + %dps = tensor.empty() : tensor<1x32x128x128xf32> + %2 = "ttir.multiply"(%arg0, %1, %dps) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<1x32x128x128xf32>, tensor<1x32x128x128xf32>, tensor<1x32x128x128xf32>) -> tensor<1x32x128x128xf32> + return %2 : tensor<1x32x128x128xf32> + } +} diff --git a/test/ttmlir/Dialect/TTNN/arange/arange_tests_positive.mlir b/test/ttmlir/Dialect/TTNN/arange/arange_tests_positive.mlir new file mode 100644 index 0000000000..4c04e138bb --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/arange/arange_tests_positive.mlir @@ -0,0 +1,11 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline %s | FileCheck %s +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<1x32x128x128xf32>) -> tensor<1x32x128x128xf32> { + // CHECK: %[[C:.*]] = "ttnn.arange"[[C:.*]] + %1 = "ttir.arange"() <{start = 0: si64, end = 32: si64, step = 1: si64, arange_dimension = 1: i64}> : () -> tensor<1x32x128x128xf32> + %dps = tensor.empty() : tensor<1x32x128x128xf32> + %2 = "ttir.multiply"(%arg0, %1, %dps) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<1x32x128x128xf32>, tensor<1x32x128x128xf32>, tensor<1x32x128x128xf32>) -> tensor<1x32x128x128xf32> + return %2 : tensor<1x32x128x128xf32> + } +} diff --git a/test/ttmlir/Silicon/StableHLO/Iota/simple_device_dynamic_iota_dim2.mlir b/test/ttmlir/Silicon/StableHLO/Iota/simple_device_dynamic_iota_dim2.mlir new file mode 100644 index 0000000000..d911ec6fe2 --- /dev/null +++ b/test/ttmlir/Silicon/StableHLO/Iota/simple_device_dynamic_iota_dim2.mlir @@ -0,0 +1,15 @@ +// REQUIRES: stablehlo +// RUN: rm -rf %t.ttnn +// RUN: rm -rf %t.mlir +// RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | 
\ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" > %t.mlir +// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +// RUN: FileCheck --input-file=%t.mlir %s +module attributes {} { + func.func @forward(%arg0: tensor<1x1x32x128xbf16>) -> tensor<1x1x32x128xbf16> { + // CHECK: ttnn.arange + %0 = "stablehlo.iota"() {iota_dimension = 2: i64} : () -> tensor<1x1x32x128xbf16> + %2 = "stablehlo.multiply"(%arg0, %0) : (tensor<1x1x32x128xbf16>, tensor<1x1x32x128xbf16>) -> tensor<1x1x32x128xbf16> + return %2 : tensor<1x1x32x128xbf16> + } +} diff --git a/test/ttmlir/Silicon/StableHLO/Iota/simple_device_dynamic_iota_dim3.mlir b/test/ttmlir/Silicon/StableHLO/Iota/simple_device_dynamic_iota_dim3.mlir new file mode 100644 index 0000000000..01aa0e91b3 --- /dev/null +++ b/test/ttmlir/Silicon/StableHLO/Iota/simple_device_dynamic_iota_dim3.mlir @@ -0,0 +1,16 @@ +// REQUIRES: stablehlo +// RUN: rm -rf %t.ttnn +// RUN: rm -rf %t.mlir +// RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | \ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" > %t.mlir +// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +// RUN: FileCheck --input-file=%t.mlir %s +module attributes {} { + func.func @forward(%arg0: tensor<1x1x32x128xbf16>) -> tensor<1x1x32x128xbf16> { + %output_shape = stablehlo.constant dense<[1, 1, 32, 128]> : tensor<4xi64> + // CHECK: ttnn.arange + %0 = "stablehlo.dynamic_iota"(%output_shape) {iota_dimension = 3: i64} : (tensor<4xi64>) -> tensor<1x1x32x128xbf16> + %2 = "stablehlo.multiply"(%arg0, %0) : (tensor<1x1x32x128xbf16>, tensor<1x1x32x128xbf16>) -> tensor<1x1x32x128xbf16> + return %2 : tensor<1x1x32x128xbf16> + } +} diff --git a/test/ttmlir/Silicon/StableHLO/Iota/simple_device_iota_dim2.mlir b/test/ttmlir/Silicon/StableHLO/Iota/simple_device_iota_dim2.mlir new file mode 100644 index 0000000000..d911ec6fe2 --- /dev/null +++ b/test/ttmlir/Silicon/StableHLO/Iota/simple_device_iota_dim2.mlir @@ -0,0 +1,15 @@ +// REQUIRES: stablehlo +// RUN: rm -rf %t.ttnn +// RUN: rm -rf %t.mlir +// RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | \ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" > %t.mlir +// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +// RUN: FileCheck --input-file=%t.mlir %s +module attributes {} { + func.func @forward(%arg0: tensor<1x1x32x128xbf16>) -> tensor<1x1x32x128xbf16> { + // CHECK: ttnn.arange + %0 = "stablehlo.iota"() {iota_dimension = 2: i64} : () -> tensor<1x1x32x128xbf16> + %2 = "stablehlo.multiply"(%arg0, %0) : (tensor<1x1x32x128xbf16>, tensor<1x1x32x128xbf16>) -> tensor<1x1x32x128xbf16> + return %2 : tensor<1x1x32x128xbf16> + } +} diff --git a/test/ttmlir/Silicon/StableHLO/Iota/simple_device_iota_dim3.mlir b/test/ttmlir/Silicon/StableHLO/Iota/simple_device_iota_dim3.mlir new file mode 100644 index 0000000000..a231432abc --- /dev/null +++ b/test/ttmlir/Silicon/StableHLO/Iota/simple_device_iota_dim3.mlir @@ -0,0 +1,15 @@ +// REQUIRES: stablehlo +// RUN: rm -rf %t.ttnn +// RUN: rm -rf %t.mlir +// RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | \ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" > %t.mlir +// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +// RUN: FileCheck --input-file=%t.mlir %s +module attributes {} { + func.func @forward(%arg0: tensor<1x1x32x128xbf16>) -> tensor<1x1x32x128xbf16> { + // CHECK: ttnn.arange + %0 = "stablehlo.iota"() {iota_dimension = 3: i64} : () 
-> tensor<1x1x32x128xbf16> + %2 = "stablehlo.multiply"(%arg0, %0) : (tensor<1x1x32x128xbf16>, tensor<1x1x32x128xbf16>) -> tensor<1x1x32x128xbf16> + return %2 : tensor<1x1x32x128xbf16> + } +} diff --git a/test/ttmlir/Silicon/TTNN/arange/simple_device_arange_dim2.mlir b/test/ttmlir/Silicon/TTNN/arange/simple_device_arange_dim2.mlir new file mode 100644 index 0000000000..ec509a1b6f --- /dev/null +++ b/test/ttmlir/Silicon/TTNN/arange/simple_device_arange_dim2.mlir @@ -0,0 +1,13 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir +// RUN: FileCheck %s --input-file=%t.mlir +// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<1x1x32x128xbf16>) -> tensor<1x1x32x128xbf16> { + // CHECK: %[[C:.*]] = "ttnn.arange"[[C:.*]] + %0 = "ttir.arange"() <{start = 0: si64, end = 64: si64, step = 2: si64, arange_dimension = 2: i64}> : () -> tensor<1x1x32x128xbf16> + %1 = tensor.empty() : tensor<1x1x32x128xbf16> + %2 = "ttir.multiply"(%arg0, %0, %1) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<1x1x32x128xbf16>, tensor<1x1x32x128xbf16>, tensor<1x1x32x128xbf16>) -> tensor<1x1x32x128xbf16> + return %2 : tensor<1x1x32x128xbf16> + } +} diff --git a/test/ttmlir/Silicon/TTNN/arange/simple_device_arange_dim3.mlir b/test/ttmlir/Silicon/TTNN/arange/simple_device_arange_dim3.mlir new file mode 100644 index 0000000000..196e757096 --- /dev/null +++ b/test/ttmlir/Silicon/TTNN/arange/simple_device_arange_dim3.mlir @@ -0,0 +1,13 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir +// RUN: FileCheck %s --input-file=%t.mlir +// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<1x1x32x128xbf16>) -> tensor<1x1x32x128xbf16> { + // CHECK: %[[C:.*]] = "ttnn.arange"[[C:.*]] + %0 = "ttir.arange"() <{start = 0: si64, end = 128: si64, step = 1: si64, arange_dimension = 3: i64}> : () -> tensor<1x1x32x128xbf16> + %1 = tensor.empty() : tensor<1x1x32x128xbf16> + %2 = "ttir.multiply"(%arg0, %0, %1) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<1x1x32x128xbf16>, tensor<1x1x32x128xbf16>, tensor<1x1x32x128xbf16>) -> tensor<1x1x32x128xbf16> + return %2 : tensor<1x1x32x128xbf16> + } +} From c908d529c2d435aeaa1ef96a3bd3288fa0f736d5 Mon Sep 17 00:00:00 2001 From: Collin Tod Date: Fri, 22 Nov 2024 14:02:17 -0600 Subject: [PATCH 15/84] Ignore `*.ttnn` & `*.ttm` Files (#1365) These flatbuffer files are generated as part of `test_infra`, and should not be comitted. 
--- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index b206279832..274c39c1f4 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,7 @@ query_results.json run_results.json ttrt_report.xml cluster_descriptor.yaml + +# TTNN and TTMetal flatbuffers +*.ttnn +*.ttm From 1609d0182e4f4f61e738aae9b1502cfea79a493f Mon Sep 17 00:00:00 2001 From: Sterling Taylor <166402033+staylorTT@users.noreply.github.com> Date: Mon, 25 Nov 2024 09:09:24 -0600 Subject: [PATCH 16/84] Use an actual defined ID (#1384) --- .github/workflows/issue-last-updated.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/issue-last-updated.yml b/.github/workflows/issue-last-updated.yml index a2cf766dd5..5fed3d1882 100644 --- a/.github/workflows/issue-last-updated.yml +++ b/.github/workflows/issue-last-updated.yml @@ -90,9 +90,9 @@ jobs: fi - # Parse the item ID if it matches the ISSUE_NODE_ID - ITEM_ID=$(echo "$RESPONSE" | jq -r --arg ISSUE_NODE_ID "$ISSUE_NODE_ID" \ - '.data.node.items.nodes[] | select(.content.id==$ISSUE_NODE_ID) | .id') + # Parse the item ID if it matches the issue_id + ITEM_ID=$(echo "$RESPONSE" | jq -r --arg issue_id "$issue_id" \ + '.data.node.items.nodes[] | select(.content.id==$issue_id) | .id') # If ITEM_ID is found, output it and stop the loop From 02df31c454c60f5ec1260c6bd0945284fdca7b17 Mon Sep 17 00:00:00 2001 From: Vraj Prajapati Date: Mon, 25 Nov 2024 10:26:51 -0600 Subject: [PATCH 17/84] TTNN Rendering Support in TT-Explorer (#1298) * Added maybe_downcast & hardened TT Attrs and Types to include better support * Removed manual maybe_downcast, added tt_class * Removed redundant imports * Lint Fixes * new MLIR module for parsing TTNN modules * Added TTNNLayout Support + Fixes * editable on Debug, minor fixes * Requested Changes * Removed stale import * Removed stale import --- include/ttmlir-c/TTAttrs.h | 3 + include/ttmlir-c/TTNNAttrs.h | 5 + lib/CAPI/TTAttrs.cpp | 4 + lib/CAPI/TTNNAttrs.cpp | 10 + python/TTModule.cpp | 25 +- python/TTNNModule.cpp | 23 + python/ttmlir/dialects/ttnn.py | 1 + tools/explorer/CMakeLists.txt | 2 +- .../tt_adapter/src/tt_adapter/main.py | 4 +- .../tt_adapter/src/tt_adapter/mlir.py | 571 ++++++++++++++++++ .../tt_adapter/src/tt_adapter/ttir.py | 149 ----- .../tt_adapter/src/tt_adapter/utils.py | 4 +- 12 files changed, 643 insertions(+), 158 deletions(-) create mode 100644 tools/explorer/tt_adapter/src/tt_adapter/mlir.py delete mode 100644 tools/explorer/tt_adapter/src/tt_adapter/ttir.py diff --git a/include/ttmlir-c/TTAttrs.h b/include/ttmlir-c/TTAttrs.h index fbbe8de4bd..2e164ac132 100644 --- a/include/ttmlir-c/TTAttrs.h +++ b/include/ttmlir-c/TTAttrs.h @@ -84,6 +84,9 @@ MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTChipPhysicalCoresAttrGet( MlirAttribute *dram, size_t dramSize, MlirAttribute *eth, size_t ethSize, MlirAttribute *eth_inactive, size_t eth_inactiveSize); +MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTCoreCoordAttrGet(MlirContext ctx, + int64_t y, int64_t x); + #ifdef __cplusplus } #endif diff --git a/include/ttmlir-c/TTNNAttrs.h b/include/ttmlir-c/TTNNAttrs.h index a7f5a8170d..ea3e333c2d 100644 --- a/include/ttmlir-c/TTNNAttrs.h +++ b/include/ttmlir-c/TTNNAttrs.h @@ -5,6 +5,7 @@ #ifndef TTMLIR_C_TTNNATTRS_H #define TTMLIR_C_TTNNATTRS_H +#include "mlir-c/AffineMap.h" #include "ttmlir-c/Dialects.h" #ifdef __cplusplus @@ -44,6 +45,10 @@ MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTNNMeshShapeAttrGet(MlirContext ctx, int64_t y, int64_t x); +MLIR_CAPI_EXPORTED MlirAttribute 
ttmlirTTNNTTNNLayoutAttrGet( + MlirContext ctx, MlirAffineMap linear, MlirAttribute grid, MlirType memref, + unsigned memLayout); + #ifdef __cplusplus } #endif diff --git a/lib/CAPI/TTAttrs.cpp b/lib/CAPI/TTAttrs.cpp index 40a3ada6fb..196dc09f47 100644 --- a/lib/CAPI/TTAttrs.cpp +++ b/lib/CAPI/TTAttrs.cpp @@ -219,4 +219,8 @@ MlirAttribute ttmlirTTChipPhysicalCoresAttrGet( ethVec, ethInactiveVec)); } +MlirAttribute ttmlirTTCoreCoordAttrGet(MlirContext ctx, int64_t y, int64_t x) { + return wrap(CoreCoordAttr::get(unwrap(ctx), y, x)); +} + } // namespace mlir::tt diff --git a/lib/CAPI/TTNNAttrs.cpp b/lib/CAPI/TTNNAttrs.cpp index 0fb1066cb8..677f22fb42 100644 --- a/lib/CAPI/TTNNAttrs.cpp +++ b/lib/CAPI/TTNNAttrs.cpp @@ -69,4 +69,14 @@ MlirAttribute ttmlirTTNNMeshShapeAttrGet(MlirContext ctx, int64_t y, return wrap(MeshShapeAttr::get(unwrap(ctx), y, x)); } +MlirAttribute ttmlirTTNNTTNNLayoutAttrGet(MlirContext ctx, MlirAffineMap linear, + MlirAttribute grid, MlirType memref, + unsigned memLayout) { + mlir::AffineMap affineMap = mlir::AffineMap::getFromOpaquePointer(linear.ptr); + return wrap(TTNNLayoutAttr::get(unwrap(ctx), affineMap, + mlir::cast(unwrap(grid)), + mlir::cast(unwrap(memref)), + static_cast(memLayout))); +} + } // namespace mlir::tt::ttnn diff --git a/python/TTModule.cpp b/python/TTModule.cpp index 7417866a52..c70d7df974 100644 --- a/python/TTModule.cpp +++ b/python/TTModule.cpp @@ -90,7 +90,8 @@ void populateTTModule(py::module &m) { return static_cast(la.getOobVal()); }) .def_property_readonly("grid_attr", &tt::LayoutAttr::getGrid) - .def_property_readonly("memref", &tt::LayoutAttr::getMemref) + .def_property_readonly( + "memref", [](tt::LayoutAttr self) { return wrap(self.getMemref()); }) .def_property_readonly("memory_space", &tt::LayoutAttr::getMemorySpace) .def_property_readonly("memory_space_as_int", [](tt::LayoutAttr la) { @@ -99,6 +100,8 @@ void populateTTModule(py::module &m) { }) .def_property_readonly("shard_shape", &tt::LayoutAttr::getShardShape) .def_property_readonly("memory_layout", &tt::LayoutAttr::getMemLayout) + .def_property_readonly( + "linear", [](tt::LayoutAttr self) { return wrap(self.getLinear()); }) .def_property_readonly("memory_layout_as_int", [](tt::LayoutAttr la) { return static_cast(la.getMemLayout()); }); @@ -236,6 +239,14 @@ void populateTTModule(py::module &m) { return self.getEthInactive().vec(); }); + tt_attribute_class(m, "CoreCoordAttr") + .def_static("get", + [](MlirContext ctx, int64_t y, int64_t x) { + return wrap(tt::CoreCoordAttr::get(unwrap(ctx), y, x)); + }) + .def_property_readonly("y", &tt::CoreCoordAttr::getY) + .def_property_readonly("x", &tt::CoreCoordAttr::getX); + tt_attribute_class(m, "ChipCoordAttr") .def_static("get", [](MlirContext ctx, unsigned rack, unsigned shelf, unsigned y, @@ -430,8 +441,11 @@ void populateTTModule(py::module &m) { return mlir::cast(unwrap(self)); }) .def_property_readonly("grid_attr", &tt::DeviceAttr::getWorkerGrid) - .def_property_readonly("l1_map", &tt::DeviceAttr::getL1Map) - .def_property_readonly("dram_map", &tt::DeviceAttr::getDramMap) + .def_property_readonly( + "l1_map", [](tt::DeviceAttr self) { return wrap(self.getL1Map()); }) + .def_property_readonly( + "dram_map", + [](tt::DeviceAttr self) { return wrap(self.getDramMap()); }) .def_property_readonly( "mesh_shape", [](tt::DeviceAttr const &self) { return self.getMeshShape().vec(); }) @@ -447,7 +461,10 @@ void populateTTModule(py::module &m) { unwrap(ctx), SmallVector{height, width}, static_cast(dataType))); }) - 
.def_property_readonly("data_type", &tt::TileType::getDataType) + .def_property_readonly("data_type_as_int", + [](tt::TileType self) { + return static_cast(self.getDataType()); + }) .def_property_readonly("shape", [](tt::TileType const &tile) { return std::vector({tile.getHeight(), tile.getWidth()}); }); diff --git a/python/TTNNModule.cpp b/python/TTNNModule.cpp index 24bd05c8f9..11e47982da 100644 --- a/python/TTNNModule.cpp +++ b/python/TTNNModule.cpp @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 +#include "mlir/CAPI/AffineMap.h" #include "ttmlir/Bindings/Python/TTMLIRModule.h" namespace mlir::ttmlir::python { @@ -127,5 +128,27 @@ void populateTTNNModule(py::module &m) { }) .def_property_readonly("y", &tt::ttnn::MeshShapeAttr::getY) .def_property_readonly("x", &tt::ttnn::MeshShapeAttr::getX); + + tt_attribute_class(m, "TTNNLayoutAttr") + .def_static("get", + [](MlirContext ctx, MlirAffineMap linear, MlirAttribute grid, + MlirType memref, unsigned memLayout) { + return wrap(tt::ttnn::TTNNLayoutAttr::get( + unwrap(ctx), mlir::cast(unwrap(linear)), + mlir::cast(unwrap(grid)), + mlir::cast(unwrap(memref)), + static_cast(memLayout))); + }) + .def_property_readonly( + "linear", + [](tt::ttnn::TTNNLayoutAttr self) { return wrap(self.getLinear()); }) + .def_property_readonly("grid_attr", &tt::ttnn::TTNNLayoutAttr::getGrid) + .def_property_readonly( + "memref", + [](tt::ttnn::TTNNLayoutAttr self) { return wrap(self.getMemref()); }) + .def_property_readonly( + "memory_layout_as_int", [](tt::ttnn::TTNNLayoutAttr self) { + return static_cast(self.getMemLayout()); + }); } } // namespace mlir::ttmlir::python diff --git a/python/ttmlir/dialects/ttnn.py b/python/ttmlir/dialects/ttnn.py index d81f58111a..659938cf66 100644 --- a/python/ttmlir/dialects/ttnn.py +++ b/python/ttmlir/dialects/ttnn.py @@ -3,4 +3,5 @@ # SPDX-License-Identifier: Apache-2.0 from ._ttnn_ops_gen import * +from ._ttnn_enum_gen import * from .._mlir_libs._ttmlir import register_dialect, ttnn_ir as ir diff --git a/tools/explorer/CMakeLists.txt b/tools/explorer/CMakeLists.txt index 7ad0791b87..44613b2671 100644 --- a/tools/explorer/CMakeLists.txt +++ b/tools/explorer/CMakeLists.txt @@ -17,7 +17,7 @@ ExternalProject_Add( add_custom_target(explorer COMMENT "Building tt-explorer... ${TTMLIR_BIN_DIR}" - COMMAND pip install ${CMAKE_CURRENT_SOURCE_DIR}/tt_adapter + COMMAND pip install $<$:-e> ${CMAKE_CURRENT_SOURCE_DIR}/tt_adapter COMMAND pip install ${CMAKE_CURRENT_SOURCE_DIR}/model-explorer/src/model-explorer/src/server/package DEPENDS TTMLIRPythonModules model-explorer ttrt diff --git a/tools/explorer/tt_adapter/src/tt_adapter/main.py b/tools/explorer/tt_adapter/src/tt_adapter/main.py index 2bb3ece81a..d0c49b7af2 100644 --- a/tools/explorer/tt_adapter/src/tt_adapter/main.py +++ b/tools/explorer/tt_adapter/src/tt_adapter/main.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 from typing import Dict import model_explorer -from . import ttir, runner, utils +from . 
import runner, utils, mlir import dataclasses import enum @@ -46,7 +46,7 @@ def convert( module = utils.parse_mlir_file(model_path) # Convert TTIR to Model Explorer Graphs and Display/Return - graph = ttir.ttir_to_graph(module) + graph = mlir.build_graph(module) return {"graphs": [graph]} def execute( diff --git a/tools/explorer/tt_adapter/src/tt_adapter/mlir.py b/tools/explorer/tt_adapter/src/tt_adapter/mlir.py new file mode 100644 index 0000000000..5233e844c2 --- /dev/null +++ b/tools/explorer/tt_adapter/src/tt_adapter/mlir.py @@ -0,0 +1,571 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +# Utility library for parsing MLIR + +from collections import defaultdict +from model_explorer import graph_builder + +from ttmlir.dialects import tt, ttnn, ttir +from ttmlir import ir + + +def get_loc_str(loc): + try: + res = str(loc).split('"')[1] + except: + res = "unknown" + return res + + +class AttrHandler: + """ + A class that handles parsing and registering handlers for MLIR attribute types. + """ + + ATTR_HANDLERS = {} + + @staticmethod + def default_parser(attr): + return [graph_builder.KeyValue(key=attr.name, value=str(attr.attr))] + + @staticmethod + def parse_attr(attr): + if attr.name in AttrHandler.ATTR_HANDLERS: + return AttrHandler.ATTR_HANDLERS[attr.name](attr.attr) + else: + # Unknown Attr Type, return default parser + return AttrHandler.default_parser(attr) + + @staticmethod + def register_handler(attr_name): + """ + Decorator function to register a handler for a specific attribute name. + + Usage: + + @AttrHandler.register_handler("attr_name") + def parse_attr_name(attr: ir.Attribute) -> List[graph_builder.KeyValue]: + pass + + registers a handler for any NamedAttribute present in the MLIR module with the name "attr_name". + + The handler itself is the function that is decorated with this decorator. It must follow the function signature of + `parse_attr_name` as shown above. 
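+        A minimal concrete sketch (the attribute name "example_attr" is
+        hypothetical, chosen only for illustration):
+
+            @AttrHandler.register_handler("example_attr")
+            def parse_example_attr(attr):
+                return [graph_builder.KeyValue(key="example_attr", value=str(attr))]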
+ """ + + def decorator(handler): + AttrHandler.ATTR_HANDLERS[attr_name] = handler + return handler + + return decorator + + +@AttrHandler.register_handler("tt.device") +def parse_tt_device(attr): + device = tt.ir.DeviceAttr.maybe_downcast(attr) + result = [] + result.append( + graph_builder.KeyValue( + key="device_chip_ids", value=", ".join(map(str, device.chip_ids)) + ) + ) + result.append( + graph_builder.KeyValue( + key="device_grid_shape", value=str(device.grid_attr.shape) + ) + ) + if device.mesh_shape: + result.append( + graph_builder.KeyValue( + key="device_mesh_shape", value=str(device.mesh_shape) + ) + ) + result.append(graph_builder.KeyValue(key="device_l1_map", value=str(device.l1_map))) + result.append( + graph_builder.KeyValue(key="device_dram_map", value=str(device.dram_map)) + ) + return result + + +@AttrHandler.register_handler("tt.system_desc") +def parse_tt_system_desc(attr): + system_desc = tt.ir.SystemDescAttr.maybe_downcast(attr) + result = [] + for i, chip_desc, chip_coord, chip_capability in zip( + system_desc.chip_desc_indices, + system_desc.chip_descs, + system_desc.chip_coords, + system_desc.chip_capabilities, + ): + result.append( + graph_builder.KeyValue( + key=f"chip#{i}-arch", value=str(tt.Arch(chip_desc.arch.arch_as_int)) + ) + ) + result.append( + graph_builder.KeyValue( + key=f"chip#{i}-capability", + value=str(tt.ChipCapability(chip_capability.capability_as_int)), + ) + ) + result.append( + graph_builder.KeyValue( + key=f"chip#{i}-coord", + value="x".join( + map( + str, + (chip_coord.rack, chip_coord.shelf, chip_coord.y, chip_coord.x), + ) + ), + ) + ) + result.append( + graph_builder.KeyValue( + key=f"chip#{i}-dram-channel-size", + value=str(chip_desc.dram_channel_size), + ) + ) + result.append( + graph_builder.KeyValue( + key=f"chip#{i}-dram-unreserved-base", + value=str(chip_desc.dram_unreserved_base), + ) + ) + result.append( + graph_builder.KeyValue( + key=f"chip#{i}-dram-unreserved-end", + value=str(chip_desc.dram_unreserved_end), + ) + ) + result.append( + graph_builder.KeyValue( + key=f"chip#{i}-erisc-l1-unreserved-size", + value=str(chip_desc.erisc_l1_unreserved_base), + ) + ) + result.append( + graph_builder.KeyValue( + key=f"chip#{i}-grid", value="x".join(map(str, chip_desc.grid)) + ) + ) + result.append( + graph_builder.KeyValue( + key=f"chip#{i}-l1-size", value=str(chip_desc.l1_size) + ) + ) + result.append( + graph_builder.KeyValue( + key=f"chip#{i}-l1-unreserved-base", + value=str(chip_desc.l1_unreserved_base), + ) + ) + result.append( + graph_builder.KeyValue( + key=f"chip#{i}-noc-dram-address-align-bytes", + value=str(chip_desc.noc_dram_address_align_bytes), + ) + ) + result.append( + graph_builder.KeyValue( + key=f"chip#{i}-noc-l1-address-align-bytes", + value=str(chip_desc.noc_l1_address_align_bytes), + ) + ) + result.append( + graph_builder.KeyValue( + key=f"chip#{i}-num-cbs", value=str(chip_desc.num_cbs) + ) + ) + result.append( + graph_builder.KeyValue( + key=f"chip#{i}-num-dram-channels", + value=str(chip_desc.num_dram_channels), + ) + ) + result.append( + graph_builder.KeyValue( + key=f"chip#{i}-pcie-address-align-bytes", + value=str(chip_desc.pcie_address_align_bytes), + ) + ) + result.append( + graph_builder.KeyValue( + key=f"chip#{i}-usable-dram-channel-size", + value=str(chip_desc.usable_dram_channel_size), + ) + ) + result.append( + graph_builder.KeyValue( + key=f"chip#{i}-usable-l1-size", value=str(chip_desc.usable_l1_size) + ) + ) + result.append( + graph_builder.KeyValue( + key=f"chip#{i}-supported-data-types", + value=", 
".join( + [ + str(tt.DataType(dt.data_type_as_int)) + for dt in chip_desc.supported_data_types + ] + ), + ) + ) + result.append( + graph_builder.KeyValue( + key=f"chip#{i}-supported-tile-sizes", + value=", ".join( + [ + "x".join(map(str, (tsize.y, tsize.x))) + for tsize in chip_desc.supported_tile_sizes + ] + ), + ) + ) + result.append( + graph_builder.KeyValue( + key=f"chip#{i}-dram-core-coords", + value=", ".join( + [ + "x".join(map(str, (coord.y, coord.x))) + for coord in chip_desc.chip_physical_cores.dram + ] + ), + ) + ) + result.append( + graph_builder.KeyValue( + key=f"chip#{i}-eth-core-coords", + value=", ".join( + [ + "x".join(map(str, (coord.y, coord.x))) + for coord in chip_desc.chip_physical_cores.eth + ] + ), + ) + ) + result.append( + graph_builder.KeyValue( + key=f"chip#{i}-eth-inactive-core-coords", + value=", ".join( + [ + "x".join(map(str, (coord.y, coord.x))) + for coord in chip_desc.chip_physical_cores.eth_inactive + ] + ), + ) + ) + result.append( + graph_builder.KeyValue( + key=f"chip#{i}-worker-core-coords", + value=", ".join( + [ + "x".join(map(str, (coord.y, coord.x))) + for coord in chip_desc.chip_physical_cores.worker + ] + ), + ) + ) + return result + + +@AttrHandler.register_handler("mesh_shape") +def parse_mesh_shape(attr): + mesh_shape = ttnn.ir.MeshShapeAttr.maybe_downcast(attr) + return [ + graph_builder.KeyValue( + key="mesh_shape", value="x".join(map(str, (mesh_shape.y, mesh_shape.x))) + ) + ] + + +@AttrHandler.register_handler("layout") +def parse_layout(attr): + # This is for parsing TTNN Layouts (Enum) + layout = ttnn.ir.LayoutAttr.maybe_downcast(attr) + return [graph_builder.KeyValue(key="layout", value=str(ttnn.Layout(layout.value)))] + + +@AttrHandler.register_handler("memory_config") +def parse_memory_config(attr): + memory_config = ttnn.ir.MemoryConfigAttr.maybe_downcast(attr) + result = [] + result.append( + graph_builder.KeyValue( + key="buffer-type", + value=str(ttnn.BufferType(memory_config.buffer_type.value)), + ) + ) + result.append( + graph_builder.KeyValue( + key="shard-shape", + value="x".join(map(str, memory_config.shard_spec.shard_shape.shape)), + ) + ) + result.append( + graph_builder.KeyValue( + key="tensor-memory-layout", + value=str( + ttnn.TensorMemoryLayout(memory_config.tensor_memory_layout.value) + ), + ) + ) + return result + + +@AttrHandler.register_handler("force") +def parse_force(attr): + return [graph_builder.KeyValue(key="force", value=str(attr.value))] + + +@AttrHandler.register_handler("dtype") +def parse_dtype(attr): + dtype = tt.ir.DataTypeAttr.maybe_downcast(attr) + return [ + graph_builder.KeyValue( + key="dtype", value=str(tt.DataType(dtype.data_type_as_int)) + ) + ] + + +@AttrHandler.register_handler("shape") +def parse_shape(attr): + shape = ttnn.ir.ShapeAttr.maybe_downcast(attr) + if not shape: + return [graph_builder.KeyValue(key="shape", value=str(attr))] + return [graph_builder.KeyValue(key="shape", value="x".join(map(str, shape.shape)))] + + +@AttrHandler.register_handler("operandSegmentSizes") +def parse_operandSegmentSizes(attr): + return [graph_builder.KeyValue(key="operandSegmentSizes", value=str(list(attr)))] + + +@AttrHandler.register_handler("dimension") +def parse_dimension(attr): + return [graph_builder.KeyValue(key="dimension", value=str(attr.value))] + + +@AttrHandler.register_handler("tt.layout") +def parse_tt_layout(attr): + layout = tt.ir.LayoutAttr.maybe_downcast(attr) + result = [] + result.append(graph_builder.KeyValue(key="linear", value=str(layout.linear))) + result.append( + 
graph_builder.KeyValue( + key="memory_space", value=str(tt.MemorySpace(layout.memory_space_as_int)) + ) + ) + result.append( + graph_builder.KeyValue( + key="memory_layout", + value=str(tt.TensorMemoryLayout(layout.memory_layout_as_int)), + ) + ) + result.append( + graph_builder.KeyValue( + key="grid_shape", value="x".join(map(str, layout.grid_attr.shape)) + ) + ) + result.append( + graph_builder.KeyValue(key="memref_shape", value=str(layout.memref.shape)) + ) + result.append( + graph_builder.KeyValue(key="memref_rank", value=str(layout.memref.rank)) + ) + tile_type = tt.ir.TileType.maybe_downcast(layout.memref.element_type) + if tile_type is not None: + result.append( + graph_builder.KeyValue( + key="tile_datatype", value=str(tt.DataType(tile_type.data_type_as_int)) + ) + ) + result.append( + graph_builder.KeyValue( + key="tile_shape", value="x".join(map(str, tile_type.shape)) + ) + ) + return result + + +@AttrHandler.register_handler("ttnn_layout") +def parse_ttnn_ttnn_layout(attr): + layout = ttnn.ir.TTNNLayoutAttr.maybe_downcast(attr) + result = [] + result.append(graph_builder.KeyValue(key="linear", value=str(layout.linear))) + result.append( + graph_builder.KeyValue( + key="memory_layout", + value=str(ttnn.TensorMemoryLayout(layout.memory_layout_as_int)), + ) + ) + result.append( + graph_builder.KeyValue( + key="grid_shape", value="x".join(map(str, layout.grid_attr.shape)) + ) + ) + result.append( + graph_builder.KeyValue(key="memref_shape", value=str(layout.memref.shape)) + ) + result.append( + graph_builder.KeyValue(key="memref_rank", value=str(layout.memref.rank)) + ) + buffer_attr = ttnn.ir.BufferTypeAttr.maybe_downcast(layout.memref.memory_space) + result.append( + graph_builder.KeyValue( + key="memref_memory_space", value=str(ttnn.BufferType(buffer_attr.value)) + ) + ) + return result + + +class OpHandler: + def __init__(self, op): + self.op = op + + def get_id(self, names: defaultdict): + name = get_loc_str(self.op.location) + name_num = names[name] + id = name + "__" + str(name_num) + names[name] += 1 + return id + + def get_namespace(self, parent_op=None): + op = self.op if not parent_op else parent_op + name = get_loc_str(op.location) + if op.parent and op.parent.name != "builtin.module": + return self.get_namespace(op.parent) + "/" + name + return name + + def get_attributes(self): + # Parse Op Attributes themselves + result = [] + for attr in self.op.attributes: + result.extend(AttrHandler.parse_attr(attr)) + return result + + def make_graph_node(self, name_dict): + return graph_builder.GraphNode( + id=self.get_id(name_dict), + label=self.op.name, + namespace=self.get_namespace(), + attrs=self.get_attributes(), + ) + + def make_constant_node(self, name_dict, constant_name): + return graph_builder.GraphNode( + id=self.get_id(name_dict), + label=constant_name, + namespace=self.get_namespace(), + ) + + +EMPTY_OPS = [ + "ttnn.empty", + "tensor.empty", +] + +FILTERED_OPS = [ + "ttnn.deallocate", + "ttnn.get_device", +] + + +def build_graph(module): + name_dict = defaultdict(int) + output_connections = defaultdict(int) + graph = graph_builder.Graph(id="tt-graph") + + op_to_graph_node = {} + + module_op = OpHandler(module.operation) + graph.nodes.append(module_op.make_graph_node(name_dict)) + + for op in module.body.operations: + append_later = [] + for region in op.regions: + for block in region.blocks: + for op in block.operations: + # Create all the nodes and constants in the first pass. 
+ operation = OpHandler(op) + graph_node = operation.make_graph_node(name_dict) + + if op.name in EMPTY_OPS: + append_later.append(graph_node) + elif op.name not in FILTERED_OPS: + graph.nodes.append(graph_node) + + op_to_graph_node[op] = graph_node + + for operand in op.operands: + if isinstance(operand, ir.Value): + # This is a constant and we need to create a node for it. + operand_node = operation.make_constant_node( + name_dict, operand.get_name() + ) + graph.nodes.append(operand_node) + op_to_graph_node[operand] = operand_node + + # This puts the node at the far right when viewing which is a bit more consistant with it being the last operand. + for node in append_later: + graph.nodes.append(node) + + for op in block.operations: + # Create all the edges in the second pass. + for operand_index, operand in enumerate(op.operands): + if operand.owner == block: + source_node = op_to_graph_node[operand] + else: + source_node = op_to_graph_node[operand.owner] + + target_node = op_to_graph_node[op] + + target_node.incomingEdges.append( + graph_builder.IncomingEdge( + sourceNodeId=source_node.id, + sourceNodeOutputId=output_connections[source_node.id], + targetNodeInputId=operand_index, + ) + ) + + output_attrs = [] + if isinstance(operand.type, ir.RankedTensorType): + output_attrs = [ + graph_builder.KeyValue( + key="shape", value=str(operand.type.shape) + ), + graph_builder.KeyValue( + key="dtype", value=str(operand.type.element_type) + ), + graph_builder.KeyValue( + key="rank", value=str(operand.type.rank) + ), + ] + if hasattr(operand.type, "encoding") and operand.type.encoding: + if "ttnn_layout" in str(operand.type.encoding): + output_attrs.extend( + AttrHandler.parse_attr( + operand.type.encoding.get_named("ttnn_layout") + ) + ) + else: + # Parse as a standard layout + output_attrs.extend( + AttrHandler.parse_attr( + operand.type.encoding.get_named("tt.layout") + ) + ) + source_node.outputsMetadata.append( + graph_builder.MetadataItem( + id=str(output_connections[source_node.id]), + attrs=[ + graph_builder.KeyValue( + key="__tensor_tag", value=target_node.label + ), + ] + + output_attrs, + ) + ) + output_connections[source_node.id] += 1 + + return graph diff --git a/tools/explorer/tt_adapter/src/tt_adapter/ttir.py b/tools/explorer/tt_adapter/src/tt_adapter/ttir.py deleted file mode 100644 index 76cd470b0f..0000000000 --- a/tools/explorer/tt_adapter/src/tt_adapter/ttir.py +++ /dev/null @@ -1,149 +0,0 @@ -# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC -# -# SPDX-License-Identifier: Apache-2.0 -# Library to manipulate TTIR Modules - -from model_explorer import graph_builder -from ttmlir.dialects import tt, ttir, ttkernel -from collections import defaultdict - - -def get_loc_str(loc): - # TODO(odjuricic) Need to expose this in python bindings, if possible. 
- try: - res = str(loc).split('"')[1] - except: - res = "unknown" - return res - - -def create_id(op, name_dict): - name = get_loc_str(op.location) - name_num = name_dict[name] - id = name + "__" + str(name_num) - name_dict[name] += 1 - return id - - -def get_attrs(op): - result = [] - for attr in op.attributes: - result.append(graph_builder.KeyValue(key=attr.name, value=str(attr.attr))) - return result - - -def create_namespace(op): - name = get_loc_str(op.location) - if op.parent and op.parent.name != "builtin.module": - return create_namespace(op.parent) + "/" + name - return name - - -def get_layout_attrs(tensor): - attrs = [ - graph_builder.KeyValue(key="shape", value=str(tensor.type.shape)), - graph_builder.KeyValue( - key="element_type", - value=str(tensor.type.element_type), - ), - graph_builder.KeyValue(key="rank", value=str(tensor.type.rank)), - ] - - if hasattr(tensor.type, "encoding") and tensor.type.encoding: - layout = tt.ir.LayoutAttr.getLayout(tensor.type) - attrs.extend( - [ - graph_builder.KeyValue( - key="Memory Space", - value=str(tt.MemorySpace(layout.memory_space_as_int)), - ), - graph_builder.KeyValue( - key="Memory Layout", - value=str(tt.TensorMemoryLayout(layout.memory_layout_as_int)), - ), - graph_builder.KeyValue( - key="Grid Shape", - value=str(list(layout.grid_attr.shape)), - ), - ] - ) - - return attrs - - -def ttir_to_graph(module): - # Can assume that to-layout pass has already been run on the module. - name_dict = defaultdict(int) - output_connections = defaultdict(int) - graph = graph_builder.Graph(id="ttir-graph") - - op_to_graph_node = dict() - - for op in module.body.operations: - append_later = [] - for region in op.regions: - for block in region.blocks: - for op in block.operations: - # Create all the nodes and constants in the first pass. - graph_node = graph_builder.GraphNode( - id=create_id(op, name_dict), - label=op.name, - namespace=create_namespace(op), - attrs=get_attrs(op), - ) - - if op.name == "tensor.empty": - append_later.append(graph_node) - else: - graph.nodes.append(graph_node) - - op_to_graph_node[op] = graph_node - - for operand in op.operands: - if operand.owner == block and operand not in op_to_graph_node: - # This is a constant and we need to create a node for it. - operand_node = graph_builder.GraphNode( - id=create_id(op, name_dict), - label=operand.get_name(), - namespace=create_namespace(op), - ) - graph.nodes.append(operand_node) - op_to_graph_node[operand] = operand_node - - # This puts the node at the far right when viewing which is a bit more consistant with it being the last operand. - for node in append_later: - graph.nodes.append(node) - - for op in block.operations: - # Create all the edges in the second pass. 
- for operand_index, operand in enumerate(op.operands): - if operand.owner == block: - source_node = op_to_graph_node[operand] - else: - source_node = op_to_graph_node[operand.owner] - - target_node = op_to_graph_node[op] - - target_node.incomingEdges.append( - graph_builder.IncomingEdge( - sourceNodeId=source_node.id, - sourceNodeOutputId=output_connections[source_node.id], - targetNodeInputId=operand_index, - ) - ) - - output_attrs = get_layout_attrs(operand) - source_node.outputsMetadata.append( - graph_builder.MetadataItem( - id=str(output_connections[source_node.id]), - attrs=[ - graph_builder.KeyValue( - key="__tensor_tag", value=target_node.label - ), - ] - + output_attrs, - ) - ) - output_connections[source_node.id] += 1 - - return graph diff --git a/tools/explorer/tt_adapter/src/tt_adapter/utils.py b/tools/explorer/tt_adapter/src/tt_adapter/utils.py index fe68d89ac5..bca7e640b4 100644 --- a/tools/explorer/tt_adapter/src/tt_adapter/utils.py +++ b/tools/explorer/tt_adapter/src/tt_adapter/utils.py @@ -6,8 +6,8 @@ def parse_mlir_file(model_path): with ttmlir.ir.Context() as ctx, open(model_path, "r") as model_file: - ttmlir.dialects.ttkernel.register_dialect(ctx) ttmlir.dialects.ttir.register_dialect(ctx) ttmlir.dialects.tt.register_dialect(ctx) - module = ttmlir.ir.Module.parse("".join(model_file.readlines()), ctx) + ttmlir.dialects.ttnn.register_dialect(ctx) + module = ttmlir.ir.Module.parse(model_file.read(), ctx) return module From f4cfebae3b6b81f4802f4d225b65538130fea68b Mon Sep 17 00:00:00 2001 From: Jacob DeSousa Date: Mon, 25 Nov 2024 12:45:35 -0500 Subject: [PATCH 18/84] Add NoC Mcast Ops in D2M (#1100) --- .../ttmlir/Dialect/TTKernel/IR/TTKernelOps.td | 62 +++++++++++++++++++ .../TTKernelToEmitC/TTKernelToEmitC.cpp | 6 ++ 2 files changed, 68 insertions(+) diff --git a/include/ttmlir/Dialect/TTKernel/IR/TTKernelOps.td b/include/ttmlir/Dialect/TTKernel/IR/TTKernelOps.td index 4b6da4b683..c0f6d43619 100644 --- a/include/ttmlir/Dialect/TTKernel/IR/TTKernelOps.td +++ b/include/ttmlir/Dialect/TTKernel/IR/TTKernelOps.td @@ -503,6 +503,68 @@ def TTKernel_NocAsyncWriteBarrierOp : TTKernel_Op<"noc_async_write_barrier"> { }]; } +//===----------------------------------------------------------------------===// +// TTKernel Multicast NoC operations +//===----------------------------------------------------------------------===// + +def TTKernel_GetNocMulticastAddrOp : TTKernel_Op<"get_noc_multicast_addr"> { + let summary = "GetNocMulticastAddr"; + let description = [{ + GetNocMulticastAddr + }]; + + let arguments = (ins I32:$noc_x_start, I32:$noc_y_start, I32:$noc_x_end, I32:$noc_y_end, I32:$addr, Optional:$noc); + let results = (outs TTKernel_NocAddr:$mcastNocAddr); +} + +def TTKernel_NocAsyncWriteMulticastOnePacketOp : TTKernel_Op<"noc_async_write_multicast_one_packet"> { + let summary = "NocAsyncWriteMulticastOnePacket"; + let description = [{ + NocAsyncWriteMulticastOnePacket + this issues only a single packet with size <= NOC_MAX_BURST_SIZE (ie maximum packet size) + }]; + + let arguments = (ins I32:$srcLocalL1Addr, TTKernel_NocAddr:$dstNocAddrMulticast, I32:$size, I32:$num_dests, OptionalAttr:$linked, OptionalAttr:$multicast_path_reserve, Optional:$noc); +} + +def TTKernel_NocAsyncWriteMulticastOp : TTKernel_Op<"noc_async_write_multicast"> { + let summary = "NocAsyncWriteMulticast"; + let description = [{ + Initiates an asynchronous write from a source address in L1 memory on the + Tensix core executing this function call to a rectangular destination grid. 
+ The destinations are specified using a uint64_t encoding referencing an + on-chip grid of nodes located at NOC coordinate range + (x_start,y_start,x_end,y_end) and a local address created using + *get_noc_multicast_addr* function. Also, *see noc_async_write_barrier*. + + The destination nodes can only be a set of Tensix cores + L1 memory address. + The destination nodes must form a rectangular grid. The destination L1 + memory address must be the same on all destination nodes. + + With this API, the multicast sender cannot be part of the multicast + destinations. If the multicast sender has to be in the multicast + destinations (i.e. must perform a local L1 write), the other API variant + *noc_async_write_multicast_loopback_src* can be used. + + Note: The number of destinations needs to be non-zero. Besides that, + there is no restriction on the number of destinations, i.e. the + multicast destinations can span the full chip. However, as mentioned + previously, the multicast source cannot be part of the destinations. So, the + maximum number of destinations is 119. + }]; + + let arguments = (ins I32:$srcLocalL1Addr, TTKernel_NocAddr:$dstNocAddrMulticast, I32:$size, I32:$num_dests, OptionalAttr:$linked, OptionalAttr:$multicast_path_reserve, Optional:$noc); +} + +def TTKernel_NocAsyncWriteMulticastLoopbackSrcOp : TTKernel_Op<"noc_async_write_multicast_loopback_src"> { + let summary = "NocAsyncWriteMulticastLoopbackSrc"; + let description = [{ + NocAsyncWriteMulticastLoopbackSrc + }]; + + let arguments = (ins I32:$srcLocalL1Addr, TTKernel_NocAddr:$dstNocAddrMulticast, I32:$size, I32:$num_dests, OptionalAttr:$linked, OptionalAttr:$multicast_path_reserve, Optional:$noc); +} + //===----------------------------------------------------------------------===// // TTKernel Misc operations //===----------------------------------------------------------------------===// diff --git a/lib/Conversion/TTKernelToEmitC/TTKernelToEmitC.cpp b/lib/Conversion/TTKernelToEmitC/TTKernelToEmitC.cpp index 312377eb6e..b907ad7f36 100644 --- a/lib/Conversion/TTKernelToEmitC/TTKernelToEmitC.cpp +++ b/lib/Conversion/TTKernelToEmitC/TTKernelToEmitC.cpp @@ -419,6 +419,12 @@ class ConvertTTKernelToEmitCPass TTMetalToEmitCOpaqueRewriter, TTMetalToEmitCOpaqueRewriter, TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter< + ttkernel::NocAsyncWriteMulticastOnePacketOp>, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter< + ttkernel::NocAsyncWriteMulticastLoopbackSrcOp>, TTMetalToEmitCOpaqueRewriter, TTMetalToEmitCOpaqueRewriter, TTMetalToEmitCOpaqueRewriter, From fff85dd99fe7c9f4166c15967e175c97d423f16b Mon Sep 17 00:00:00 2001 From: Kyle Mabee <118925087+kmabeeTT@users.noreply.github.com> Date: Mon, 25 Nov 2024 17:40:42 -0500 Subject: [PATCH 19/84] Uplift tt-metal Nov25 69870bd (#1406) --- third_party/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt index 682f559024..c9ff431bf1 100644 --- a/third_party/CMakeLists.txt +++ b/third_party/CMakeLists.txt @@ -1,6 +1,6 @@ include(ExternalProject) -set(TT_METAL_VERSION "b057e090e19c2f18e209817b8de538209765db6d") +set(TT_METAL_VERSION "69870bdeaf1c9270e325810249def6a3e9f38fb4") if ("$ENV{ARCH_NAME}" STREQUAL "grayskull") set(ARCH_NAME "grayskull") From 3c35a605e476d5b92fb86b5e52f7abeadfede634 Mon Sep 17 00:00:00 2001 From: Vladimir Milosevic <157983820+vmilosevic@users.noreply.github.com> Date: Tue, 26 Nov 2024 10:57:30 +0100 Subject: [PATCH 
20/84] Fix uplift PR (#1400)

---
 .github/workflows/nightly-uplift.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/nightly-uplift.yml b/.github/workflows/nightly-uplift.yml
index 332044a027..b8dbf3d05c 100644
--- a/.github/workflows/nightly-uplift.yml
+++ b/.github/workflows/nightly-uplift.yml
@@ -30,12 +30,12 @@ jobs:
         env:
           GH_TOKEN: ${{ github.token }}
         run: |
-          LATEST_TT_METAL_VERSION=$(gh api repos/tenstorrent/tt-mlir/commits/main --jq '.sha')
+          LATEST_TT_METAL_VERSION=$(gh api repos/tenstorrent/tt-metal/commits/main --jq '.sha')
           echo "LATEST_TT_METAL_VERSION=$LATEST_TT_METAL_VERSION" >> $GITHUB_ENV
 
       - name: Update tt-metal reference in third_party/CMakeLists.txt
         run: |
-          echo "Updating tt-mlir to SHA: ${{ env.LATEST_TT_METAL_VERSION }}"
+          echo "Updating tt-metal to SHA: ${{ env.LATEST_TT_METAL_VERSION }}"
           sed -i "s/set(TT_METAL_VERSION \".*\")/set(TT_METAL_VERSION \"${{ env.LATEST_TT_METAL_VERSION }}\")/" third_party/CMakeLists.txt
 
       - name: Create Pull Request

From ebde568747296e267008e5b9676576a66ce8bebe Mon Sep 17 00:00:00 2001
From: Vladimir Milosevic <157983820+vmilosevic@users.noreply.github.com>
Date: Tue, 26 Nov 2024 12:01:40 +0100
Subject: [PATCH 21/84] Collect test reports (#1368)

Initial commit for collecting workflow data. SFTP upload is commented out
until we can confirm that the generated files are in the correct format.
---
 .github/workflows/produce_data.yml | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 .github/workflows/produce_data.yml

diff --git a/.github/workflows/produce_data.yml b/.github/workflows/produce_data.yml
new file mode 100644
index 0000000000..e53ccc0f60
--- /dev/null
+++ b/.github/workflows/produce_data.yml
@@ -0,0 +1,28 @@
+name: "[internal] Collect workflow data"
+
+on:
+  workflow_run:
+    workflows: # List workflow that we want to collect data for
+      - "On PR"
+      - "On push"
+      - "Build on macos-latest"
+      - "Build and Test"
+    types:
+      - completed
+
+jobs:
+  produce-cicd-data:
+    runs-on: ubuntu-latest
+    env:
+      GH_TOKEN: ${{ github.token }}
+    steps:
+    - name: Collect CI/CD data
+      uses: tenstorrent/tt-github-actions/.github/actions/collect_data@main
+      if: ${{ github.event_name == 'workflow_run' }}
+      with:
+        repository: ${{ github.repository }}
+        run_id: ${{ github.event.workflow_run.id }}
+        run_attempt: ${{ github.event.workflow_run.run_attempt }}
+        sftp_host: ${{ secrets.SFTP_CICD_WRITER_HOSTNAME }}
+        sftp_user: ${{ secrets.SFTP_CICD_WRITER_USERNAME }}
+        ssh-private-key: ${{ secrets.SFTP_CICD_WRITER_KEY }}

From 3d029b66d82206bd5f59c908c169a7ca85611fa8 Mon Sep 17 00:00:00 2001
From: Wooseok Lee
Date: Tue, 26 Nov 2024 07:58:21 -0600
Subject: [PATCH 22/84] Enable conversion of all_reduce and GSPMD custom_op into TTIR dialect (#1351)

1. TT_Reduce_Type is created to share the computation type with the TTNN dialect
2. AllReduceOp in TTIR is introduced to accommodate the StableHLO all_reduce op
3. MeshShardOp in TTIR is introduced to capture GSPMD custom sharding
4. Realistic test cases are added from JAX/PJRT output

The current version of the import targets GSPMD input, but our future plans
mainly focus on supporting Shardy-based JAX/PJRT output.
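For illustration, on a 1x2 mesh the first test case in the new ccl_ops.mlir
exercises exactly this pattern; the sketch below summarizes the intended
mapping. The stablehlo lines are taken from that test, while the TTIR side is
paraphrased in comments, so the attribute spellings there are indicative
rather than exact:

    %0 = stablehlo.custom_call @Sharding(%arg0) {backend_config = "", mhlo.sharding = "{devices=[1,2]<=[2]}"} : (tensor<8192x784xf32>) -> tensor<8192x784xf32>
    %1 = stablehlo.custom_call @SPMDFullToShardShape(%0) {backend_config = "", mhlo.sharding = "{manual}"} : (tensor<8192x784xf32>) -> tensor<8192x392xf32>
    // The @Sharding + @SPMDFullToShardShape pair above lowers to a single
    // "ttir.mesh_shard" op with shard_direction = #tt.shard_direction<full_to_shard>,
    // shard_type = #tt.shard_type<devices> and shard_shape = #tt.grid<1x2>;
    // the matching @SPMDShardToFullShape call on the result side uses
    // shard_direction = #tt.shard_direction<shard_to_full> instead.
    // A stablehlo.all_reduce whose region is a single stablehlo.add lowers to
    // "ttir.all_reduce" with reduce_type = #tt.reduce_type<sum>, with dim taken
    // from the first non-unit dimension of replica_groups.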
--- include/ttmlir/Dialect/TT/IR/TTOpsEnums.td | 50 ++ include/ttmlir/Dialect/TT/IR/TTOpsTypes.td | 14 + include/ttmlir/Dialect/TTIR/IR/TTIROps.td | 119 ++++- .../StableHLOToTTIRPatterns.cpp | 443 ++++++++++++++++++ lib/Dialect/TTIR/IR/TTIROps.cpp | 34 ++ .../Conversion/StableHLOToTTIR/ccl_ops.mlir | 83 ++++ 6 files changed, 722 insertions(+), 21 deletions(-) create mode 100644 test/ttmlir/Conversion/StableHLOToTTIR/ccl_ops.mlir diff --git a/include/ttmlir/Dialect/TT/IR/TTOpsEnums.td b/include/ttmlir/Dialect/TT/IR/TTOpsEnums.td index b82c71c3f6..aee19f63c6 100644 --- a/include/ttmlir/Dialect/TT/IR/TTOpsEnums.td +++ b/include/ttmlir/Dialect/TT/IR/TTOpsEnums.td @@ -137,6 +137,7 @@ def TT_OperandConstraintSingleBank : I32BitEnumAttrCaseBit<"SingleBank", 7, "sin def TT_OperandConstraintHeightSharded : I32BitEnumAttrCaseBit<"HeightSharded", 8, "height_sharded">; def TT_OperandConstraintWidthSharded : I32BitEnumAttrCaseBit<"WidthSharded", 9, "width_sharded">; def TT_OperandConstraintBlockSharded : I32BitEnumAttrCaseBit<"BlockSharded", 10, "block_sharded">; +def TT_OperandConstraintSystemScalar : I32BitEnumAttrCaseGroup<"SystemScalar", [TT_OperandConstraintSystem, TT_OperandConstraintScalar], "system_scalar">; def TT_OperandConstraintAnyLayout : I32BitEnumAttrCaseGroup<"AnyLayout", [TT_OperandConstraintNone, TT_OperandConstraintInterleaved, TT_OperandConstraintSingleBank, TT_OperandConstraintHeightSharded, TT_OperandConstraintWidthSharded, TT_OperandConstraintBlockSharded], "any_layout">; def TT_OperandConstraintAny : I32BitEnumAttrCaseGroup<"Any", [TT_OperandConstraintSystem, TT_OperandConstraintDRAM, TT_OperandConstraintL1, TT_OperandConstraintScalar, TT_OperandConstraintTile, TT_OperandConstraintAnyLayout], "any">; def TT_OperandConstraintAnyDevice : I32BitEnumAttrCaseGroup<"AnyDevice", [TT_OperandConstraintDRAM, TT_OperandConstraintL1, TT_OperandConstraintScalar, TT_OperandConstraintTile, TT_OperandConstraintAnyLayout], "any_device">; @@ -155,6 +156,7 @@ def TT_OperandConstraint : I32BitEnumAttr<"OperandConstraint", "TT Operand Const TT_OperandConstraintHeightSharded, TT_OperandConstraintWidthSharded, TT_OperandConstraintBlockSharded, + TT_OperandConstraintSystemScalar, TT_OperandConstraintAnyLayout, TT_OperandConstraintAny, TT_OperandConstraintAnyDevice, @@ -189,6 +191,54 @@ def TT_BufferAccess : I32BitEnumAttr<"BufferAccess", "TT Buffer Access", let cppNamespace = "::mlir::tt"; } +def TT_ReduceType_Sum : I32EnumAttrCase<"Sum", 0, "sum">; +def TT_ReduceType_Mean : I32EnumAttrCase<"Mean", 1, "mean">; +def TT_ReduceType_Max : I32EnumAttrCase<"Max", 2, "max">; +def TT_ReduceType_Min : I32EnumAttrCase<"Min", 3, "min">; +def TT_ReduceType_Std : I32EnumAttrCase<"Std", 4, "std">; +def TT_ReduceType_Var : I32EnumAttrCase<"Var", 5, "var">; + +def TT_ReduceType: I32EnumAttr<"ReduceType", "TT Reduce Type", + [ + TT_ReduceType_Sum, + TT_ReduceType_Mean, + TT_ReduceType_Max, + TT_ReduceType_Min, + TT_ReduceType_Std, + TT_ReduceType_Var, + ]> { + let genSpecializedAttr = 0; + let cppNamespace = "::mlir::tt"; +} + +def TT_MeshShardDirection_FullToShard : I32EnumAttrCase<"FullToShard", 0, "full_to_shard">; +def TT_MeshShardDirection_ShardToFull : I32EnumAttrCase<"ShardToFull", 1, "shard_to_full">; + +def TT_MeshShardDirection: I32EnumAttr<"MeshShardDirection", "TT MeshShardDirection", + [ + TT_MeshShardDirection_FullToShard, + TT_MeshShardDirection_ShardToFull, + ]> { + let genSpecializedAttr = 0; + let cppNamespace = "::mlir::tt"; +} + +def TT_MeshShardType_Manual : I32EnumAttrCase<"Manual", 0, "manual">; 
+def TT_MeshShardType_Replicate : I32EnumAttrCase<"Replicate", 1, "replicate">; +def TT_MeshShardType_Maximal : I32EnumAttrCase<"Maximal", 2, "maximal">; +def TT_MeshShardType_Devices : I32EnumAttrCase<"Devices", 3, "devices">; + +def TT_MeshShardType: I32EnumAttr<"MeshShardType", "TT MeshShardType", + [ + TT_MeshShardType_Manual, + TT_MeshShardType_Replicate, + TT_MeshShardType_Maximal, + TT_MeshShardType_Devices, + ]> { + let genSpecializedAttr = 0; + let cppNamespace = "::mlir::tt"; +} + def TT_CPURoleHost : I32EnumAttrCase<"Host", 0, "host">; def TT_CPURoleDevice : I32EnumAttrCase<"Device", 1, "device">; diff --git a/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td b/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td index d9ff13164e..99caac0c2a 100644 --- a/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td +++ b/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td @@ -443,6 +443,20 @@ def TT_ArgumentAllocationAttr : TT_Attr<"ArgumentAllocation", "arg_alloc", []> { let assemblyFormat = "`<` $address `,` $size `,` $memorySpace `>`"; } +def TT_ReduceTypeAttr : EnumAttr { + let assemblyFormat = "`<` $value `>`"; +} + +def TT_ReduceTypeArrayAttr : TypedArrayAttrBase; + +def TT_MeshShardDirectionAttr : EnumAttr { + let assemblyFormat = "`<` $value `>`"; +} + +def TT_MeshShardTypeAttr : EnumAttr { + let assemblyFormat = "`<` $value `>`"; +} + //===----------------------------------------------------------------------===// // TT type definitions //===----------------------------------------------------------------------===// diff --git a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td index aeb2de1aed..b3701b22e8 100644 --- a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td +++ b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td @@ -719,27 +719,6 @@ def TTIR_BroadcastOp : TTIR_DPSOp<"broadcast"> { }]; } -// CCL ops -def TTIR_AllGatherOp : TTIR_DPSOp<"all_gather"> { - let summary = "All gather operation."; - let description = [{ - All gather op. - }]; - - let arguments = (ins AnyRankedTensor:$input, - AnyRankedTensor:$output, - SI32Attr:$dim, - TT_OperandConstraintArrayAttr:$operand_constraints); - - let results = (outs AnyRankedTensor:$result); - - let extraClassDeclaration = [{ - MutableOperandRange getDpsInitsMutable() { return getOutputMutable(); } - }]; - - let hasVerifier = 1; -} - def TTIR_Conv2dOp : TTIR_DPSOp<"conv2d"> { let summary = "Conv2d operation."; let description = [{ @@ -1317,4 +1296,102 @@ def TTIR_YieldOp : TTIR_Op<"yield", [Pure, ReturnLike, Terminator]> { let arguments = (ins Variadic:$values); } +//===----------------------------------------------------------------------===// +// TTIR ccl ops +//===----------------------------------------------------------------------===// + +def TTIR_AllGatherOp : TTIR_DPSOp<"all_gather"> { + let summary = "All gather operation."; + let description = [{ + All gather op. + }]; + + let arguments = (ins AnyRankedTensor:$input, + AnyRankedTensor:$output, + SI32Attr:$dim, + TT_OperandConstraintArrayAttr:$operand_constraints); + + let results = (outs AnyRankedTensor:$result); + + let extraClassDeclaration = [{ + MutableOperandRange getDpsInitsMutable() { return getOutputMutable(); } + }]; + + let hasVerifier = 1; +} + +def TTIR_AllReduceOp : TTIR_DPSOp<"all_reduce"> { + let summary = "AllReduce operation."; + let description = [{ + AllReduce op. 
+ }]; + + let arguments = (ins + Variadic:$inputs, + AnyRankedTensor:$output, + I64ElementsAttr:$replica_groups, + SI32Attr:$dim, + OptionalAttr:$channel_handle, + UnitAttr:$use_global_device_ids, + TT_ReduceTypeAttr:$reduce_type, + TT_OperandConstraintArrayAttr:$operand_constraints + ); + + let results = (outs Variadic:$results); + + let extraClassDeclaration = [{ + MutableOperandRange getDpsInitsMutable() { return getOutputMutable(); } + }]; + + let hasVerifier = 1; +} + +def TTIR_MeshShardOp : TTIR_DPSOp<"mesh_shard"> { + let summary = "Mesh shard operation."; + let description = [{ + MeshShard op shards the inputs (FullToShard) or concatnates the outputs (ShardToFull) for ccl ops. + + shard_direction attribute determines whether to shard or concat. + + shard_type attribute determines how to shard or concat. + manual: no sharding + replicate: all devices have identical data + maximal: only one device contains full data + devices: shard_shape determines sharded dimensions + + For example, on 2x4 mesh hardware, following op shards arg0 to 8 slices, row divided by 2 + and col divided by 4. + + %1 = "ttir.mesh_shard"(%arg0, %0) < + {... shard_direction = #tt.shard_direction, + shard_shape = #tt.grid<2x4>, + shard_type = #tt.shard_type}> : (tensor<8192x784xf32>, ...) -> tensor<4096x196xf32> + + On the other hand, this op concatnates %4 to single tensor by concatnating + one of the top row tensor with one of the bottom row tensor. + + %6 = "ttir.mesh_shard"(%4, %5) < + {..., shard_direction = #tt.shard_direction, + shard_shape = #tt.grid<2x1>, + shard_type = #tt.shard_type}> : (tensor<4096x16384xf32>, ...) -> tensor<8192x16384xf32> + }]; + + let arguments = (ins + AnyRankedTensor:$input, + AnyRankedTensor:$output, + TT_MeshShardTypeAttr:$shard_type, + TT_MeshShardDirectionAttr:$shard_direction, + TT_GridAttr:$shard_shape, + TT_OperandConstraintArrayAttr:$operand_constraints + ); + + let results = (outs AnyRankedTensor:$result); + + let extraClassDeclaration = [{ + MutableOperandRange getDpsInitsMutable() { return getOutputMutable(); } + }]; + + let hasVerifier = 1; +} + #endif diff --git a/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp b/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp index 8db1b44e69..ccf21ff275 100644 --- a/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp +++ b/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp @@ -9,6 +9,7 @@ #include "mlir/Dialect/Traits.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinAttributeInterfaces.h" +#include "mlir/IR/Region.h" #include "mlir/IR/Value.h" #include "mlir/Support/LLVM.h" @@ -1057,6 +1058,440 @@ class StableHLOToTTIROpLogicalOpConversionPattern } }; +template +LogicalResult getReduceType(SrcOpTy &srcOp, ReduceType &reduceType) { + if constexpr (!std::is_same::value) { + return failure(); + } + // Check operations in the first block and determine reduce type for now + // TODO(wooseoklee): This pattern matching mechanism may need to be updated as + // we see complicated patterns of reduce block in the future. 
+ auto &block = srcOp.getRegion().front(); + for (Operation &op : block) { + if (isa(op)) { + reduceType = ReduceType::Sum; + return success(); + } + if (isa(op)) { + reduceType = ReduceType::Max; + return success(); + } + if (isa(op)) { + reduceType = ReduceType::Min; + return success(); + } + } + // Other reduce types are currently not supported + return failure(); +} + +// StalbeHLO spec.md defines following channel type for ccl ops +enum StableHLOChannelType { + // CHANNEL_TYPE_INVALID = 0 : Invalid primitive type to serve as + // default. + kChannelTypeInvalid = 0, + // DEVICE_TO_DEVICE = 1 : A channel for sending data between + // devices. + kChannelTypeDeviceToDevice = 1, + // DEVICE_TO_HOST = 2 : A channel for sending data from the + // device to the host. Can only be used with a Send operation. + kChannelTypeDeviceToHost = 2, + // HOST_TO_DEVICE = 3 : A channel for sending data from the host to + // the device. Can only be used with a Recv operation. + kChannelTypeHostToDevice = 3, +}; + +class StableHLOToTTIRAllReduceOpConversionPattern + : public OpConversionPattern { + + using OpConversionPattern::OpConversionPattern; + +public: + LogicalResult + matchAndRewrite(mlir::stablehlo::AllReduceOp srcOp, + mlir::stablehlo::AllReduceOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + // Check legality of the operation + LogicalResult err = checkBasicLegality(srcOp, adaptor, rewriter); + if (failed(err)) { + return err; + } + + // Create the output tensor type based on inputs + auto outputType = mlir::cast( + getTypeConverter()->convertType(srcOp.getResult(0).getType())); + + // Create an empty output tensor with the computed shape + tensor::EmptyOp outputTensor = rewriter.create( + srcOp.getLoc(), outputType.getShape(), outputType.getElementType()); + + SmallVector ttirTypes; + if (failed(this->getTypeConverter()->convertTypes(srcOp->getResultTypes(), + ttirTypes))) { + return failure(); + } + + auto ttirOperands = srcOp.getOperandsMutable(); + ttirOperands.append(ValueRange(outputTensor)); + + SmallVector srcAttrs = to_vector(srcOp->getAttrs()); + SmallVector ttirAttrs; + for (auto srcAttr : srcAttrs) { + StringAttr srcName = srcAttr.getName(); + if (srcName == "channel_handle") { + auto srcChannelHandleAttr = + dyn_cast(srcAttr.getValue()); + if (!srcChannelHandleAttr) { + return failure(); + } + + // channelType is supposed to be DEVICE_TO_DEVICE for CCL ops. + // Currently, we ensure if it is DEVICE_TO_DEVICE commmuincaiton. + // Consider preserving this information in the future if the attribute + // is non-DEVICE_TO_DEVICE values. 
+ auto channelType = static_cast(srcChannelHandleAttr.getType()); + if (channelType != kChannelTypeDeviceToDevice) { + return failure(); + } + + IntegerAttr channelHandleAttr = rewriter.getSI32IntegerAttr( + static_cast(srcChannelHandleAttr.getHandle())); + if (!channelHandleAttr) { + return failure(); + } + ttirAttrs.push_back({srcName, channelHandleAttr}); + } else { + ttirAttrs.push_back(srcAttr); + } + } + + // Algorithm here is to search for the first non-one working dimension + auto replicaGroupsShape = adaptor.getReplicaGroups().getType().getShape(); + size_t dim = 0; + for (auto s : replicaGroupsShape) { + if (s != 1) { + break; + } + ++dim; + } + if (dim > replicaGroupsShape.size()) { + // all one shape, then select the fastest dim + dim = replicaGroupsShape.size(); + } + StringAttr dimName = StringAttr::get(this->getContext(), "dim"); + IntegerAttr dimAttr = + rewriter.getSI32IntegerAttr(static_cast(dim)); + ttirAttrs.push_back({dimName, dimAttr}); + + // Parse computation in region and add it to ttirAttrs + ReduceType reduceType; + if (failed(getReduceType(srcOp, reduceType))) { + return rewriter.notifyMatchFailure( + srcOp, "AllReduceOp cannot specify reduce type."); + } + StringAttr reduceTypeAttrName = + StringAttr::get(this->getContext(), "reduce_type"); + Attribute reduceTypeAttr = rewriter.getAttr(reduceType); + ttirAttrs.push_back({reduceTypeAttrName, reduceTypeAttr}); + + StringAttr operationConstraintAttrName = + StringAttr::get(this->getContext(), "operand_constraints"); + Attribute operationConstraintAttr = rewriter.getArrayAttr( + SmallVector(adaptor.getOperands().size() + 1, + rewriter.getAttr( + OperandConstraint::AnyDeviceTile))); + ttirAttrs.push_back({operationConstraintAttrName, operationConstraintAttr}); + + auto ttirAllReduceOp = rewriter.create( + srcOp.getLoc(), ttirTypes, ValueRange(ttirOperands.getAsOperandRange()), + ttirAttrs); + + rewriter.replaceOp(srcOp, ttirAllReduceOp); + + return success(); + } + +private: + LogicalResult + checkBasicLegality(mlir::stablehlo::AllReduceOp &srcOp, + mlir::stablehlo::AllReduceOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const { + if (srcOp.getOperands().empty() || srcOp.getOperands().size() > 1) { + return rewriter.notifyMatchFailure( + srcOp, "AllReduceOp must have one input/output for now."); + } + + return success(); + } +}; // namespace + +class StableHLOToTTIRCustomCallOpConversionPattern + : public OpConversionPattern { + + using OpConversionPattern::OpConversionPattern; + +public: + LogicalResult + matchAndRewrite(mlir::stablehlo::CustomCallOp srcOp, + mlir::stablehlo::CustomCallOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + // Check legality of the operation + LogicalResult err = checkBasicLegality(srcOp, adaptor, rewriter); + if (failed(err)) { + return err; + } + + const std::string kShardingTarget = "Sharding"; + const std::string kSPMDFullToShardShapeTarget = "SPMDFullToShardShape"; + const std::string kSPMDShardToFullShapeTarget = "SPMDShardToFullShape"; + + auto callTargetName = adaptor.getCallTargetNameAttr(); + + // Currently stablehlo.custom_call with following functions from + // jax/openxla are supported + if (callTargetName != kShardingTarget && + callTargetName != kSPMDFullToShardShapeTarget && + callTargetName != kSPMDShardToFullShapeTarget) { + return failure(); + } + + auto shardingAttr = dyn_cast_or_null( + adaptor.getAttributes().get("mhlo.sharding")); + if (!shardingAttr) { + return failure(); + } + StringRef shardingStr = 
shardingAttr.getValue(); + if (!shardingStr.consume_front("{") || !shardingStr.consume_back("}")) { + return failure(); + } + SmallVector shardingStrAttrs; + shardingStr.split(shardingStrAttrs, " "); + struct ShardAttrValue shardAttrValue; + if (failed(parseShardingAttr(rewriter, shardingStrAttrs, shardAttrValue))) { + return failure(); + } + + if (callTargetName == kSPMDFullToShardShapeTarget) { + Operation *shardingOp = srcOp->getOperand(0).getDefiningOp(); + if (!shardingOp) { + return rewriter.notifyMatchFailure( + srcOp, "requires operand to be defined by an op"); + } + + // TODO(wooseoklee): a bit rough approach here to match output dim + shardingOp->getResult(0).setType(srcOp->getResult(0).getType()); + srcOp.getResult(0).replaceAllUsesWith(shardingOp->getResult(0)); + rewriter.eraseOp(srcOp); + } else if (callTargetName == kSPMDShardToFullShapeTarget) { + Operation *shardingOp = srcOp->getOperand(0).getDefiningOp(); + if (!shardingOp) { + return rewriter.notifyMatchFailure( + srcOp, "requires operand to be defined by an op"); + } + + // Create the output tensor type based on inputs + auto outputType = mlir::cast( + getTypeConverter()->convertType(srcOp->getResult(0).getType())); + + // Create an empty output tensor with the computed shape + tensor::EmptyOp outputTensor = rewriter.create( + srcOp.getLoc(), outputType.getShape(), outputType.getElementType()); + + SmallVector outputTypes; + if (failed(this->getTypeConverter()->convertTypes(srcOp->getResultTypes(), + outputTypes))) { + return failure(); + } + + shardAttrValue.shardDirection = mlir::tt::MeshShardDirection::ShardToFull; + if (failed(createMeshShardOp(srcOp, adaptor, outputTensor, outputTypes, + shardAttrValue, rewriter))) { + return failure(); + } + } else if (callTargetName == kShardingTarget) { + if (shardAttrValue.shardType == mlir::tt::MeshShardType::Manual) { + // "manual" sharding indicates match between input/output tensor shape + // and no sharding is required. 
+ srcOp.getResult(0).replaceAllUsesWith(srcOp->getOperand(0)); + rewriter.eraseOp(srcOp); + } else { + auto *user = *srcOp.getResult(0).user_begin(); + auto userOp = dyn_cast_or_null(user); + if (!userOp) { + return failure(); + } + + // Create the output tensor type based on inputs + auto outputType = mlir::cast( + getTypeConverter()->convertType(userOp->getResult(0).getType())); + + // Create an empty output tensor with the computed shape + tensor::EmptyOp outputTensor = rewriter.create( + srcOp.getLoc(), outputType.getShape(), outputType.getElementType()); + + SmallVector outputTypes; + if (failed(this->getTypeConverter()->convertTypes( + userOp->getResultTypes(), outputTypes))) { + return failure(); + } + + shardAttrValue.shardDirection = + mlir::tt::MeshShardDirection::FullToShard; + if (failed(createMeshShardOp(srcOp, adaptor, outputTensor, outputTypes, + shardAttrValue, rewriter))) { + return failure(); + } + } + } + return success(); + } + +private: + struct ShardAttrValue { + mlir::tt::MeshShardDirection shardDirection; + mlir::tt::MeshShardType shardType; + bool lastTileDimReplicate; + std::vector shardShape; + }; + + // OpenXLA has its own lexer, but we will use simple string-based parser here + // This parsing is mainly based on "Sharding Attribute" section in + // https://github.com/sdasgup3/stablehlo/blob/80082431d1af0933e6202ecc8a6f8801e039235b/docs/spec.md + LogicalResult parseShardingAttr(ConversionPatternRewriter &rewriter, + SmallVector shardingStrAttrs, + struct ShardAttrValue &shardAttrValue) const { + MeshShardType shardType = mlir::tt::MeshShardType::Manual; + bool lastTileDimReplicate = false; + for (auto str : shardingStrAttrs) { + if (str.contains("replicated")) { + assert(shardType == mlir::tt::MeshShardType::Manual && + "Fail to parse sharding info."); + // replicated: all devices have whole data + shardType = mlir::tt::MeshShardType::Replicate; + shardAttrValue.shardShape.push_back(1); + } else if (str.contains("maximal")) { + assert(shardType == mlir::tt::MeshShardType::Manual && + "Fail to parse sharding info."); + // maximal: one device has whole data + shardType = mlir::tt::MeshShardType::Maximal; + shardAttrValue.shardShape.push_back(1); + } else if (str.contains("device=")) { + // maximal should followed by "device" to put data on + assert(shardType == mlir::tt::MeshShardType::Maximal && + "Fail to parse sharding info."); + int64_t d; + if (!str.consume_front("device=")) { + return failure(); + } + if (str.getAsInteger(10, d)) { + return failure(); + } + shardAttrValue.shardShape.push_back(d); + } else if (str.contains("manual")) { + assert(shardType == mlir::tt::MeshShardType::Manual && + "Fail to parse sharding info."); + // manual: already sharded, so no action is needed + assert(!lastTileDimReplicate && + "last time dim duplicate option shouldn't be set here."); + shardAttrValue.shardShape.push_back(1); + } else if (str.contains("devices=")) { + // other: "devices" detail sharding plan + assert(shardType == mlir::tt::MeshShardType::Manual && + "Fail to parse sharding info."); + shardType = mlir::tt::MeshShardType::Devices; + if (!str.consume_front("devices=")) { + return failure(); + } + auto [devicesStr, restStr] = str.split("<="); + // parse devices ex. 
[4,2,1] + if (!devicesStr.consume_front("[") || !devicesStr.consume_back("]")) { + return failure(); + } + SmallVector dimsStr; + devicesStr.split(dimsStr, ","); + for (auto dim : dimsStr) { + int64_t d; + if (dim.getAsInteger(10, d)) { + return failure(); + } + shardAttrValue.shardShape.push_back(d); + } + } else if (str.contains("last_tile_dim_replicate")) { + assert(shardType == mlir::tt::MeshShardType::Devices && + "Fail to parse sharding info."); + // other: replicate last tile dim + lastTileDimReplicate = true; + } + } + shardAttrValue.shardType = shardType; + shardAttrValue.lastTileDimReplicate = lastTileDimReplicate; + return success(); + } + + LogicalResult + createMeshShardOp(mlir::stablehlo::CustomCallOp &srcOp, + mlir::stablehlo::CustomCallOp::Adaptor adaptor, + tensor::EmptyOp &outputTensor, + SmallVector &outputTypes, + ShardAttrValue &shardAttrValue, + ConversionPatternRewriter &rewriter) const { + + auto meshShardOperands = srcOp.getInputsMutable(); + meshShardOperands.append(ValueRange(outputTensor)); + SmallVector meshShardAttrs; + + StringAttr shardTypeAttrName = rewriter.getStringAttr("shard_type"); + Attribute shardTypeAttr = + rewriter.getAttr(shardAttrValue.shardType); + meshShardAttrs.push_back({shardTypeAttrName, shardTypeAttr}); + + StringAttr shardDirectionAttrName = + rewriter.getStringAttr("shard_direction"); + Attribute shardDirectionAttr = + rewriter.getAttr(shardAttrValue.shardDirection); + meshShardAttrs.push_back({shardDirectionAttrName, shardDirectionAttr}); + + StringAttr shardShapeAttrName = rewriter.getStringAttr("shard_shape"); + if (shardAttrValue.lastTileDimReplicate) { + shardAttrValue.shardShape.pop_back(); + } + GridAttr shardShape = + GridAttr::get(this->getContext(), shardAttrValue.shardShape); + meshShardAttrs.push_back({shardShapeAttrName, shardShape}); + + StringAttr operationConstraintAttrName = + StringAttr::get(this->getContext(), "operand_constraints"); + Attribute operationConstraintAttr = rewriter.getArrayAttr( + SmallVector(adaptor.getOperands().size() + 1, + rewriter.getAttr( + OperandConstraint::SystemScalar))); + meshShardAttrs.push_back( + {operationConstraintAttrName, operationConstraintAttr}); + + auto meshShardOp = rewriter.create( + srcOp.getLoc(), outputTypes, + ValueRange(meshShardOperands.getAsOperandRange()), meshShardAttrs); + rewriter.replaceOp(srcOp, meshShardOp); + + return success(); + } + + LogicalResult + checkBasicLegality(mlir::stablehlo::CustomCallOp &srcOp, + mlir::stablehlo::CustomCallOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const { + + // Expect single input/output, otherwise do not convert + if (adaptor.getInputs().size() != 1 && srcOp->getResults().size() != 1) { + return failure(); + } + + return success(); + } +}; // namespace + class StableHLOToTTIRSliceOpConversionPattern : public OpConversionPattern { @@ -1364,6 +1799,13 @@ void addReshapeOpConversionPattern(MLIRContext *ctx, patterns.add(typeConverter, ctx); } +void addCCLOpsConversionPattern(MLIRContext *ctx, RewritePatternSet &patterns, + TypeConverter &typeConverter) { + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, + ctx); +} + void addLogicalOpConversionPattern(MLIRContext *ctx, RewritePatternSet &patterns, TypeConverter &typeConverter) { @@ -1425,6 +1867,7 @@ void populateStableHLOToTTIRPatterns(MLIRContext *ctx, addConcatOpsConversionPatterns(ctx, patterns, typeConverter); addReshapeOpConversionPattern(ctx, patterns, typeConverter); addLogicalOpConversionPattern(ctx, patterns, typeConverter); + 
addCCLOpsConversionPattern(ctx, patterns, typeConverter); addSliceOpConversionPattern(ctx, patterns, typeConverter); addClampOpConversionPattern(ctx, patterns, typeConverter); addGatherOpConversionPattern(ctx, patterns, typeConverter); diff --git a/lib/Dialect/TTIR/IR/TTIROps.cpp b/lib/Dialect/TTIR/IR/TTIROps.cpp index 3cd28626a4..8f404323e2 100644 --- a/lib/Dialect/TTIR/IR/TTIROps.cpp +++ b/lib/Dialect/TTIR/IR/TTIROps.cpp @@ -1289,6 +1289,40 @@ ::mlir::LogicalResult mlir::tt::ttir::AllGatherOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// AllReduceOp +//===----------------------------------------------------------------------===// + +// AllReduceOp verification +::mlir::LogicalResult mlir::tt::ttir::AllReduceOp::verify() { + ::mlir::RankedTensorType inputType = + mlir::cast(getInputs().front().getType()); + int32_t dim = getDim(); + + if (dim >= inputType.getRank()) { + return emitOpError("Invalid dimension for all_reduce op."); + } + + return success(); +} + +//===----------------------------------------------------------------------===// +// MeshShardOp +//===----------------------------------------------------------------------===// + +// MeshShardOp verification +::mlir::LogicalResult mlir::tt::ttir::MeshShardOp::verify() { + auto shardType = getShardType(); + + // currently we are only supporting replicate or devices from StableHLO + if (shardType != mlir::tt::MeshShardType::Replicate && + shardType != mlir::tt::MeshShardType::Devices) { + return emitOpError("Invalid shard_type for mesh_shard op."); + } + + return success(); +} + //===----------------------------------------------------------------------===// // GenericOp //===----------------------------------------------------------------------===// diff --git a/test/ttmlir/Conversion/StableHLOToTTIR/ccl_ops.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/ccl_ops.mlir new file mode 100644 index 0000000000..5fbab794c6 --- /dev/null +++ b/test/ttmlir/Conversion/StableHLOToTTIR/ccl_ops.mlir @@ -0,0 +1,83 @@ +// REQUIRES: stablehlo +// RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | FileCheck %s + +// jax/pjrt sharding target 1x2 for n300 +module @jit_matmul_basic attributes {mhlo.num_partitions = 2 : i32, mhlo.num_replicas = 1 : i32} { + func.func public @main(%arg0: tensor<8192x784xf32> {mhlo.layout_mode = "default"}, %arg1: tensor<784x16384xf32> {mhlo.layout_mode = "default"}) -> (tensor<8192x16384xf32> {jax.result_info = "", mhlo.layout_mode = "default"}) { + %0 = stablehlo.custom_call @Sharding(%arg0) {backend_config = "", mhlo.sharding = "{devices=[1,2]<=[2]}"} : (tensor<8192x784xf32>) -> tensor<8192x784xf32> + %1 = stablehlo.custom_call @SPMDFullToShardShape(%0) {backend_config = "", mhlo.sharding = "{manual}"} : (tensor<8192x784xf32>) -> tensor<8192x392xf32> + // CHECK: %[[C:.*]] = "ttir.mesh_shard"[[C:.*]] + %2 = stablehlo.custom_call @Sharding(%arg1) {backend_config = "", mhlo.sharding = "{devices=[2,1]<=[2]}"} : (tensor<784x16384xf32>) -> tensor<784x16384xf32> + %3 = stablehlo.custom_call @SPMDFullToShardShape(%2) {backend_config = "", mhlo.sharding = "{manual}"} : (tensor<784x16384xf32>) -> tensor<392x16384xf32> + // CHECK: %[[C:.*]] = "ttir.mesh_shard"[[C:.*]] + %4 = call @shmap_body(%1, %3) : (tensor<8192x392xf32>, tensor<392x16384xf32>) -> tensor<8192x16384xf32> + %5 = stablehlo.custom_call @Sharding(%4) {backend_config = "", mhlo.sharding = "{manual}"} : (tensor<8192x16384xf32>) -> tensor<8192x16384xf32> + %6 = stablehlo.custom_call 
@SPMDShardToFullShape(%5) {backend_config = "", mhlo.sharding = "{replicated}"} : (tensor<8192x16384xf32>) -> tensor<8192x16384xf32> + // CHECK: %[[C:.*]] = "ttir.mesh_shard"[[C:.*]] + return %6 : tensor<8192x16384xf32> + } + func.func private @shmap_body(%arg0: tensor<8192x392xf32>, %arg1: tensor<392x16384xf32>) -> (tensor<8192x16384xf32> {jax.result_info = "[('x',), None]"}) { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor<8192x392xf32>, tensor<392x16384xf32>) -> tensor<8192x16384xf32> + %1 = "stablehlo.all_reduce"(%0) <{channel_handle = #stablehlo.channel_handle, replica_groups = dense<[[0, 1]]> : tensor<1x2xi64>, use_global_device_ids}> ({ + ^bb0(%arg2: tensor, %arg3: tensor): + %2 = stablehlo.add %arg2, %arg3 : tensor + stablehlo.return %2 : tensor + }) : (tensor<8192x16384xf32>) -> tensor<8192x16384xf32> + // CHECK: %[[C:.*]] = "ttir.all_reduce"[[C:.*]] + return %1 : tensor<8192x16384xf32> + } +} + +// jax/pjrt sharding target 2x4 for t3k +module @jit_matmul_basic2 attributes {mhlo.num_partitions = 8 : i32, mhlo.num_replicas = 1 : i32} { + func.func public @main(%arg0: tensor<8192x784xf32>, %arg1: tensor<784x16384xf32>) -> (tensor<8192x16384xf32> {jax.result_info = ""}) { + %0 = stablehlo.custom_call @Sharding(%arg0) {backend_config = "", mhlo.sharding = "{devices=[2,4]<=[8]}"} : (tensor<8192x784xf32>) -> tensor<8192x784xf32> + %1 = stablehlo.custom_call @SPMDFullToShardShape(%0) {backend_config = "", mhlo.sharding = "{manual}"} : (tensor<8192x784xf32>) -> tensor<4096x196xf32> + // CHECK: %[[C:.*]] = "ttir.mesh_shard"[[C:.*]] + %2 = stablehlo.custom_call @Sharding(%arg1) {backend_config = "", mhlo.sharding = "{devices=[4,1,2]<=[2,4]T(1,0) last_tile_dim_replicate}"} : (tensor<784x16384xf32>) -> tensor<784x16384xf32> + %3 = stablehlo.custom_call @SPMDFullToShardShape(%2) {backend_config = "", mhlo.sharding = "{manual}"} : (tensor<784x16384xf32>) -> tensor<196x16384xf32> + // CHECK: %[[C:.*]] = "ttir.mesh_shard"[[C:.*]] + %4 = call @shmap_body(%1, %3) : (tensor<4096x196xf32>, tensor<196x16384xf32>) -> tensor<4096x16384xf32> + %5 = stablehlo.custom_call @Sharding(%4) {backend_config = "", mhlo.sharding = "{manual}"} : (tensor<4096x16384xf32>) -> tensor<4096x16384xf32> + %6 = stablehlo.custom_call @SPMDShardToFullShape(%5) {backend_config = "", mhlo.sharding = "{devices=[2,1,4]<=[8] last_tile_dim_replicate}"} : (tensor<4096x16384xf32>) -> tensor<8192x16384xf32> + // CHECK: %[[C:.*]] = "ttir.mesh_shard"[[C:.*]] + return %6 : tensor<8192x16384xf32> + } + func.func private @shmap_body(%arg0: tensor<4096x196xf32>, %arg1: tensor<196x16384xf32>) -> (tensor<4096x16384xf32> {jax.result_info = "[('x',), None]"}) { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor<4096x196xf32>, tensor<196x16384xf32>) -> tensor<4096x16384xf32> + %1 = "stablehlo.all_reduce"(%0) <{channel_handle = #stablehlo.channel_handle, replica_groups = dense<[[0, 1, 2, 3], [4, 5, 6, 7]]> : tensor<2x4xi64>, use_global_device_ids}> ({ + ^bb0(%arg2: tensor, %arg3: tensor): + %2 = stablehlo.add %arg2, %arg3 : tensor + stablehlo.return %2 : tensor + }) : (tensor<4096x16384xf32>) -> tensor<4096x16384xf32> + // CHECK: %[[C:.*]] = "ttir.all_reduce"[[C:.*]] + return %1 : tensor<4096x16384xf32> + } +} + +// jax/pjrt sharding target 1x8 for t3k +module @jit_matmul_basic3 attributes {mhlo.num_partitions = 8 : i32, mhlo.num_replicas = 1 : i32} { + func.func public @main(%arg0: tensor<8192x784xf32> 
{mhlo.layout_mode = "default"}, %arg1: tensor<784x16384xf32> {mhlo.layout_mode = "default"}) -> (tensor<8192x16384xf32> {jax.result_info = "", mhlo.layout_mode = "default"}) { + %0 = stablehlo.custom_call @Sharding(%arg0) {backend_config = "", mhlo.sharding = "{devices=[1,8]<=[8]}"} : (tensor<8192x784xf32>) -> tensor<8192x784xf32> + %1 = stablehlo.custom_call @SPMDFullToShardShape(%0) {backend_config = "", mhlo.sharding = "{manual}"} : (tensor<8192x784xf32>) -> tensor<8192x98xf32> + // CHECK: %[[C:.*]] = "ttir.mesh_shard"[[C:.*]] + %2 = stablehlo.custom_call @Sharding(%arg1) {backend_config = "", mhlo.sharding = "{devices=[8,1]<=[8]}"} : (tensor<784x16384xf32>) -> tensor<784x16384xf32> + %3 = stablehlo.custom_call @SPMDFullToShardShape(%2) {backend_config = "", mhlo.sharding = "{manual}"} : (tensor<784x16384xf32>) -> tensor<98x16384xf32> + // CHECK: %[[C:.*]] = "ttir.mesh_shard"[[C:.*]] + %4 = call @shmap_body(%1, %3) : (tensor<8192x98xf32>, tensor<98x16384xf32>) -> tensor<8192x16384xf32> + %5 = stablehlo.custom_call @Sharding(%4) {backend_config = "", mhlo.sharding = "{manual}"} : (tensor<8192x16384xf32>) -> tensor<8192x16384xf32> + %6 = stablehlo.custom_call @SPMDShardToFullShape(%5) {backend_config = "", mhlo.sharding = "{replicated}"} : (tensor<8192x16384xf32>) -> tensor<8192x16384xf32> + // CHECK: %[[C:.*]] = "ttir.mesh_shard"[[C:.*]] + return %6 : tensor<8192x16384xf32> + } + func.func private @shmap_body(%arg0: tensor<8192x98xf32>, %arg1: tensor<98x16384xf32>) -> (tensor<8192x16384xf32> {jax.result_info = "[('x',), None]"}) { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor<8192x98xf32>, tensor<98x16384xf32>) -> tensor<8192x16384xf32> + %1 = "stablehlo.all_reduce"(%0) <{channel_handle = #stablehlo.channel_handle, replica_groups = dense<[[0, 1, 2, 3, 4, 5, 6, 7]]> : tensor<1x8xi64>, use_global_device_ids}> ({ + ^bb0(%arg2: tensor, %arg3: tensor): + %2 = stablehlo.add %arg2, %arg3 : tensor + stablehlo.return %2 : tensor + }) : (tensor<8192x16384xf32>) -> tensor<8192x16384xf32> + // CHECK: %[[C:.*]] = "ttir.all_reduce"[[C:.*]] + return %1 : tensor<8192x16384xf32> + } +} From 4083e98b3cf1c2e7975e87af6226e13087aa8646 Mon Sep 17 00:00:00 2001 From: Aleksandar Zecevic Date: Tue, 26 Nov 2024 15:36:35 +0100 Subject: [PATCH 23/84] Minor fix of TTNNToFlatbuffer createOps (#1398) Concrete op types shouldn't be shadowed by template arguments when they are only ever going to be instantiated with concrete types with same names anyway, it's confusing for no reason. 
---
 lib/Target/TTNN/TTNNToFlatbuffer.cpp | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/lib/Target/TTNN/TTNNToFlatbuffer.cpp b/lib/Target/TTNN/TTNNToFlatbuffer.cpp
index 5677ce94b1..e7df85956f 100644
--- a/lib/Target/TTNN/TTNNToFlatbuffer.cpp
+++ b/lib/Target/TTNN/TTNNToFlatbuffer.cpp
@@ -595,7 +595,6 @@ createReductionOp(FlatbufferObjectCache &cache, ReductionOp op) {
                                              dim_arg, op.getKeepDim());
 }
 
-template <typename TransposeOp>
 ::flatbuffers::Offset<::tt::target::ttnn::TransposeOp>
 createTransposeOp(FlatbufferObjectCache &cache, TransposeOp op) {
   auto in =
@@ -608,7 +607,6 @@ createTransposeOp(FlatbufferObjectCache &cache, TransposeOp op) {
   return ::tt::target::ttnn::CreateTransposeOp(*cache.fbb, in, out, dim0, dim1);
 }
 
-template <typename ConcatOp>
 ::flatbuffers::Offset<::tt::target::ttnn::ConcatOp>
 createConcatOp(FlatbufferObjectCache &cache, ConcatOp op) {
   std::vector<::flatbuffers::Offset<::tt::target::TensorRef>> ins;
@@ -623,7 +621,6 @@ createConcatOp(FlatbufferObjectCache &cache, ConcatOp op) {
   return ::tt::target::ttnn::CreateConcatOpDirect(*cache.fbb, &ins, out, dim);
 }
 
-template <typename EmbeddingOp>
 ::flatbuffers::Offset<::tt::target::ttnn::EmbeddingOp>
 createEmbeddingOp(FlatbufferObjectCache &cache, EmbeddingOp op) {
   auto in0 =
@@ -635,7 +632,6 @@ createEmbeddingOp(FlatbufferObjectCache &cache, EmbeddingOp op) {
   return ::tt::target::ttnn::CreateEmbeddingOp(*cache.fbb, in0, in1, output);
 }
 
-template <typename ReshapeOp>
 ::flatbuffers::Offset<::tt::target::ttnn::ReshapeOp>
 createReshapeOp(FlatbufferObjectCache &cache, ReshapeOp op) {
   auto in =
@@ -648,7 +644,6 @@ createReshapeOp(FlatbufferObjectCache &cache, ReshapeOp op) {
   return ::tt::target::ttnn::CreateReshapeOp(*cache.fbb, in, out, shape);
 }
 
-template <typename SliceOp>
 ::flatbuffers::Offset<::tt::target::ttnn::SliceOp>
 createSliceOp(FlatbufferObjectCache &cache, SliceOp op) {
   auto in =
@@ -666,7 +661,6 @@ createSliceOp(FlatbufferObjectCache &cache, SliceOp op) {
       step);
 }
 
-template <typename MaxPool2dOp>
 ::flatbuffers::Offset<::tt::target::ttnn::MaxPool2dOp>
 createMaxPool2dOp(FlatbufferObjectCache &cache, MaxPool2dOp op) {
   auto in =
@@ -684,7 +678,6 @@ createMaxPool2dOp(FlatbufferObjectCache &cache, MaxPool2dOp op) {
       op.getPaddingWidth());
 }
 
-template <typename SoftmaxOp>
 ::flatbuffers::Offset<::tt::target::ttnn::SoftmaxOp>
 createSoftmaxOp(FlatbufferObjectCache &cache, SoftmaxOp op) {
   auto in =
@@ -696,7 +689,6 @@ createSoftmaxOp(FlatbufferObjectCache &cache, SoftmaxOp op) {
   return ::tt::target::ttnn::CreateSoftmaxOp(*cache.fbb, in, out, dimension);
 }
 
-template <typename DeallocateOp>
 ::flatbuffers::Offset<::tt::target::ttnn::DeallocateOp>
 createDeallocateOp(FlatbufferObjectCache &cache, DeallocateOp op) {
   auto in =

From d7798cff64b039483d7a2e3845618198abcf1a8e Mon Sep 17 00:00:00 2001
From: Aleksandar Zecevic
Date: Tue, 26 Nov 2024 22:39:22 +0100
Subject: [PATCH 24/84] Additional verifications of TTIR dialect ops (#1399)

- Refactoring of ElementwiseOpInterface to better reflect intention, with a
  fix of the broadcast shape calculation, considering that the operand that
  represents the destination shouldn't affect the output shape.
- Check the number of operands for AttrSizedOperandSegments ops with simple
  traits.
- Minor refactoring of TTIR_GenericOp.

Addresses https://github.com/tenstorrent/tt-mlir/issues/1289, but I would
leave it open to track further progress with similar traits and interfaces
needed in the TTNN dialect.
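As an illustration of the new Broadcastable check, IR along the following
lines is what the added negative tests are meant to reject. The shapes,
op, and attribute spellings here are hypothetical sketches, not copied from
the new test files:

    func.func @add_bad_broadcast(%arg0: tensor<2x3xf32>, %arg1: tensor<4x3xf32>) -> tensor<4x3xf32> {
      %0 = tensor.empty() : tensor<4x3xf32>
      // Leading dimensions 2 and 4 are both non-1, so the input shapes do not
      // broadcast to a common shape and verification should fail, regardless
      // of the shape of the destination operand %0.
      %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array<i32: 2, 1>, operand_constraints = [#tt.operand_constraint<any>, #tt.operand_constraint<any>, #tt.operand_constraint<any>]}> : (tensor<2x3xf32>, tensor<4x3xf32>, tensor<4x3xf32>) -> tensor<4x3xf32>
      return %1 : tensor<4x3xf32>
    }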
--- include/ttmlir/Dialect/TTIR/IR/TTIROps.td | 51 +++++-------------- .../Dialect/TTIR/IR/TTIROpsInterfaces.h | 2 +- .../Dialect/TTIR/IR/TTIROpsInterfaces.td | 20 +++++++- lib/Dialect/TTIR/IR/TTIROpsInterfaces.cpp | 38 +++++++------- .../TTIR/ttir_broadcastable_negative.mlir | 28 ++++++++++ .../Dialect/TTIR/ttir_noperands_negative.mlir | 37 ++++++++++++++ 6 files changed, 115 insertions(+), 61 deletions(-) create mode 100644 test/ttmlir/Dialect/TTIR/ttir_broadcastable_negative.mlir create mode 100644 test/ttmlir/Dialect/TTIR/ttir_noperands_negative.mlir diff --git a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td index b3701b22e8..cbb5e5ab8d 100644 --- a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td +++ b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td @@ -172,8 +172,12 @@ def TTIR_DeallocOp : TTIR_Op<"dealloc"> { // TTIR top level named ops //===----------------------------------------------------------------------===// +def TwoOperands : ParamNativeOpTrait<"NOperands", "2">; +def ThreeOperands : ParamNativeOpTrait<"NOperands", "3">; +def FourOperands : ParamNativeOpTrait<"NOperands", "4">; + class TTIR_ElementwiseOp traits = []> : - TTIR_DPSOp { + TTIR_DPSOp { let description = [{ Base class for elementwise operations. Elementwise operations can take inputs with different shape, @@ -187,7 +191,7 @@ class TTIR_ElementwiseOp traits = []> : } class TTIR_ElementwiseTernaryOp traits = []> : - TTIR_ElementwiseOp { + TTIR_ElementwiseOp { let summary = "Eltwise ternary op."; let description = [{ Eltwise ternary op. @@ -210,7 +214,7 @@ def TTIR_WhereOp: TTIR_ElementwiseTernaryOp<"where"> { } class TTIR_ElementwiseUnaryOp traits = []> : - TTIR_ElementwiseOp { + TTIR_ElementwiseOp { let summary = "Eltwise unary op."; let description = [{ Eltwise unary op. @@ -424,7 +428,7 @@ def TTIR_LeakyReluOp : TTIR_ElementwiseUnaryWithFloatParameterOp<"leaky_relu"> { } class TTIR_ElementwiseBinaryOp traits = []> : - TTIR_ElementwiseOp { + TTIR_ElementwiseOp { let summary = "Eltwise binary op."; let description = [{ Eltwise binary op. 
@@ -1175,11 +1179,10 @@ class TTIR_GenericElementwiseUnaryOp traits = []> : void buildGenericRegion(::mlir::OpBuilder &opBuilder, ::mlir::Block* block); std::pair<::mlir::ArrayAttr, ::mlir::ArrayAttr> getIndexingMaps(Builder &builder) { - assert(getNumOperands() == 2 && "Input and output operand must have the same rank"); - assert(sameRank(getOperands()) && - "Elementwise unary op must have only one input and one output operand."); + assert(sameRank(getOperation()->getOperands()) && + "Input and output operand must have the same rank"); - auto rank = mlir::cast(getOperand(0).getType()).getRank(); + auto rank = mlir::cast(getOperation()->getOperand(0).getType()).getRank(); SmallVector indexingMaps(2, builder.getMultiDimIdentityMap(rank)); SmallVector iteratorTypes( @@ -1188,19 +1191,6 @@ class TTIR_GenericElementwiseUnaryOp traits = []> : return {builder.getAffineMapArrayAttr(indexingMaps), builder.getArrayAttr(iteratorTypes)}; } - - static bool sameRank(mlir::OperandRange operands) { - if (operands.empty()) { - return true; - } - auto rank = mlir::cast(operands[0].getType()).getRank(); - for (auto operand : operands) { - if (mlir::cast(operand.getType()).getRank() != rank) { - return false; - } - } - return true; - } }]; } @@ -1220,29 +1210,16 @@ class TTIR_GenericElementwiseBinaryOp traits = []> void buildGenericRegion(::mlir::OpBuilder &opBuilder, ::mlir::Block* block); std::pair<::mlir::ArrayAttr, ::mlir::ArrayAttr> getIndexingMaps(Builder &builder) { - assert(sameRank(getOperands()) && + assert(sameRank(getOperation()->getOperands()) && "For now all operands must have the same rank"); - auto rank = mlir::cast(getOperand(0).getType()).getRank(); - SmallVector indexingMaps(getNumOperands(), + auto rank = mlir::cast(getOperation()->getOperand(0).getType()).getRank(); + SmallVector indexingMaps(getOperation()->getNumOperands(), builder.getMultiDimIdentityMap(rank)); SmallVector iteratorTypes( rank, builder.getAttr(IteratorType::Parallel)); return {builder.getAffineMapArrayAttr(indexingMaps), builder.getArrayAttr(iteratorTypes)}; } - - static bool sameRank(mlir::OperandRange operands) { - if (operands.empty()) { - return true; - } - auto rank = mlir::cast(operands[0].getType()).getRank(); - for (auto operand : operands) { - if (mlir::cast(operand.getType()).getRank() != rank) { - return false; - } - } - return true; - } }]; } diff --git a/include/ttmlir/Dialect/TTIR/IR/TTIROpsInterfaces.h b/include/ttmlir/Dialect/TTIR/IR/TTIROpsInterfaces.h index 1d88e8a657..01b6772972 100644 --- a/include/ttmlir/Dialect/TTIR/IR/TTIROpsInterfaces.h +++ b/include/ttmlir/Dialect/TTIR/IR/TTIROpsInterfaces.h @@ -12,7 +12,7 @@ namespace mlir { namespace tt { namespace ttir { namespace detail { -mlir::LogicalResult verifyElementwiseOp(mlir::Operation *op); +mlir::LogicalResult verifyBroadcastable(mlir::Operation *op); } // namespace detail } // namespace ttir } // namespace tt diff --git a/include/ttmlir/Dialect/TTIR/IR/TTIROpsInterfaces.td b/include/ttmlir/Dialect/TTIR/IR/TTIROpsInterfaces.td index cbc0056737..a130332f0d 100644 --- a/include/ttmlir/Dialect/TTIR/IR/TTIROpsInterfaces.td +++ b/include/ttmlir/Dialect/TTIR/IR/TTIROpsInterfaces.td @@ -64,11 +64,13 @@ def TTIROpInterface : OpInterface<"TTIROp"> { ]; } -def TTIR_ElementwiseOpInterface : OpInterface<"ElementwiseOp"> { +def TTIR_Broadcastable : OpInterface<"Broadcastable"> { let cppNamespace = "::mlir::tt::ttir"; + let dependentTraits = [AttrSizedOperandSegments]; + let verify = [{ - return detail::verifyElementwiseOp($_op); + return 
detail::verifyBroadcastable($_op); }]; } @@ -105,6 +107,20 @@ def TTIR_GenericRegionOpInterface : OpInterface<"GenericRegionOp"> { /*methodBody=*/"", /*defaultImplementation=*/"" >, + StaticInterfaceMethod< + /*desc=*/[{ + Return if the given operands have the same rank. + }], + /*retTy=*/"bool", + /*methodName=*/"sameRank", + /*args=*/(ins "::mlir::OperandRange":$operands), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + return llvm::all_equal(llvm::map_range(operands, [](Value operand) { + return mlir::cast(operand.getType()).getRank(); + })); + }] + > ]; } diff --git a/lib/Dialect/TTIR/IR/TTIROpsInterfaces.cpp b/lib/Dialect/TTIR/IR/TTIROpsInterfaces.cpp index 84409174a3..10619f24b8 100644 --- a/lib/Dialect/TTIR/IR/TTIROpsInterfaces.cpp +++ b/lib/Dialect/TTIR/IR/TTIROpsInterfaces.cpp @@ -17,37 +17,33 @@ #include "llvm/ADT/SmallVector.h" mlir::LogicalResult -mlir::tt::ttir::detail::verifyElementwiseOp(mlir::Operation *op) { +mlir::tt::ttir::detail::verifyBroadcastable(mlir::Operation *op) { + const auto getShape = [](const Value val) { + return mlir::cast(val.getType()).getShape(); + }; + + const auto operandSegmentSizes = + op->getAttrOfType("operandSegmentSizes"); + // DPS operands shouldn't affect the result shape. + const auto outputSegmentSize = + operandSegmentSizes[operandSegmentSizes.size() - 1]; + const auto operandShapes = llvm::map_range(op->getOperands(), getShape); llvm::SmallVector broadcastedShape; - mlir::OperandRange operands = op->getOperands(); - mlir::OperandRange::iterator operand_it = operands.begin(); - llvm::SmallVector prevOperandShape( - mlir::cast((*operand_it).getType()).getShape()); - - while (++operand_it != operands.end()) { - llvm::SmallVector nextOperandShape( - mlir::cast((*operand_it).getType()).getShape()); - - if (!OpTrait::util::getBroadcastedShape(prevOperandShape, nextOperandShape, + for (const auto operandShape : + llvm::drop_end(operandShapes, outputSegmentSize)) { + const auto prevBroadcastedShape = broadcastedShape; + if (!OpTrait::util::getBroadcastedShape(prevBroadcastedShape, operandShape, broadcastedShape)) { return op->emitOpError("Operands are not broadcast compatible"); } - prevOperandShape = broadcastedShape; } - llvm::SmallVector resultShape( - mlir::cast(op->getResult(0).getType()) - .getShape()); + // Check that the result shape matches the broadcasted shape of the operands. 
+ llvm::SmallVector resultShape(getShape(op->getResults().front())); if (broadcastedShape != resultShape) { return op->emitOpError( "Result shape must match operand shapes after broadcasting"); } - TypeID expectedBaseTy = op->getResultTypes().front().getTypeID(); - if (!llvm::all_of(op->getOperandTypes(), - [&](Type t) { return t.getTypeID() == expectedBaseTy; })) { - return op->emitOpError() << "All operands/results must have the same type"; - } - return success(); } diff --git a/test/ttmlir/Dialect/TTIR/ttir_broadcastable_negative.mlir b/test/ttmlir/Dialect/TTIR/ttir_broadcastable_negative.mlir new file mode 100644 index 0000000000..e1454ad0a0 --- /dev/null +++ b/test/ttmlir/Dialect/TTIR/ttir_broadcastable_negative.mlir @@ -0,0 +1,28 @@ +// RUN: not ttmlir-opt --split-input-file %s 2>&1 | FileCheck %s +// Negative tests for Broadcastable interface + +// CHECK: 'ttir.abs' op Result shape must match operand shapes after broadcasting +#any_device_tile = #tt.operand_constraint +func.func @eltwise_unary(%arg0: tensor<1x64xbf16>) -> tensor<2x64xbf16> { + %0 = tensor.empty() : tensor<2x64xbf16> + %1 = "ttir.abs"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device_tile, #any_device_tile]}> : (tensor<1x64xbf16>, tensor<2x64xbf16>) -> tensor<2x64xbf16> + return %1 : tensor<2x64xbf16> +} + +// ----- +// CHECK: error: 'ttir.add' op Result shape must match operand shapes after broadcasting +#any_device_tile = #tt.operand_constraint +func.func @eltwise_binary(%arg0: tensor<2x3x64xf32>, %arg1: tensor<64xf32>) -> tensor<4x2x3x64xf32> { + %0 = tensor.empty() : tensor<4x2x3x64xf32> + %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<2x3x64xf32>, tensor<64xf32>, tensor<4x2x3x64xf32>) -> tensor<4x2x3x64xf32> + return %1 : tensor<4x2x3x64xf32> +} + +// ----- +// CHECK: error: 'ttir.where' op Result shape must match operand shapes after broadcasting +#any_device_tile = #tt.operand_constraint +func.func @eltwise_ternary(%arg0: tensor<3x64xf32>, %arg1: tensor<1x3x64xf32>, %arg2: tensor<2x1x64xf32>) -> tensor<1x2x3x64xf32> { + %0 = tensor.empty() : tensor<1x2x3x64xf32> + %1 = "ttir.where"(%arg0, %arg1, %arg2, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<3x64xf32>, tensor<1x3x64xf32>, tensor<2x1x64xf32>, tensor<1x2x3x64xf32>) -> tensor<1x2x3x64xf32> + return %1 : tensor<1x2x3x64xf32> +} diff --git a/test/ttmlir/Dialect/TTIR/ttir_noperands_negative.mlir b/test/ttmlir/Dialect/TTIR/ttir_noperands_negative.mlir new file mode 100644 index 0000000000..a22dc28370 --- /dev/null +++ b/test/ttmlir/Dialect/TTIR/ttir_noperands_negative.mlir @@ -0,0 +1,37 @@ +// RUN: not ttmlir-opt --split-input-file %s 2>&1 | FileCheck %s +// Negative tests for NOperands trait + +// CHECK: error: 'ttir.abs' op expected 2 operands, but found 3 +#any_device_tile = #tt.operand_constraint +func.func @eltwise_unary(%arg0: tensor<64x64xbf16>) -> tensor<64x64xbf16> { + %0 = tensor.empty() : tensor<64x64xbf16> + %1 = "ttir.abs"(%arg0, %arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<64x64xbf16>, tensor<64x64xbf16>, tensor<64x64xbf16>) -> tensor<64x64xbf16> + return %1 : tensor<64x64xbf16> +} + +// ----- +// CHECK: error: 'ttir.add' op expected 3 operands, but found 4 +#any_device_tile = #tt.operand_constraint +func.func @eltwise_binary(%arg0: 
tensor<64x64xf32>, %arg1: tensor<64x64xf32>) -> tensor<64x64xf32> { + %0 = tensor.empty() : tensor<64x64xf32> + %1 = "ttir.add"(%arg0, %arg1, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64xf32>) -> tensor<64x64xf32> + return %1 : tensor<64x64xf32> +} + +// ----- +// CHECK: error: 'ttir.add' op expected 3 operands, but found 2 +#any_device_tile = #tt.operand_constraint +func.func @eltwise_binary(%arg0: tensor<64x64xf32>) -> tensor<64x64xf32> { + %0 = tensor.empty() : tensor<64x64xf32> + %1 = "ttir.add"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device_tile, #any_device_tile]}> : (tensor<64x64xf32>, tensor<64x64xf32>) -> tensor<64x64xf32> + return %1 : tensor<64x64xf32> +} + +// ----- +// CHECK: error: 'ttir.where' op expected 4 operands, but found 5 +#any_device_tile = #tt.operand_constraint +func.func @eltwise_ternary(%arg0: tensor<64x64xf32>, %arg1: tensor<64x64xf32>, %arg2: tensor<64x64xf32>) -> tensor<64x64xf32> { + %0 = tensor.empty() : tensor<64x64xf32> + %1 = "ttir.where"(%arg0, %arg1, %arg2, %arg2, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64xf32>) -> tensor<64x64xf32> + return %1 : tensor<64x64xf32> +} From b1986685f27ecfcb9abbdaa533a34bf46533339a Mon Sep 17 00:00:00 2001 From: Nick Smith <127986401+nsmithtt@users.noreply.github.com> Date: Tue, 26 Nov 2024 14:22:59 -0800 Subject: [PATCH 25/84] Docs update section Docker Notes (#1408) --- docs/src/SUMMARY.md | 2 +- .../{internal-build.md => docker-notes.md} | 20 +++++-------------- 2 files changed, 6 insertions(+), 16 deletions(-) rename docs/src/{internal-build.md => docker-notes.md} (72%) diff --git a/docs/src/SUMMARY.md b/docs/src/SUMMARY.md index beeb35883a..41ca83528c 100644 --- a/docs/src/SUMMARY.md +++ b/docs/src/SUMMARY.md @@ -5,7 +5,7 @@ # User Guide - [Building](./build.md) - - [Internal Build Notes / IRD](./internal-build.md) + - [Docker Notes](./docker-notes.md) - [Tools](./tools.md) - [ttmlir-opt](./ttmlir-opt.md) - [ttmlir-translate](./ttmlir-translate.md) diff --git a/docs/src/internal-build.md b/docs/src/docker-notes.md similarity index 72% rename from docs/src/internal-build.md rename to docs/src/docker-notes.md index 11d2fb8642..1674bf2efc 100644 --- a/docs/src/internal-build.md +++ b/docs/src/docker-notes.md @@ -1,21 +1,11 @@ -# Internal Build Notes / IRD - -- When building the runtime we must use Ubuntu 22.04 docker image - - When making an IRD reservation use `--docker-image - yyz-gitlab.local.tenstorrent.com:5005/tenstorrent/infra/ird-ubuntu-22-04-amd64:latest` -- You'll have to manaully install a newer version of cmake, at least 3.22, the easiest way to do this is to `pip install cmake` and make sure this one is in your path -- You'll want LLVM installation to persist IRD reservations, you can achieve this by: - - mkdir /localdev/$USER/ttmlir-toolchain - - When requesting an IRD use `--volumes /localdev/$USER/ttmlir-toolchain:/opt/ttmlir-toolchain` - -## Working with Docker Images +# Working with Docker Images Components: - Dockerfile - Workflow for building Docker image - Project build using Docker image -### Overview +## Overview We use docker images to prepare project enviroment, install dependancies, tooling and prebuild 
toolchain. Project builds four docker images: @@ -29,11 +19,11 @@ Base image starts with a supported base image (Ubuntu 22.04) and installs depend During the CI Docker build, the project is built and tests are run to ensure that everything is set up correctly. If any dependencies are missing, the Docker build will fail. -### Building the Docker Image using GitHub Actions +## Building the Docker Image using GitHub Actions The GitHub Actions workflow [Build and Publish Docker Image](.github/workflows/build-image.yml) builds the Docker images and uploads them to GitHub Packages at https://github.com/orgs/tenstorrent/packages?repo_name=tt-mlir. We use the git SHA we build from as the tag. -### Building the Docker Image Locally +## Building the Docker Image Locally To test the changes and build the image locally, use the following command: ```bash @@ -43,7 +33,7 @@ docker build -f .github/Dockerfile.ird -build-args FROM_IMAGE=base -t ghcr.io/te docker build -f .github/Dockerfile.ird -build-args FROM_IMAGE=ci -t ghcr.io/tenstorrent/tt-mlir/tt-mlir-ird-ubuntu-22-04:latest . ``` -### Using the Image in GitHub Actions Jobs +## Using the Image in GitHub Actions Jobs The GitHub Actions workflow [Build in Docker](.github/workflows/docker-build.yml) uses a Docker container for building: ```yaml From d22057f9ac87616e590f179bdece3a771e47c607 Mon Sep 17 00:00:00 2001 From: Milan Topalovic <163355844+mtopalovicTT@users.noreply.github.com> Date: Wed, 27 Nov 2024 12:42:16 +0100 Subject: [PATCH 26/84] Minor API fixes for TTNN encoding ettribute (#1390) This PR adds couple convenience methods to TTNN tensor encoding attribute and also removes redundant utils functions. Renaming/Adding some new functions... * `getDataType` to get scalar data type: * `memref<2x2x!tt.tile<32x32xf32>>` returns float data type * `memref<128x128xi32>` returns int data type * `getElementType` to get type from memref: * `memref<2x2x!tt.tile<32x32xf32>>` returns TileType * `memref<128x128xi32>` returns IntegerType * `getLayout` - gets layout of encoding i.e Tile/RowMajor * `getShardShape`: * `memref<2x2x!tt.tile<32x32xf32>>` returns `(2, 2)` * `memref<128x128xi32>` returns `(128, 128)` * `getScalarShardShape`: * `memref<2x2x!tt.tile<32x32xf32>>` returns `(64, 64)` * `memref<128x128xi32>` returns `(128, 128)` --- .../ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.td | 15 +- include/ttmlir/Dialect/TTNN/Utils/Utils.h | 4 - lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp | 42 ++--- lib/Dialect/TTNN/IR/TTNNOps.cpp | 17 +- lib/Dialect/TTNN/IR/TTNNOpsAttrs.cpp | 145 ++++++++++-------- lib/Dialect/TTNN/Transforms/Optimizer.cpp | 18 +-- lib/Dialect/TTNN/Transforms/Passes.cpp | 18 +-- 7 files changed, 121 insertions(+), 138 deletions(-) diff --git a/include/ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.td b/include/ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.td index bba7fe6f2c..e45fba0031 100644 --- a/include/ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.td +++ b/include/ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.td @@ -109,6 +109,13 @@ def TTNN_TTNNLayoutAttr: TTNN_Attr<"TTNNLayout", "ttnn_layout"> { let summary = "Tensor encoding attribute used for types in ttnn"; let description = [{ Layout attribute in ttnn. This attribute is used to encode different information about tensor memory layout. + Here is how tensor will look like after layout tensor<32x32x64xf32, #ttnn.ttnn_layout> + Lets break down what each parameter means: + - linear: An affine map that defines how the logical tensor dimensions map to physical space. + - grid: The grid shape (of tensix cores) where tensor is divided onto. 
+ - memref: A memref is used to describe shard size and memory space. Shard size is calculated by dividing the tensor size by grid size. + - mem_layout: The layout of the tensor in memory. For tensor on host it should be None. For tensor on device + it can be interleaved or sharded. }]; let parameters = (ins AttrParameter<"AffineMap", "An affine map that defines how the logical tensor dimensions map to a grid shape.">:$linear, @@ -142,15 +149,15 @@ def TTNN_TTNNLayoutAttr: TTNN_Attr<"TTNNLayout", "ttnn_layout"> { bool hasShardedL1TensorMemoryLayout() const; bool hasInterleavedL1TensorMemoryLayout() const; bool isTiled() const; + Layout getLayout() const; Type getElementType() const; - DataType getDataTypeFromMemRef() const; + DataType getDataType() const; uint64_t getElementSizeBytes() const; int64_t getTensorSizeInBytes(ArrayRef tensorShape, ::mlir::tt::DeviceAttr device) const; llvm::SmallVector getStride(ArrayRef logicalShape) const; - llvm::SmallVector getPhysicalShape(ArrayRef logicalShape) const; - llvm::SmallVector getShardShape(bool convertTileToScalar = true) const; + llvm::SmallVector getShardShape() const; + llvm::SmallVector getScalarShardShape() const; AffineMap replaceMemoryMapSymbolsWithShardShape(AffineMap physicalMemoryMap) const; - AffineMap projectOnto(AffineMap linearMap, AffineMap physicalMemoryMap) const; AffineMap getIdentityTileLinearMap() const; llvm::SmallVector getTiledShape(ArrayRef logicalTensorShape) const; }]; diff --git a/include/ttmlir/Dialect/TTNN/Utils/Utils.h b/include/ttmlir/Dialect/TTNN/Utils/Utils.h index a6e10c0991..533235a610 100644 --- a/include/ttmlir/Dialect/TTNN/Utils/Utils.h +++ b/include/ttmlir/Dialect/TTNN/Utils/Utils.h @@ -31,10 +31,6 @@ mlir::tt::TensorMemoryLayout toTTTensorMemoryLayout( mlir::tt::MemorySpace toTTMemorySpace(const mlir::tt::ttnn::BufferType bufferType); -DataType getDataTypeFromMemRef(mlir::MemRefType memref); - -Layout getLayoutFromMemRef(mlir::MemRefType memref); - mlir::Type createRowMajorTypeFromDtype(::mlir::MLIRContext *context, DataType dtype); diff --git a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp index 9dbc9cf978..3241928f45 100644 --- a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp +++ b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp @@ -65,20 +65,15 @@ class TensorEmptyConversionPattern // Get the shape of the tensor, tensor layout, and data type // - mlir::MemRefType memref = layoutAttr.getMemref(); ttnn::ShapeAttr shapeAttr = ttnn::ShapeAttr::get( rewriter.getContext(), mlir::cast(op->getResult(0).getType()).getShape()); - Type elementType = memref.getElementType(); - DataType dtype = DataType::Float32; + DataType dtype = layoutAttr.getDataType(); ttnn::Layout ttnnLayoutEnum = ttnn::Layout::RowMajor; - if (llvm::isa(elementType)) { + if (layoutAttr.isTiled()) { ttnnLayoutEnum = ttnn::Layout::Tile; - auto tileType = mlir::cast(elementType); - dtype = tileType.getDataType(); } else { ttnnLayoutEnum = ttnn::Layout::RowMajor; - dtype = elementTypeToDataType(elementType); } DataTypeAttr dTypeAttr = DataTypeAttr::get(rewriter.getContext(), dtype); ttnn::LayoutAttr tensorLayoutAttr = @@ -101,13 +96,14 @@ class TensorEmptyConversionPattern // Create MemoryConfigAttr // auto device = getOrInsertDevice(rewriter, op); + llvm::SmallVector shardShape = layoutAttr.getShardShape(); ttnn::MemoryConfigAttr memoryConfigAttr = ttnn::MemoryConfigAttr::get( op.getContext(), ttnn::TensorMemoryLayoutAttr::get(op.getContext(), memLayout), ttnn::BufferTypeAttr::get(op.getContext(), bufferType), 
ttnn::ShardSpecAttr::get( op.getContext(), - ttnn::ShapeAttr::get(op.getContext(), memref.getShape()))); + ttnn::ShapeAttr::get(op.getContext(), shardShape))); rewriter.replaceOpWithNewOp( op, this->getTypeConverter()->convertType(op.getType()), device, @@ -137,18 +133,15 @@ class ToLayoutOpConversionPattern auto outputLayoutAttr = mlir::cast( op.getResult().getType().getEncoding()); - auto outputMemref = outputLayoutAttr.getMemref(); - // Determine the output data type - DataType dtype = ttnn::utils::getDataTypeFromMemRef(outputMemref); + DataType dtype = outputLayoutAttr.getDataType(); DataTypeAttr outputDataType = DataTypeAttr::get(rewriter.getContext(), dtype); // Determine the output layout (tile or row major) ttnn::BufferType outputBufferType = outputLayoutAttr.getBufferType(); - ttnn::Layout outputLayoutEnum = - ttnn::utils::getLayoutFromMemRef(outputMemref); + ttnn::Layout outputLayoutEnum = outputLayoutAttr.getLayout(); bool isOutputOnHost = (outputBufferType == ttnn::BufferType::SystemMemory); @@ -176,13 +169,14 @@ class ToLayoutOpConversionPattern op.getResult().setType(result); outputLayoutAttr = mlir::cast(result.getEncoding()); - outputMemref = outputLayoutAttr.getMemref(); outputLayoutEnum = newOutputLayoutEnum; } } ttnn::LayoutAttr outputLayout = ttnn::LayoutAttr::get(rewriter.getContext(), outputLayoutEnum); + llvm::SmallVector outputShardShape = + outputLayoutAttr.getShardShape(); // Determine output memory config attr ttnn::TensorMemoryLayout outputTensorMemoryLayout = @@ -193,8 +187,8 @@ class ToLayoutOpConversionPattern outputTensorMemoryLayout), ttnn::BufferTypeAttr::get(rewriter.getContext(), outputBufferType), ttnn::ShardSpecAttr::get( - op.getContext(), ttnn::ShapeAttr::get(rewriter.getContext(), - outputMemref.getShape()))); + op.getContext(), + ttnn::ShapeAttr::get(rewriter.getContext(), outputShardShape))); rewriter.replaceOpWithNewOp( op, this->getTypeConverter()->convertType(result), adaptor.getInput(), @@ -222,15 +216,16 @@ class ToLayoutOpConversionPattern ttnn::Layout newOutputLayoutEnum) const { auto oldOutputLayoutAttr = mlir::cast(oldOutput.getEncoding()); - auto oldOutputMemref = oldOutputLayoutAttr.getMemref(); - DataType outputDtype = ttnn::utils::getDataTypeFromMemRef(oldOutputMemref); - llvm::ArrayRef oldShardShape = oldOutputMemref.getShape(); + DataType outputDtype = oldOutputLayoutAttr.getDataType(); + SmallVector oldShardShape = + oldOutputLayoutAttr.getShardShape(); size_t shardShapeSize = oldShardShape.size(); assert(shardShapeSize >= 2 && "expected at least 2D shape"); if (newOutputLayoutEnum == ttnn::Layout::RowMajor) { // Set shard shape to match convention of row major layout - auto tileType = mlir::cast(oldOutputMemref.getElementType()); + auto tileType = + mlir::cast(oldOutputLayoutAttr.getElementType()); llvm::SmallVector newShardShape(oldShardShape.begin(), oldShardShape.end()); newShardShape[shardShapeSize - 2] = @@ -804,9 +799,7 @@ class TypecastOpConversionPattern ttnn::TTNNLayoutAttr outputLayoutAttr = mlir::cast(result.getType().getEncoding()); - mlir::MemRefType outputMemref = outputLayoutAttr.getMemref(); - - DataType outputDataType = ttnn::utils::getDataTypeFromMemRef(outputMemref); + DataType outputDataType = outputLayoutAttr.getDataType(); if (op->getUsers().empty()) { return rewriter.notifyMatchFailure( @@ -950,8 +943,7 @@ class ArangeOpConversionPattern : public OpConversionPattern { layoutAttr.getMemLayout()), rewriter.getAttr(layoutAttr.getBufferType()), rewriter.getAttr( - rewriter.getAttr( - 
layoutAttr.getMemref().getShape()))); + rewriter.getAttr(layoutAttr.getShardShape()))); rewriter.replaceOpWithNewOp( op, outputType, adaptor.getStart(), adaptor.getEnd(), adaptor.getStep(), diff --git a/lib/Dialect/TTNN/IR/TTNNOps.cpp b/lib/Dialect/TTNN/IR/TTNNOps.cpp index b3201cf67c..cd2746aadf 100644 --- a/lib/Dialect/TTNN/IR/TTNNOps.cpp +++ b/lib/Dialect/TTNN/IR/TTNNOps.cpp @@ -190,25 +190,12 @@ ::mlir::LogicalResult mlir::tt::ttnn::EmptyOp::verify() { // DataType and Layout // - mlir::MemRefType memref = layoutAttr.getMemref(); - Type elementType = memref.getElementType(); if (getLayout().has_value()) { - ttnn::Layout ttnnLayoutEnum; - if (llvm::isa(elementType)) { - ttnnLayoutEnum = ttnn::Layout::Tile; - } else { - ttnnLayoutEnum = ttnn::Layout::RowMajor; - } + ttnn::Layout ttnnLayoutEnum = layoutAttr.getLayout(); assert(ttnnLayoutEnum == getLayoutAttr().getValue()); } if (getDtype().has_value()) { - tt::DataType dtype; - if (llvm::isa(elementType)) { - auto tileType = mlir::cast(elementType); - dtype = tileType.getDataType(); - } else { - dtype = elementTypeToDataType(elementType); - } + tt::DataType dtype = layoutAttr.getDataType(); assert(dtype == getDtype()); } diff --git a/lib/Dialect/TTNN/IR/TTNNOpsAttrs.cpp b/lib/Dialect/TTNN/IR/TTNNOpsAttrs.cpp index d80815f919..8aaae12618 100644 --- a/lib/Dialect/TTNN/IR/TTNNOpsAttrs.cpp +++ b/lib/Dialect/TTNN/IR/TTNNOpsAttrs.cpp @@ -34,6 +34,11 @@ bool TTNNLayoutAttr::isTiled() const { return ::mlir::isa<::mlir::tt::TileType>(getElementType()); } +// Get layout of the tensor (RowMajor/Tile) +Layout TTNNLayoutAttr::getLayout() const { + return isTiled() ? Layout::Tile : Layout::RowMajor; +} + // Check if the tensor memory layout is sharded bool TTNNLayoutAttr::hasShardedTensorMemoryLayout() const { return (getMemLayout() == TensorMemoryLayout::HeightSharded || @@ -119,19 +124,19 @@ mlir::Type TTNNLayoutAttr::getElementType() const { return getMemref().getElementType(); } -// Extract data type from the memref. Example: -// memref<2x2xf32> -> f32 -// memref<2x2x!tt.tile<32x32xf32>> -> f32 -mlir::tt::DataType TTNNLayoutAttr::getDataTypeFromMemRef() const { +// Get scalar element type. +// Example: memref<2x2xf32> -> f32 +// Example: memref<2x2x!tt.tile<32x32xf32>> -> f32 +// +// return The scalar element type. +mlir::tt::DataType TTNNLayoutAttr::getDataType() const { Type elementType = getElementType(); - DataType dtype = DataType::Float32; - if (llvm::isa(elementType)) { + if (isTiled()) { TileType tileType = mlir::cast(elementType); - dtype = tileType.getDataType(); - } else { - dtype = elementTypeToDataType(elementType); + return tileType.getDataType(); } - return dtype; + + return elementTypeToDataType(elementType); } // Gets the size of shard in bytes @@ -139,10 +144,10 @@ mlir::tt::DataType TTNNLayoutAttr::getDataTypeFromMemRef() const { // This function returns the size of the shard in bytes. // Size is calculated by multiplying shard shape with element size. // -// /return The size of the shard in bytes. +// return The size of the shard in bytes. uint64_t TTNNLayoutAttr::getElementSizeBytes() const { mlir::Type elementType = getElementType(); - if (mlir::isa(elementType)) { + if (isTiled()) { TileType tileType = mlir::cast(elementType); return tileType.getSizeBytes(); } @@ -151,21 +156,31 @@ uint64_t TTNNLayoutAttr::getElementSizeBytes() const { // Get shard shape // -// This function returns the shape of the shard. If element type is TileType -// and convertTileToScalar is true, then the shape is converted to scalar shape. 
-// Example: (convertToScalar = true) memref<2x2x!tt.tile<32x32xf32>> -> {64, 64} -// Example: (convertToScalar = false) memref<2x2x!tt.tile<32x32xf32>> -> {2, 2} -// Example: memref<128x128xf32> -> {128, 128} +// Return the shape of the shard. +// Example: memref<2x2x!tt.tile<32x32xf32>> -> { 2, 2 } +// Example: memref<128x128xf32> -> { 128, 128 } +// Example: memref<2x3!tt.tile<32x32xf32>> -> { 2, 3 } // -// /param convertTileToScalar If true, convert tile shape to scalar shape. -// /return The shape of the shard. -llvm::SmallVector -TTNNLayoutAttr::getShardShape(bool convertTileToScalar) const { +// return The shape of the shard. +llvm::SmallVector TTNNLayoutAttr::getShardShape() const { + return SmallVector(getMemref().getShape()); +} + +// Get scalar shard shape +// +// If the element type is TileType, this function returns the scalar shape of +// the shard. +// Example: memref<2x2x!tt.tile<32x32xf32>> -> { 64, 64 } +// Example: memref<128x128xf32> -> { 128, 128 } +// Example: memref<2x3!tt.tile<32x32xf32>> -> { 64, 96 } +// +// return The scalar shape of the shard. +llvm::SmallVector TTNNLayoutAttr::getScalarShardShape() const { SmallVector shardShape(getMemref().getShape()); - Type elementType = getElementType(); - if (mlir::isa(elementType) && convertTileToScalar) { - return mlir::cast(elementType).getScalarShape(shardShape); + if (isTiled()) { + return mlir::cast(getElementType()).getScalarShape(shardShape); } + return shardShape; } @@ -178,8 +193,8 @@ TTNNLayoutAttr::getShardShape(bool convertTileToScalar) const { // d2) and tile shape (32, 32) The result is (90, 10) which is then divided by // tile shape (32, 32) -> (3, 1) // -// /param tensorShape The shape of the tensor -// /return The size of the tensor in tiles. +// param tensorShape The shape of the tensor +// return The size of the tensor in tiles. llvm::SmallVector TTNNLayoutAttr::getTiledShape(llvm::ArrayRef tensorShape) const { assert(isTiled() && "Expected a tiled layout"); @@ -214,10 +229,9 @@ TTNNLayoutAttr::getTiledShape(llvm::ArrayRef tensorShape) const { // Element size for TileType is tile width * tile height * sizeof(element). // For scalar types, element size is sizeof(element). // -// /return The size of the shard in bytes. +// return The size of the shard in bytes. uint64_t TTNNLayoutAttr::getShardSizeInBytes() const { - MemRefType ty = getMemref(); - ArrayRef shape = ty.getShape(); + SmallVector shape = getShardShape(); uint64_t size = getElementSizeBytes(); return std::accumulate(shape.begin(), shape.end(), size, std::multiplies()); @@ -228,7 +242,7 @@ uint64_t TTNNLayoutAttr::getShardSizeInBytes() const { // This function returns a new identity affine map // with the same number of dimensions as the linear map. // -// /return The new identity affine map. +// return The new identity affine map. mlir::AffineMap TTNNLayoutAttr::getIdentityTileLinearMap() const { assert(isTiled() && "Expected a tiled layout"); @@ -241,12 +255,11 @@ mlir::AffineMap TTNNLayoutAttr::getIdentityTileLinearMap() const { // This function takes a physical memory map and replaces the symbols with the // shard shape // -// /param physicalMemoryMap The physical memory map (d0, d1)[s0, s1] -// /return New memory map with symbols replaced with shard shape. +// param physicalMemoryMap The physical memory map (d0, d1)[s0, s1] +// return New memory map with symbols replaced with shard shape. 
mlir::AffineMap TTNNLayoutAttr::replaceMemoryMapSymbolsWithShardShape( AffineMap physicalMemoryMap) const { - mlir::SmallVector shardShape = - getShardShape(false /*convertTileToScalar*/); + mlir::SmallVector shardShape = getShardShape(); assert(physicalMemoryMap.getNumSymbols() == shardShape.size() && "Physical memory map must have same number of symbols as logical " "shard rank"); @@ -289,11 +302,11 @@ int64_t TTNNLayoutAttr::getTensorSizeInBytes(ArrayRef tensorShape, // This function creates a new TTNNLayoutAttr with the given parameters. // The element type, buffer type and memory layout are preserved. // -// /param context The MLIR context. -// /param tensorShape The shape of the tensor (i.e 6x10x10) -// /param grid The grid where the tensor will be placed (i.e 2x3) -// /param collapseIntervals The intervals to collapse (i.e. {{0, -1}}) -// /return The constructed TTNNLayoutAttr +// param context The MLIR context. +// param tensorShape The shape of the tensor (i.e 6x10x10) +// param grid The grid where the tensor will be placed (i.e 2x3) +// param collapseIntervals The intervals to collapse (i.e. {{0, -1}}) +// return The constructed TTNNLayoutAttr TTNNLayoutAttr TTNNLayoutAttr::withGrid( ::mlir::MLIRContext *context, ArrayRef tensorShape, GridAttr grid, ArrayRef> collapseIntervals) { @@ -307,10 +320,10 @@ TTNNLayoutAttr TTNNLayoutAttr::withGrid( // The shape of the tensor, buffer type, element type and memory layout are // preserved. // -// /param context The MLIR context. -// /param grid The grid where the tensor will be placed. -// /param collapseIntervals The intervals to collapse (i.e. {{0, -1}}) -// /return The constructed TTNNLayoutAttr +// param context The MLIR context. +// param grid The grid where the tensor will be placed. +// param collapseIntervals The intervals to collapse (i.e. {{0, -1}}) +// return The constructed TTNNLayoutAttr TTNNLayoutAttr TTNNLayoutAttr::withGrid( ::mlir::MLIRContext *context, RankedTensorType ty, GridAttr grid, ArrayRef> collapseIntervals) { @@ -324,14 +337,14 @@ TTNNLayoutAttr TTNNLayoutAttr::withGrid( // This function creates a deep copy of the current TTNNLayoutAttr and // replaces the element type with the given one. // -// /param context The MLIR context. -// /param elementType The new element type. -// /return The new TTNNLayoutAttr with the given element type. +// param context The MLIR context. +// param elementType The new element type. +// return The new TTNNLayoutAttr with the given element type. TTNNLayoutAttr TTNNLayoutAttr::withElementType(::mlir::MLIRContext *context, Type elementType) { return TTNNLayoutAttr::get( context, getLinear(), getGrid(), - buildMemRef(context, getShardShape(), + buildMemRef(context, getScalarShardShape(), elementType, getBufferType()), getMemLayout()); } @@ -341,14 +354,14 @@ TTNNLayoutAttr TTNNLayoutAttr::withElementType(::mlir::MLIRContext *context, // This function creates a deep copy of the current TTNNLayoutAttr and // replaces the memory space with the given one. // -// /param context The MLIR context. -// /param memorySpace The new memory space. -// /return The new TTNNLayoutAttr with the given memory space. +// param context The MLIR context. +// param memorySpace The new memory space. +// return The new TTNNLayoutAttr with the given memory space. 
TTNNLayoutAttr TTNNLayoutAttr::withBufferType(::mlir::MLIRContext *context, BufferType memorySpace) { return TTNNLayoutAttr::get( context, getLinear(), getGrid(), - buildMemRef(context, getShardShape(), + buildMemRef(context, getScalarShardShape(), getElementType(), memorySpace), getMemLayout()); } @@ -358,15 +371,15 @@ TTNNLayoutAttr TTNNLayoutAttr::withBufferType(::mlir::MLIRContext *context, // This function creates a deep copy of the current TTNNLayoutAttr and // replaces the memory layout with the given one. // -// /param context The MLIR context. -// /param memLayout The new memory layout. -// /return The new TTNNLayoutAttr with the given memory layout. +// param context The MLIR context. +// param memLayout The new memory layout. +// return The new TTNNLayoutAttr with the given memory layout. TTNNLayoutAttr TTNNLayoutAttr::withMemoryLayout(::mlir::MLIRContext *context, TensorMemoryLayout memLayout) { return TTNNLayoutAttr::get( context, getLinear(), getGrid(), buildMemRef( - context, getShardShape(), getElementType(), getBufferType()), + context, getScalarShardShape(), getElementType(), getBufferType()), memLayout); } @@ -375,9 +388,9 @@ TTNNLayoutAttr TTNNLayoutAttr::withMemoryLayout(::mlir::MLIRContext *context, // This function creates a deep copy of the current TTNNLayoutAttr and // replaces shard shape with the given one. // -// /param context The MLIR context. -// /param shardShape The new shard shape. -// /return The new TTNNLayoutAttr with the given shard shape. +// param context The MLIR context. +// param shardShape The new shard shape. +// return The new TTNNLayoutAttr with the given shard shape. TTNNLayoutAttr TTNNLayoutAttr::withShardShape(::mlir::MLIRContext *context, llvm::SmallVector shardShape) { @@ -392,14 +405,14 @@ TTNNLayoutAttr::withShardShape(::mlir::MLIRContext *context, // // This function constructs a new TTNNLayoutAttr with the given parameters. // -// /param context The MLIR context. -// /param tensorShape The shape of the tensor (i.e 6x10x10) -// /param elementType The type of the element i.e TileType/FloatType/IntegerType -// /param bufferType The type of the buffer -// /param grid The grid where the tensor will be placed (i.e 2x3) -// /param collapseIntervals The intervals to collapse (i.e. {{0, -1}}) -// /param memLayout The memory layout of the tensor -// /return The constructed TTNNLayoutAttr +// param context The MLIR context. +// param tensorShape The shape of the tensor (i.e 6x10x10) +// param elementType The type of the element i.e TileType/FloatType/IntegerType +// param bufferType The type of the buffer +// param grid The grid where the tensor will be placed (i.e 2x3) +// param collapseIntervals The intervals to collapse (i.e. 
{{0, -1}}) +// param memLayout The memory layout of the tensor +// return The constructed TTNNLayoutAttr TTNNLayoutAttr TTNNLayoutAttr::get( ::mlir::MLIRContext *context, ArrayRef tensorShape, Type elementType, BufferType bufferType, GridAttr grid, diff --git a/lib/Dialect/TTNN/Transforms/Optimizer.cpp b/lib/Dialect/TTNN/Transforms/Optimizer.cpp index 05ff417a69..e5d2f86d86 100644 --- a/lib/Dialect/TTNN/Transforms/Optimizer.cpp +++ b/lib/Dialect/TTNN/Transforms/Optimizer.cpp @@ -276,7 +276,7 @@ class TTNNOptimizer : public impl::TTNNOptimizerBase { EmptyOp emptyOp = mlir::cast(op->getOperands().back().getDefiningOp()); - emptyOp.setDtype(layoutAttr.getDataTypeFromMemRef()); + emptyOp.setDtype(layoutAttr.getDataType()); if (layoutAttr.isTiled()) { emptyOp.setLayout(ttnn::Layout::Tile); } else { @@ -449,16 +449,17 @@ class TTNNOptimizer : public impl::TTNNOptimizerBase { BufferType outputBufferType = consumerOpOutputLayout.getBufferType(); TensorMemoryLayout outputTensorMemoryLayout = consumerOpOutputLayout.getMemLayout(); - MemRefType outputMemref = consumerOpOutputLayout.getMemref(); + llvm::SmallVector shardShape = + consumerOpOutputLayout.getShardShape(); MemoryConfigAttr outputMemConfigAttr = MemoryConfigAttr::get( consumerOp->getContext(), TensorMemoryLayoutAttr::get(consumerOp->getContext(), outputTensorMemoryLayout), BufferTypeAttr::get(consumerOp->getContext(), outputBufferType), - ShardSpecAttr::get(consumerOp->getContext(), - ShapeAttr::get(consumerOp->getContext(), - outputMemref.getShape()))); + ShardSpecAttr::get( + consumerOp->getContext(), + ShapeAttr::get(consumerOp->getContext(), shardShape))); // If producerOp is a toLayoutOp, adjust its output layout(update // inplace) to reflect consumerOp's output layout. If producerOp is not a @@ -472,10 +473,9 @@ class TTNNOptimizer : public impl::TTNNOptimizerBase { } else { OpBuilder builder(consumerOp); - DataTypeAttr outputDataType = - DataTypeAttr::get(consumerOp->getContext(), - utils::getDataTypeFromMemRef(outputMemref)); - Layout outputLayoutEnum = utils::getLayoutFromMemRef(outputMemref); + DataTypeAttr outputDataType = DataTypeAttr::get( + consumerOp->getContext(), consumerOpOutputLayout.getDataType()); + Layout outputLayoutEnum = consumerOpOutputLayout.getLayout(); LayoutAttr outputLayout = LayoutAttr::get(consumerOp->getContext(), outputLayoutEnum); Operation *memoryReconfigOp = builder.create( diff --git a/lib/Dialect/TTNN/Transforms/Passes.cpp b/lib/Dialect/TTNN/Transforms/Passes.cpp index 79bfeb4049..e22540a7da 100644 --- a/lib/Dialect/TTNN/Transforms/Passes.cpp +++ b/lib/Dialect/TTNN/Transforms/Passes.cpp @@ -198,24 +198,12 @@ class TTNNDecomposeLayouts } }; - ttnn::Layout getLayoutFromMemRef(mlir::MemRefType memref) const { - ttnn::Layout ttnnLayoutEnum = ttnn::Layout::RowMajor; - Type elementType = memref.getElementType(); - if (llvm::isa(elementType)) { - ttnnLayoutEnum = ttnn::Layout::Tile; - } else { - ttnnLayoutEnum = ttnn::Layout::RowMajor; - } - return ttnnLayoutEnum; - } - std::pair getInputOutputLayouts(ttnn::ToLayoutOp op) const { LayoutInfo input, output; auto inputLayoutAttr = mlir::cast(op.getInput().getType().getEncoding()); - auto inputMemref = inputLayoutAttr.getMemref(); assert(op.getMemoryConfig().has_value()); MemoryConfigAttr outputMemoryConfig = op.getMemoryConfig().value(); @@ -223,10 +211,10 @@ class TTNNDecomposeLayouts input.bufferType = inputLayoutAttr.getBufferType(); output.bufferType = outputMemoryConfig.getBufferType().getValue(); - input.layoutEnum = getLayoutFromMemRef(inputMemref); + 
input.layoutEnum = inputLayoutAttr.getLayout(); output.layoutEnum = op.getLayout(); - input.dataType = ttnn::utils::getDataTypeFromMemRef(inputMemref); + input.dataType = inputLayoutAttr.getDataType(); assert(op.getDtype().has_value()); output.dataType = op.getDtype().value(); @@ -234,7 +222,7 @@ class TTNNDecomposeLayouts output.tensorMemoryLayout = outputMemoryConfig.getTensorMemoryLayout().getValue(); - input.shardShape = inputMemref.getShape(); + input.shardShape = inputLayoutAttr.getShardShape(); output.shardShape = outputMemoryConfig.getShardShapeArray(); return {input, output}; } From f3f3f0ec33bc5a3527ebf321e2899364d0932cce Mon Sep 17 00:00:00 2001 From: Nick Smith <127986401+nsmithtt@users.noreply.github.com> Date: Wed, 27 Nov 2024 05:28:02 -0800 Subject: [PATCH 27/84] Rename tt.Layout to tt.MetalLayout (#1386) --- docs/src/dialects-overview.md | 2 +- docs/src/specs/device.md | 6 +- docs/src/specs/tensor-layout.md | 32 +++---- include/ttmlir-c/TTAttrs.h | 6 +- include/ttmlir/Dialect/TT/IR/TTOpsTypes.td | 30 +++---- include/ttmlir/Dialect/TTIR/IR/TTIROps.td | 4 +- .../ttmlir/Target/Utils/MLIRToFlatbuffer.h | 11 +-- lib/CAPI/TTAttrs.cpp | 16 ++-- .../TTIRToTTMetal/TTIRToTTMetal.cpp | 20 ++--- lib/Dialect/TT/IR/TTDialect.cpp | 6 +- lib/Dialect/TT/IR/TTOpsTypes.cpp | 89 ++++++++++--------- lib/Dialect/TTIR/IR/TTIROps.cpp | 6 +- lib/Dialect/TTIR/Transforms/Allocate.cpp | 4 +- lib/Dialect/TTIR/Transforms/Generic.cpp | 12 +-- lib/Dialect/TTIR/Transforms/Layout.cpp | 45 +++++----- lib/Dialect/TTMetal/IR/TTMetalOps.cpp | 8 +- lib/Target/TTMetal/TTMetalToFlatbuffer.cpp | 20 ++--- python/TTModule.cpp | 61 +++++++------ test/python/tensor_layout.py | 26 +++--- .../Dialect/TTIR/split_compound_layout.mlir | 28 +++--- test/ttmlir/Dialect/TTIR/test_allocate.mlir | 2 +- .../ttmlir/Silicon/TTMetal/simple_reduce.mlir | 8 +- .../ttmlir/Silicon/TTMetal/tiled_reblock.mlir | 24 ++--- test/ttmlir/Silicon/TTMetal/to_layout.mlir | 18 ++-- test/ttmlir/Silicon/TTNN/perf_unit/mnist.mlir | 4 +- .../tt_adapter/src/tt_adapter/mlir.py | 2 +- 26 files changed, 252 insertions(+), 238 deletions(-) diff --git a/docs/src/dialects-overview.md b/docs/src/dialects-overview.md index e886fb90c1..0dbf5fbed1 100644 --- a/docs/src/dialects-overview.md +++ b/docs/src/dialects-overview.md @@ -3,7 +3,7 @@ Here is a brief overview of the dialects in the project, please refer to the individual dialect documentation for more details.: -- `tt`: Common types such as, `tt.tile`, `tt.layout`, `tt.grid`, etc. and enums such as, data formats, memory spaces, iterator types etc. +- `tt`: Common types such as, `tt.tile`, `tt.metal_layout`, `tt.grid`, etc. and enums such as, data formats, memory spaces, iterator types etc. - `ttir`: A high level dialect that models the tensor compute graph on tenstorrent devices. Accepts `tosa` and `linalg` input. - `ttir.generic`: Generically describe compute work. - `ttir.to_layout`: Convert between different tensor memory layouts and transfer between different memory spaces. 
diff --git a/docs/src/specs/device.md b/docs/src/specs/device.md index ae72fe638c..64bc91cfa9 100644 --- a/docs/src/specs/device.md +++ b/docs/src/specs/device.md @@ -135,7 +135,7 @@ the logical device grid: ```mlir tensor<16x3x64x128xf32, - #tt.layout<(d0, d1, d2, d3) -> (d0, d1 * 64 + d2, d3), + #tt.metal_layout<(d0, d1, d2, d3) -> (d0, d1 * 64 + d2, d3), undef, <2x2x4>, memref<8x3x1x!tt.tile<32 x 32, bfp_bf8>, #tt.memory_space> @@ -170,7 +170,7 @@ the logical device grid: ```mlir tensor<256x1024xf32, - #tt.layout<(d0, d1) -> (d0, d1), + #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <4x16>, memref<2x2x!tt.tile<32 x 32, bfp_bf8>, #tt.memory_space> @@ -205,7 +205,7 @@ We can consider the following tensor to map onto this grid: ```mlir tensor<64x256x1024xf32, - #tt.layout<(d0, d1) -> (d0, d1), + #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <2x4x16>, memref<32x2x2x!tt.tile<32 x 32, bfp_bf8>, #tt.memory_space> diff --git a/docs/src/specs/tensor-layout.md b/docs/src/specs/tensor-layout.md index d523f51ed2..52c6931895 100644 --- a/docs/src/specs/tensor-layout.md +++ b/docs/src/specs/tensor-layout.md @@ -33,7 +33,7 @@ been used by the TT dialect to encode the tensor's layout. This looks like: ```mlir tensor<2x3x64x128xf32, - #tt.layout< + #tt.metal_layout< (d0, d1, d2, d3) -> (d0 * 192 + d1 * 64 + d2, d3), undef, <1x1>, @@ -76,7 +76,7 @@ topics: ### Dimension Collapsing -Probably the most important concept in `tt.layout` is dimension collapsing. +Probably the most important concept in `tt.metal_layout` is dimension collapsing. This is captured by the affine map `linear` property which provides a mapping from tensor dim space to a reduced physical dimensional space. This single-handedly touches on most of the tensor layout goals mentioned at the @@ -106,7 +106,7 @@ to get our remapped offset: This remapped offset `(262, 100)` corresponds to the row and column index of the collapsed physical memory. -By default, the dim range `[0, -1)` is collapsed, but the `tt.layout` contructor +By default, the dim range `[0, -1)` is collapsed, but the `tt.metal_layout` contructor can actually take a programmable range called `collapseIntervals`. `collapseIntervals` is a list of pairs, where each pair is a dim range interval, left inclusive, right exclusive. Let's consider a few examples: @@ -137,7 +137,7 @@ Let's consider the original example again, but on a larger grid than `1x1`, say ```mlir tensor<2x3x64x128xf32, - #tt.layout< + #tt.metal_layout< (d0, d1, d2, d3) -> (d0 * 192 + d1 * 64 + d2, d3), undef, <2x4>, @@ -173,7 +173,7 @@ Here's a few more example mlir snippets: ```mlir tensor<8x300xf32, - #tt.layout<(d0, d1) -> (d0, d1), + #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <1x2>, memref<8x150xf32, #tt.memory_space> @@ -181,7 +181,7 @@ tensor<8x300xf32, > tensor<8x96x32xf32, - #tt.layout<(d0, d1, d2) -> (d0 * 96 + d1, d2), + #tt.metal_layout<(d0, d1, d2) -> (d0 * 96 + d1, d2), undef, <2x1>, memref<384x32xf32, #tt.memory_space> @@ -189,7 +189,7 @@ tensor<8x96x32xf32, > tensor<8x96x32xf32, - #tt.layout<(d0, d1, d2) -> (d0 * 96 + d1, d1, d2), + #tt.metal_layout<(d0, d1, d2) -> (d0 * 96 + d1, d1, d2), undef, <2x1x2>, memref<384x96x16xf32, #tt.memory_space> @@ -197,7 +197,7 @@ tensor<8x96x32xf32, > tensor<5x3x2x2x7x32x32xf32, - #tt.layout< + #tt.metal_layout< (d0, d1, d2, d3, d4, d5, d6) -> (d0 * 2688 + d1 * 896 + d2 * 448 + d3 * 224 + d4 * 32 + d5, d4, d5, d6), undef, @@ -226,7 +226,7 @@ A tilized tensor is one with a memref that has a tile element type. 
Given some tensor with scalar layout: ```mlir tensor<3x64x128xf32, - #tt.layout< + #tt.metal_layout< (d0, d1, d2) -> (d0 * 64 + d1, d2), undef, <3x2>, @@ -238,7 +238,7 @@ tensor<3x64x128xf32, After tilizing we'll have: ```mlir tensor<3x64x128xf32, - #tt.layout< + #tt.metal_layout< (d0, d1, d2) -> (d0 * 64 + d1, d2), undef, <3x2>, @@ -256,7 +256,7 @@ intact. Padding can be a bit of an overloaded term, but in this context it refers to an out of bounds area in the physical memory allocation that has no real tensor data in it. The contents of this area is tracked by `oob_val` and the padding -area can be automatically derived from the attributes of `tt.layout`. +area can be automatically derived from the attributes of `tt.metal_layout`. Padding is a necessary evil that arises when a tensor is not evenly divisible by a grid shape or tile shape. It can also arise due to minimum Noc addressing @@ -265,7 +265,7 @@ requirements. Example of non-divisible grid: ```mlir tensor<53x63xf32, - #tt.layout< + #tt.metal_layout< (d0, d1) -> (d0, d1), undef, <3x2>, @@ -284,7 +284,7 @@ cores and 1 scalar column of padding on the last column of cores. Taking the above example a step further, we could tilize it: ```mlir tensor<53x63xf32, - #tt.layout< + #tt.metal_layout< (d0, d1) -> (d0, d1), undef, <3x2>, @@ -308,7 +308,7 @@ stride between dimensions. Consider tensor (w/ batch dim `2`): ```mlir tensor<2x8x32xf32, - #tt.layout< + #tt.metal_layout< (d0, d1, d2) -> (d0 * 8 + d1, d2), undef, <1x2>, @@ -356,7 +356,7 @@ consider the following example with a 3d grid and `collapseIntervals=[(1, -1)]`. ```mlir tensor<2x3x64x128xf32, - #tt.layout<(d0, d1, d2, d3) -> (d0, d1 * 64 + d2, d3), + #tt.metal_layout<(d0, d1, d2, d3) -> (d0, d1 * 64 + d2, d3), undef, <2x2x4>, memref<1x3x1x!tt.tile<32 x 32, bfp_bf8>, #tt.memory_space> @@ -387,7 +387,7 @@ under the same grid primitive that also divides tensor rows and columns. ## Concerns -- `tt.layout` is deliberately flexible and tries to capture as many problematic +- `tt.metal_layout` is deliberately flexible and tries to capture as many problematic use-cases we've ran into in the past in a single, succinct representation. This flexibility will need to be further constrained by backends to avoid unsupported programming of this attribute. 
diff --git a/include/ttmlir-c/TTAttrs.h b/include/ttmlir-c/TTAttrs.h index 2e164ac132..263cd1d8e4 100644 --- a/include/ttmlir-c/TTAttrs.h +++ b/include/ttmlir-c/TTAttrs.h @@ -50,9 +50,9 @@ MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTSystemDescAttrGet( size_t chipCoordsSize, MlirAttribute *chipChannels, size_t chipChannelsSize); -MLIR_CAPI_EXPORTED MlirAttribute -ttmlirTTLayoutAttrGet(MlirContext ctx, MlirAffineMap linear, unsigned oobVal, - MlirAttribute grid, MlirType memref, unsigned memLayout); +MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTMetalLayoutAttrGet( + MlirContext ctx, MlirAffineMap linear, unsigned oobVal, MlirAttribute grid, + MlirType memref, unsigned memLayout); MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTMemorySpaceAttrGet(MlirContext ctx, uint32_t memorySpace); diff --git a/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td b/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td index 99caac0c2a..d5dc22e28d 100644 --- a/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td +++ b/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td @@ -214,7 +214,7 @@ def TT_SystemDescAttr : TT_Attr<"SystemDesc", "system_desc"> { }]; } -def TT_LayoutAttr : TT_Attr<"Layout", "layout"> { +def TT_MetalLayoutAttr : TT_Attr<"MetalLayout", "metal_layout"> { let summary = "Tensor layout attribute"; let description = [{ The tensor layout attribute captures how tensor data is sharded across a grid of devices, cores, and @@ -241,7 +241,7 @@ def TT_LayoutAttr : TT_Attr<"Layout", "layout"> { Examples: ```mlir tensor<8x300xf32, - #tt.layout<(d0, d1) -> (d0, d1), + #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <1x2>, memref<8x150xf32, #tt.memory_space> @@ -249,7 +249,7 @@ def TT_LayoutAttr : TT_Attr<"Layout", "layout"> { > tensor<8x96x32xf32, - #tt.layout<(d0, d1, d2) -> (d0 * 96 + d1, d2), + #tt.metal_layout<(d0, d1, d2) -> (d0 * 96 + d1, d2), undef, <2x1>, memref<384x32xf32, #tt.memory_space> @@ -257,7 +257,7 @@ def TT_LayoutAttr : TT_Attr<"Layout", "layout"> { > tensor<8x96x32xf32, - #tt.layout<(d0, d1, d2) -> (d0 * 96 + d1, d1, d2), + #tt.metal_layout<(d0, d1, d2) -> (d0 * 96 + d1, d1, d2), undef, <2x1x2>, memref<384x96x16xf32, #tt.memory_space> @@ -265,7 +265,7 @@ def TT_LayoutAttr : TT_Attr<"Layout", "layout"> { > tensor<5x3x2x2x7x32x32xf32, - #tt.layout< + #tt.metal_layout< (d0, d1, d2, d3, d4, d5, d6) -> (d0 * 2688 + d1 * 896 + d2 * 448 + d3 * 224 + d4 * 32 + d5, d4, d5, d6), undef, @@ -284,7 +284,7 @@ def TT_LayoutAttr : TT_Attr<"Layout", "layout"> { let assemblyFormat = "`<` $linear`,` $oob_val`,` $grid`,` $memref (`,` $mem_layout^)? 
`>`"; let extraClassDeclaration = [{ - static LayoutAttr get(::mlir::MLIRContext *context, + static MetalLayoutAttr get(::mlir::MLIRContext *context, ArrayRef tensorShape, Type elementType, MemorySpace memorySpace = MemorySpace::System, @@ -292,28 +292,28 @@ def TT_LayoutAttr : TT_Attr<"Layout", "layout"> { ArrayRef> collapseIntervals = {{0, -1}}, OOBVal oobVal = OOBVal::Undef, TensorMemoryLayout memLayout = TensorMemoryLayout::None); - static LayoutAttr get(::mlir::MLIRContext *context, + static MetalLayoutAttr get(::mlir::MLIRContext *context, RankedTensorType ty, MemorySpace memorySpace = MemorySpace::System, GridAttr grid = {}, ArrayRef> collapseIntervals = {{0, -1}}, OOBVal oobVal = OOBVal::Undef, TensorMemoryLayout memLayout = TensorMemoryLayout::None); - static LayoutAttr get(::mlir::MLIRContext *context, + static MetalLayoutAttr get(::mlir::MLIRContext *context, RankedTensorType ty, MemorySpace memorySpace, GridAttr grid, Type elementType, TensorMemoryLayout memLayout = TensorMemoryLayout::None); - LayoutAttr withGrid(::mlir::MLIRContext *context, ArrayRef tensorShape, GridAttr grid, ArrayRef> collapseIntervals = {{0, -1}}); - LayoutAttr withGrid(::mlir::MLIRContext *context, + MetalLayoutAttr withGrid(::mlir::MLIRContext *context, ArrayRef tensorShape, GridAttr grid, ArrayRef> collapseIntervals = {{0, -1}}); + MetalLayoutAttr withGrid(::mlir::MLIRContext *context, RankedTensorType ty, GridAttr grid, ArrayRef> collapseIntervals = {{0, -1}}); - LayoutAttr withElementType(::mlir::MLIRContext *context, Type elementType); - LayoutAttr withMemorySpace(::mlir::MLIRContext *context, MemorySpace memorySpace); - LayoutAttr withMemoryLayout(::mlir::MLIRContext *context, TensorMemoryLayout memLayout); - LayoutAttr withShardShape(::mlir::MLIRContext *context, llvm::SmallVector shardShape); + MetalLayoutAttr withElementType(::mlir::MLIRContext *context, Type elementType); + MetalLayoutAttr withMemorySpace(::mlir::MLIRContext *context, MemorySpace memorySpace); + MetalLayoutAttr withMemoryLayout(::mlir::MLIRContext *context, TensorMemoryLayout memLayout); + MetalLayoutAttr withShardShape(::mlir::MLIRContext *context, llvm::SmallVector shardShape); uint64_t getMemrefSizeBytes() const; MemorySpace getMemorySpace() const; @@ -400,7 +400,7 @@ def TT_DeviceAttr : TT_Attr<"Device", "device", []> { // - DeviceL1: This ends up being exactly the shard size // - DeviceDRAM: Is more nuanced because the whole tensor size gets paged and interleaved between all dram channels, // due to paging and rounding the footprint ends up being close to: the_whole_tensor / num_dram_channels - uint64_t getLayoutSizeBytes(ArrayRef tensorShape, LayoutAttr layout, MemorySpace memorySpace) const; + uint64_t getLayoutSizeBytes(ArrayRef tensorShape, MetalLayoutAttr layout, MemorySpace memorySpace) const; // Returns the footprint size in bytes of the tensor distributed across the given memory space. // Forwards to getLayoutSizeBytes, see comment there for more info. 
diff --git a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td index cbb5e5ab8d..f5e284078a 100644 --- a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td +++ b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td @@ -114,8 +114,8 @@ def TTIR_ToLayoutOp : TTIR_Op<"to_layout", [DestinationStyleOpInterface, TTIROpI - Some combination of the above ```llvm - #layout = #tt.layout<8192x128x1, undef, <1x1>, memref<64x128xf32, #system>> - #layout1 = #tt.layout<8192x128x1, undef, <1x1>, memref<64x128xf32, #l1_>> + #layout = #tt.metal_layout<8192x128x1, undef, <1x1>, memref<64x128xf32, #system>> + #layout1 = #tt.metal_layout<8192x128x1, undef, <1x1>, memref<64x128xf32, #l1_>> %1 = "ttir.to_layout"(%arg0, %0) : (tensor<64x128xf32, #layout>, tensor<64x128xf32, #layout1>) -> tensor<64x128xf32, #layout1> ``` }]; diff --git a/include/ttmlir/Target/Utils/MLIRToFlatbuffer.h b/include/ttmlir/Target/Utils/MLIRToFlatbuffer.h index ac23b9bb0d..d5be2bb97c 100644 --- a/include/ttmlir/Target/Utils/MLIRToFlatbuffer.h +++ b/include/ttmlir/Target/Utils/MLIRToFlatbuffer.h @@ -18,8 +18,9 @@ namespace mlir::tt { flatbuffers::Offset<::tt::target::LayoutDesc> -layoutAttrToFlatbuffer(FlatbufferObjectCache &cache, LayoutAttr attr, - ArrayRef logicalShape, DeviceAttr deviceAttr); +metalLayoutAttrToFlatbuffer(FlatbufferObjectCache &cache, MetalLayoutAttr attr, + ArrayRef logicalShape, + DeviceAttr deviceAttr); flatbuffers::Offset<::tt::target::LayoutDesc> ttnnLayoutAttrToFlatbuffer( FlatbufferObjectCache &cache, ttnn::TTNNLayoutAttr attr, @@ -438,9 +439,9 @@ toFlatbuffer(FlatbufferObjectCache &cache, ElementsAttr elementsAttr) { inline flatbuffers::Offset<::tt::target::LayoutDesc> encodingToFlatbuffer(FlatbufferObjectCache &cache, Attribute attr, ArrayRef logicalShape, DeviceAttr deviceAttr) { - if (isa(attr)) { - return layoutAttrToFlatbuffer(cache, cast(attr), logicalShape, - deviceAttr); + if (isa(attr)) { + return metalLayoutAttrToFlatbuffer(cache, cast(attr), + logicalShape, deviceAttr); } assert(isa(attr) && "unsupported layout attr"); diff --git a/lib/CAPI/TTAttrs.cpp b/lib/CAPI/TTAttrs.cpp index 196dc09f47..c329f41d56 100644 --- a/lib/CAPI/TTAttrs.cpp +++ b/lib/CAPI/TTAttrs.cpp @@ -119,15 +119,15 @@ MlirAttribute ttmlirTTSystemDescAttrGet( chipCapabilitiesUnwrapped, chipCoordsUnwrapped, chipChannelsUnwrapped)); } -MlirAttribute ttmlirTTLayoutAttrGet(MlirContext ctx, MlirAffineMap linear, - unsigned oobVal, MlirAttribute grid, - MlirType memref, unsigned memLayout) { +MlirAttribute ttmlirTTMetalLayoutAttrGet(MlirContext ctx, MlirAffineMap linear, + unsigned oobVal, MlirAttribute grid, + MlirType memref, unsigned memLayout) { mlir::AffineMap affineMap = mlir::AffineMap::getFromOpaquePointer(linear.ptr); - return wrap(LayoutAttr::get(unwrap(ctx), affineMap, - static_cast(oobVal), - mlir::cast(unwrap(grid)), - mlir::cast(unwrap(memref)), - static_cast(memLayout))); + return wrap(MetalLayoutAttr::get(unwrap(ctx), affineMap, + static_cast(oobVal), + mlir::cast(unwrap(grid)), + mlir::cast(unwrap(memref)), + static_cast(memLayout))); } MlirAttribute ttmlirTTMemorySpaceAttrGet(MlirContext ctx, diff --git a/lib/Conversion/TTIRToTTMetal/TTIRToTTMetal.cpp b/lib/Conversion/TTIRToTTMetal/TTIRToTTMetal.cpp index a3bbddc1da..60c0328197 100644 --- a/lib/Conversion/TTIRToTTMetal/TTIRToTTMetal.cpp +++ b/lib/Conversion/TTIRToTTMetal/TTIRToTTMetal.cpp @@ -199,8 +199,8 @@ class TTIRToTTMetalLayoutRewriter : public OpRewritePattern { LogicalResult relayout(ttir::ToLayoutOp op, PatternRewriter &rewriter) const { auto 
inputTy = mlir::cast(op.getInput().getType()); auto outputTy = mlir::cast(op.getType()); - auto inputLayout = mlir::cast(inputTy.getEncoding()); - auto outputLayout = mlir::cast(outputTy.getEncoding()); + auto inputLayout = mlir::cast(inputTy.getEncoding()); + auto outputLayout = mlir::cast(outputTy.getEncoding()); tt::DeviceAttr device = op.getDevice(); assert(device); tt::SystemDescAttr systemDesc = op.getSystemDesc(); @@ -342,8 +342,8 @@ class TTIRToTTMetalLayoutRewriter : public OpRewritePattern { LogicalResult reformat(ttir::ToLayoutOp op, PatternRewriter &rewriter) const { auto inputTy = mlir::cast(op.getInput().getType()); auto outputTy = mlir::cast(op.getType()); - auto inputLayout = mlir::cast(inputTy.getEncoding()); - auto outputLayout = mlir::cast(outputTy.getEncoding()); + auto inputLayout = mlir::cast(inputTy.getEncoding()); + auto outputLayout = mlir::cast(outputTy.getEncoding()); bool shouldTilize = not inputLayout.isTiled() && outputLayout.isTiled(); bool shouldUntilize = inputLayout.isTiled() && not outputLayout.isTiled(); assert(shouldTilize ^ shouldUntilize); @@ -448,10 +448,10 @@ class TTIRToTTMetalLayoutRewriter : public OpRewritePattern { return failure(); } assert(inputTy.getShape() == outputTy.getShape()); - assert(mlir::isa(inputTy.getEncoding())); - assert(mlir::isa(outputTy.getEncoding())); - auto inputLayout = mlir::cast(inputTy.getEncoding()); - auto outputLayout = mlir::cast(outputTy.getEncoding()); + assert(mlir::isa(inputTy.getEncoding())); + assert(mlir::isa(outputTy.getEncoding())); + auto inputLayout = mlir::cast(inputTy.getEncoding()); + auto outputLayout = mlir::cast(outputTy.getEncoding()); auto components = op.compoundComponents(); bool isCompound = (static_cast(components.isLayoutChange) + @@ -1308,10 +1308,10 @@ class TTIRToTTMetalDispatchRewriter : public OpRewritePattern { SmallVector> calculateDataMovement(ArrayAttr iteratorTypes, const RankedTensorType &src, const RankedTensorType &dst, DeviceAttr device) const { - auto srcLayout = mlir::cast(src.getEncoding()); + auto srcLayout = mlir::cast(src.getEncoding()); assert(srcLayout.isTiled()); - auto dstLayout = mlir::cast(dst.getEncoding()); + auto dstLayout = mlir::cast(dst.getEncoding()); assert(dstLayout.isTiled()); assert(iteratorTypes.size() >= 2 && "Expected at least 2 iterator types"); diff --git a/lib/Dialect/TT/IR/TTDialect.cpp b/lib/Dialect/TT/IR/TTDialect.cpp index 6f629d6977..1ac8a22239 100644 --- a/lib/Dialect/TT/IR/TTDialect.cpp +++ b/lib/Dialect/TT/IR/TTDialect.cpp @@ -13,13 +13,13 @@ using namespace mlir; using namespace mlir::tt; -// This is needed to hoist tt.layout attributes as named attributes declared at -// the module level. +// This is needed to hoist tt.metal_layout attributes as named attributes +// declared at the module level. 
struct TTOpAsmDialectInterface : public OpAsmDialectInterface { using OpAsmDialectInterface::OpAsmDialectInterface; AliasResult getAlias(Attribute attr, raw_ostream &os) const override { - if (llvm::isa(attr)) { + if (llvm::isa(attr)) { os << "layout"; return AliasResult::OverridableAlias; } diff --git a/lib/Dialect/TT/IR/TTOpsTypes.cpp b/lib/Dialect/TT/IR/TTOpsTypes.cpp index bbdd4e2590..12166e4433 100644 --- a/lib/Dialect/TT/IR/TTOpsTypes.cpp +++ b/lib/Dialect/TT/IR/TTOpsTypes.cpp @@ -466,7 +466,7 @@ calculateLogicalShardShape(mlir::ArrayRef tensorShape, return shardShape; } -LayoutAttr LayoutAttr::get( +MetalLayoutAttr MetalLayoutAttr::get( ::mlir::MLIRContext *context, ArrayRef tensorShape, Type elementType, MemorySpace memorySpace, GridAttr grid, ArrayRef> collapseIntervals, @@ -483,7 +483,7 @@ LayoutAttr LayoutAttr::get( return get(context, linear, oobVal, grid, memref, memLayout); } -LayoutAttr LayoutAttr::get( +MetalLayoutAttr MetalLayoutAttr::get( ::mlir::MLIRContext *context, RankedTensorType ty, MemorySpace memorySpace, GridAttr grid, ArrayRef> collapseIntervals, @@ -493,9 +493,11 @@ LayoutAttr LayoutAttr::get( collapseIntervals, oobVal, memLayout); } -LayoutAttr LayoutAttr::get(::mlir::MLIRContext *context, RankedTensorType ty, - MemorySpace memorySpace, GridAttr grid, - Type elementType, TensorMemoryLayout memLayout) { +MetalLayoutAttr MetalLayoutAttr::get(::mlir::MLIRContext *context, + RankedTensorType ty, + MemorySpace memorySpace, GridAttr grid, + Type elementType, + TensorMemoryLayout memLayout) { assert(ty); assert(grid); return get(context, ty.getShape(), elementType, memorySpace, grid, {{0, -1}}, @@ -506,7 +508,7 @@ LayoutAttr LayoutAttr::get(::mlir::MLIRContext *context, RankedTensorType ty, // compute the physical shape of the tensor, i.e the shape of the tensor // after the dimensions have been collapsed onto a grid. 
llvm::SmallVector -LayoutAttr::getPhysicalShape(ArrayRef logicalShape) const { +MetalLayoutAttr::getPhysicalShape(ArrayRef logicalShape) const { llvm::SmallVector physicalShape(getGrid().getShape().size()); SmallVector logicalShapeExprs( llvm::map_range(logicalShape, [context = getContext()](std::int64_t e) { @@ -525,7 +527,7 @@ LayoutAttr::getPhysicalShape(ArrayRef logicalShape) const { } llvm::SmallVector -LayoutAttr::getStride(ArrayRef logicalShape) const { +MetalLayoutAttr::getStride(ArrayRef logicalShape) const { llvm::SmallVector stride(logicalShape.size()); @@ -574,7 +576,7 @@ LayoutAttr::getStride(ArrayRef logicalShape) const { } llvm::SmallVector -LayoutAttr::getShardShape(bool convertTileToScalar) const { +MetalLayoutAttr::getShardShape(bool convertTileToScalar) const { SmallVector shardShape(getMemref().getShape()); auto elementType = getElementType(); if (mlir::isa(elementType) && convertTileToScalar) { @@ -583,11 +585,11 @@ LayoutAttr::getShardShape(bool convertTileToScalar) const { return shardShape; } -mlir::Type LayoutAttr::getElementType() const { +mlir::Type MetalLayoutAttr::getElementType() const { return getMemref().getElementType(); } -mlir::Type LayoutAttr::getScalarElementType() const { +mlir::Type MetalLayoutAttr::getScalarElementType() const { auto elementType = getElementType(); if (mlir::isa(elementType)) { return mlir::cast(elementType).getElementType(); @@ -595,33 +597,33 @@ mlir::Type LayoutAttr::getScalarElementType() const { return elementType; } -bool LayoutAttr::hasShardedTensorMemoryLayout() const { +bool MetalLayoutAttr::hasShardedTensorMemoryLayout() const { return (getMemLayout() == TensorMemoryLayout::HeightSharded or getMemLayout() == TensorMemoryLayout::WidthSharded or getMemLayout() == TensorMemoryLayout::BlockSharded); } -bool LayoutAttr::hasInterleavedTensorMemoryLayout() const { +bool MetalLayoutAttr::hasInterleavedTensorMemoryLayout() const { return (getMemLayout() == TensorMemoryLayout::Interleaved); } -bool LayoutAttr::hasShardedL1TensorMemoryLayout() const { +bool MetalLayoutAttr::hasShardedL1TensorMemoryLayout() const { return ::mlir::tt::isL1MemorySpace(getMemorySpace()) and (getMemLayout() == TensorMemoryLayout::HeightSharded or getMemLayout() == TensorMemoryLayout::WidthSharded or getMemLayout() == TensorMemoryLayout::BlockSharded); } -bool LayoutAttr::hasInterleavedL1TensorMemoryLayout() const { +bool MetalLayoutAttr::hasInterleavedL1TensorMemoryLayout() const { return ::mlir::tt::isL1MemorySpace(getMemorySpace()) and (getMemLayout() == TensorMemoryLayout::Interleaved); } -bool LayoutAttr::isTiled() const { +bool MetalLayoutAttr::isTiled() const { return ::mlir::isa<::mlir::tt::TileType>(getElementType()); } -uint64_t LayoutAttr::getElementSizeBytes() const { +uint64_t MetalLayoutAttr::getElementSizeBytes() const { mlir::Type elementType = getElementType(); if (mlir::isa(elementType)) { auto tileType = mlir::cast(elementType); @@ -630,7 +632,7 @@ uint64_t LayoutAttr::getElementSizeBytes() const { return elementType.getIntOrFloatBitWidth() / 8; } -uint64_t LayoutAttr::getMemrefSizeBytes() const { +uint64_t MetalLayoutAttr::getMemrefSizeBytes() const { MemRefType ty = getMemref(); auto shape = ty.getShape(); uint64_t size = getElementSizeBytes(); @@ -638,57 +640,60 @@ uint64_t LayoutAttr::getMemrefSizeBytes() const { std::multiplies()); } -LayoutAttr LayoutAttr::withGrid( +MetalLayoutAttr MetalLayoutAttr::withGrid( ::mlir::MLIRContext *context, ArrayRef tensorShape, GridAttr grid, ArrayRef> collapseIntervals) { return get(context, 
tensorShape, getElementType(), getMemorySpace(), grid, collapseIntervals, getOobVal(), getMemLayout()); } -LayoutAttr LayoutAttr::withGrid( +MetalLayoutAttr MetalLayoutAttr::withGrid( ::mlir::MLIRContext *context, RankedTensorType ty, GridAttr grid, ArrayRef> collapseIntervals) { assert(ty); - return LayoutAttr::withGrid(context, ty.getShape(), grid, collapseIntervals); + return MetalLayoutAttr::withGrid(context, ty.getShape(), grid, + collapseIntervals); } -LayoutAttr LayoutAttr::withElementType(::mlir::MLIRContext *context, - Type elementType) { - return LayoutAttr::get( +MetalLayoutAttr MetalLayoutAttr::withElementType(::mlir::MLIRContext *context, + Type elementType) { + return MetalLayoutAttr::get( context, getLinear(), getOobVal(), getGrid(), buildMemRef(context, getShardShape(), elementType, getMemorySpace()), getMemLayout()); } -LayoutAttr LayoutAttr::withMemorySpace(::mlir::MLIRContext *context, - MemorySpace memorySpace) { - return LayoutAttr::get( +MetalLayoutAttr MetalLayoutAttr::withMemorySpace(::mlir::MLIRContext *context, + MemorySpace memorySpace) { + return MetalLayoutAttr::get( context, getLinear(), getOobVal(), getGrid(), buildMemRef(context, getShardShape(), getElementType(), memorySpace), getMemLayout()); } -LayoutAttr LayoutAttr::withMemoryLayout(::mlir::MLIRContext *context, - TensorMemoryLayout memLayout) { - return LayoutAttr::get( +MetalLayoutAttr +MetalLayoutAttr::withMemoryLayout(::mlir::MLIRContext *context, + TensorMemoryLayout memLayout) { + return MetalLayoutAttr::get( context, getLinear(), getOobVal(), getGrid(), buildMemRef( context, getShardShape(), getElementType(), getMemorySpace()), memLayout); } -LayoutAttr LayoutAttr::withShardShape(::mlir::MLIRContext *context, - llvm::SmallVector shardShape) { - return LayoutAttr::get( +MetalLayoutAttr +MetalLayoutAttr::withShardShape(::mlir::MLIRContext *context, + llvm::SmallVector shardShape) { + return MetalLayoutAttr::get( context, getLinear(), getOobVal(), getGrid(), buildMemRef( context, shardShape, getElementType(), getMemorySpace()), getMemLayout()); } -MemorySpace LayoutAttr::getMemorySpace() const { +MemorySpace MetalLayoutAttr::getMemorySpace() const { return mlir::cast(getMemref().getMemorySpace()) .getValue(); } @@ -696,7 +701,7 @@ MemorySpace LayoutAttr::getMemorySpace() const { // Returns shape of the tensor after tilization is applied to the two inner most // dimensions. 
llvm::SmallVector -LayoutAttr::getTiledShape(llvm::ArrayRef tensorShape) const { +MetalLayoutAttr::getTiledShape(llvm::ArrayRef tensorShape) const { assert(isTiled() && "Expected a tiled layout"); mlir::AffineMap linear = getLinear(); @@ -716,7 +721,7 @@ LayoutAttr::getTiledShape(llvm::ArrayRef tensorShape) const { return ttmlir::utils::evalShape(tiled, tensorShape); } -mlir::AffineMap LayoutAttr::getIdentityTileLinearMap() const { +mlir::AffineMap MetalLayoutAttr::getIdentityTileLinearMap() const { assert(isTiled() && "Expected a tiled layout"); return mlir::AffineMap::getMultiDimIdentityMap(getLinear().getNumResults(), @@ -735,7 +740,7 @@ mlir::AffineMap LayoutAttr::getIdentityTileLinearMap() const { // (d0, d1)[2, 3] -> // (0, d0 floordiv 2, d1 floordiv 3, (d0 mod 2) * 3 + d1 mod 3) // -mlir::AffineMap LayoutAttr::replaceMemoryMapSymbolsWithShardShape( +mlir::AffineMap MetalLayoutAttr::replaceMemoryMapSymbolsWithShardShape( AffineMap physicalMemoryMap) const { mlir::SmallVector shardShape = getShardShape(false /*convertTileToScalar*/); @@ -763,8 +768,8 @@ mlir::AffineMap LayoutAttr::replaceMemoryMapSymbolsWithShardShape( // grid. Then it composes the logical grid projection with physical memory // mapping. mlir::AffineMap -LayoutAttr::projectOnto(mlir::AffineMap linearMap, - mlir::AffineMap physicalMemoryMap) const { +MetalLayoutAttr::projectOnto(mlir::AffineMap linearMap, + mlir::AffineMap physicalMemoryMap) const { assert(getGrid().getShape().size() == physicalMemoryMap.getNumDims() && "Layout and device grids must have same number of dimensions"); assert(getLinear().getNumResults() == physicalMemoryMap.getNumDims() && @@ -1013,7 +1018,7 @@ DeviceAttr DeviceAttr::get(::mlir::MLIRContext *context, // Sample the last index in the tensor to get the last addressable element of // the tensor to determine its footprint in memory. uint64_t DeviceAttr::getLayoutSizeBytes(ArrayRef tensorScalarShape, - LayoutAttr layout, + MetalLayoutAttr layout, MemorySpace memorySpace) const { SmallVector shape = layout.isTiled() ? 
layout.getTiledShape(tensorScalarShape) @@ -1035,9 +1040,9 @@ uint64_t DeviceAttr::getLayoutSizeBytes(ArrayRef tensorScalarShape, uint64_t DeviceAttr::getTensorSizeBytes(RankedTensorType tensorType, MemorySpace memorySpace) const { assert(tensorType.getEncoding()); - return getLayoutSizeBytes(tensorType.getShape(), - mlir::cast(tensorType.getEncoding()), - memorySpace); + return getLayoutSizeBytes( + tensorType.getShape(), + mlir::cast(tensorType.getEncoding()), memorySpace); } ::mlir::LogicalResult diff --git a/lib/Dialect/TTIR/IR/TTIROps.cpp b/lib/Dialect/TTIR/IR/TTIROps.cpp index 8f404323e2..11cfbb8fbb 100644 --- a/lib/Dialect/TTIR/IR/TTIROps.cpp +++ b/lib/Dialect/TTIR/IR/TTIROps.cpp @@ -908,9 +908,9 @@ ::mlir::LogicalResult mlir::tt::ttir::ToLayoutOp::verify() { mlir::tt::ttir::ToLayoutOp::CompoundComponents mlir::tt::ttir::ToLayoutOp::compoundComponents() { auto inputLayout = - mlir::cast(getInput().getType().getEncoding()); + mlir::cast(getInput().getType().getEncoding()); auto outputLayout = - mlir::cast(getOutput().getType().getEncoding()); + mlir::cast(getOutput().getType().getEncoding()); bool isLayoutChange = inputLayout.getLinear() != outputLayout.getLinear(); bool isGridChange = inputLayout.getGrid() != outputLayout.getGrid(); bool isShardChange = @@ -1216,7 +1216,7 @@ ::mlir::LogicalResult mlir::tt::ttir::MatmulOp::verify() { // AllocOp verification ::mlir::LogicalResult mlir::tt::ttir::AllocOp::verify() { - auto layout = mlir::dyn_cast_or_null( + auto layout = mlir::dyn_cast_or_null( getResult().getType().getEncoding()); if (not layout) { return emitOpError("Result type missing layout attribute"); diff --git a/lib/Dialect/TTIR/Transforms/Allocate.cpp b/lib/Dialect/TTIR/Transforms/Allocate.cpp index 37e788385c..a643f041c3 100644 --- a/lib/Dialect/TTIR/Transforms/Allocate.cpp +++ b/lib/Dialect/TTIR/Transforms/Allocate.cpp @@ -22,13 +22,13 @@ inline MemorySpace getMemorySpace(MemRefType memref) { return mlir::cast(memref.getMemorySpace()).getValue(); } -inline MemorySpace getMemorySpace(LayoutAttr layout) { +inline MemorySpace getMemorySpace(MetalLayoutAttr layout) { return getMemorySpace(layout.getMemref()); } inline MemorySpace getMemorySpace(RankedTensorType ty) { assert(ty.getEncoding()); - auto layout = mlir::cast(ty.getEncoding()); + auto layout = mlir::cast(ty.getEncoding()); return getMemorySpace(layout); } diff --git a/lib/Dialect/TTIR/Transforms/Generic.cpp b/lib/Dialect/TTIR/Transforms/Generic.cpp index 005e12c079..3bf96f3cd6 100644 --- a/lib/Dialect/TTIR/Transforms/Generic.cpp +++ b/lib/Dialect/TTIR/Transforms/Generic.cpp @@ -257,7 +257,7 @@ class TTIRGenericRegionRewriter auto resEncoding = mlir::cast(op->getResult(0).getType()).getEncoding(); if (resEncoding) { - auto resLayout = mlir::cast(resEncoding); + auto resLayout = mlir::cast(resEncoding); gridAttr = resLayout.getGrid(); } @@ -339,7 +339,7 @@ struct TTIRGenericOperandsToMemrefRewriter auto matchingOperand = generic.getMatchingOperand(blockArgNumber); auto operandType = matchingOperand.getType(); - auto bufferLayout = mlir::cast( + auto bufferLayout = mlir::cast( mlir::cast(operandType).getEncoding()); auto bufferType = operandType; @@ -349,7 +349,7 @@ struct TTIRGenericOperandsToMemrefRewriter assert(static_cast(cbIndex) < generic.getCbs().size()); auto cb = generic.getCbs()[cbIndex]; auto cbType = cb.getType(); - auto cbLayout = mlir::cast( + auto cbLayout = mlir::cast( mlir::cast(cbType).getEncoding()); bufferLayout = cbLayout; bufferType = cbType; @@ -387,7 +387,7 @@ class 
TTIRGenericRegionMemrefTypeConverter : public TypeConverter { if (mlir::isa(encoding)) { return type; } - auto layout = mlir::cast(type.getEncoding()); + auto layout = mlir::cast(type.getEncoding()); auto buffer = BufferAttr::get(ctx, layout.getMemref(), BufferAccess::Alias); return RankedTensorType::get(buffer.getShape(), type.getElementType(), @@ -451,11 +451,11 @@ class TTIRGenericOpCBsRewriter : public OpRewritePattern { // Enforcing tiled layout as in kernel we always want to work with tiles. auto desiredElementType = rewriter.getType(ty.getElementType()); - auto desiredLayout = rewriter.getAttr( + auto desiredLayout = rewriter.getAttr( ty, MemorySpace::DeviceL1, generic.getGrid(), desiredElementType); auto operandTy = operand.getType(); - auto operandLayout = mlir::cast( + auto operandLayout = mlir::cast( mlir::cast(operandTy).getEncoding()); if (desiredLayout.getGrid() == operandLayout.getGrid()) { diff --git a/lib/Dialect/TTIR/Transforms/Layout.cpp b/lib/Dialect/TTIR/Transforms/Layout.cpp index d7eef6732d..c3ccbf1a44 100644 --- a/lib/Dialect/TTIR/Transforms/Layout.cpp +++ b/lib/Dialect/TTIR/Transforms/Layout.cpp @@ -38,20 +38,21 @@ class TTIRLayoutTensorTypeConverter : public TypeConverter { TTIRLayoutTensorTypeConverter(MLIRContext *ctx, MemorySpace initMemorySpace, GridAttr deviceGrid) { addConversion([](Type type) { return type; }); - addConversion([ctx, initMemorySpace, - deviceGrid](RankedTensorType type) -> Type { - auto layout = type.getEncoding(); - if (layout) { - return type; - } - std::int64_t deviceGridRank = deviceGrid.getShape().size(); - // Default to single core grid - auto tensorGrid = GridAttr::get(ctx, deviceGridRank); - // Default to initMemorySpace, the optimizer might decide otherwise - auto newLayout = LayoutAttr::get(ctx, type, initMemorySpace, tensorGrid); - return RankedTensorType::get(type.getShape(), type.getElementType(), - newLayout); - }); + addConversion( + [ctx, initMemorySpace, deviceGrid](RankedTensorType type) -> Type { + auto layout = type.getEncoding(); + if (layout) { + return type; + } + std::int64_t deviceGridRank = deviceGrid.getShape().size(); + // Default to single core grid + auto tensorGrid = GridAttr::get(ctx, deviceGridRank); + // Default to initMemorySpace, the optimizer might decide otherwise + auto newLayout = + MetalLayoutAttr::get(ctx, type, initMemorySpace, tensorGrid); + return RankedTensorType::get(type.getShape(), type.getElementType(), + newLayout); + }); } }; @@ -129,7 +130,7 @@ createToLayoutOp(PatternRewriter &rewriter, Location loc, Value input, TensorMemoryLayout desiredMemLayout, bool tiled) { auto ty = mlir::cast(input.getType()); - auto currLayout = mlir::cast(ty.getEncoding()); + auto currLayout = mlir::cast(ty.getEncoding()); auto currMemorySpace = currLayout.getMemorySpace(); auto currElementType = currLayout.getElementType(); auto currMemLayout = currLayout.getMemLayout(); @@ -142,9 +143,9 @@ createToLayoutOp(PatternRewriter &rewriter, Location loc, Value input, return std::nullopt; } - auto desiredLayout = - rewriter.getAttr(ty, desiredMemorySpace, currLayout.getGrid(), - desiredElementType, desiredMemLayout); + auto desiredLayout = rewriter.getAttr( + ty, desiredMemorySpace, currLayout.getGrid(), desiredElementType, + desiredMemLayout); tensor::EmptyOp existingEmpty = input.getDefiningOp(); if (existingEmpty) { @@ -343,7 +344,7 @@ class TTIRSplitCompoundLayoutRewriter : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; Value createToLayoutOp(PatternRewriter &rewriter, Location loc, Value 
input, - LayoutAttr desiredLayout) const { + MetalLayoutAttr desiredLayout) const { auto ty = mlir::cast(input.getType()); auto output = rewriter.create( loc, ty.getShape(), ty.getElementType(), desiredLayout); @@ -353,7 +354,7 @@ class TTIRSplitCompoundLayoutRewriter : public OpRewritePattern { } Value bounce(PatternRewriter &rewriter, ToLayoutOp op, - LayoutAttr bounceLayout) const { + MetalLayoutAttr bounceLayout) const { auto bounced = createToLayoutOp(rewriter, op.getLoc(), op.getInput(), bounceLayout); return rewriter.replaceOpWithNewOp( @@ -375,8 +376,8 @@ class TTIRSplitCompoundLayoutRewriter : public OpRewritePattern { auto inputType = mlir::cast(op.getInput().getType()); auto outputType = mlir::cast(op.getOutput().getType()); - auto inputLayout = mlir::cast(inputType.getEncoding()); - auto outputLayout = mlir::cast(outputType.getEncoding()); + auto inputLayout = mlir::cast(inputType.getEncoding()); + auto outputLayout = mlir::cast(outputType.getEncoding()); bool inputL1 = inputLayout.getMemorySpace() == MemorySpace::DeviceL1; bool outputL1 = outputLayout.getMemorySpace() == MemorySpace::DeviceL1; diff --git a/lib/Dialect/TTMetal/IR/TTMetalOps.cpp b/lib/Dialect/TTMetal/IR/TTMetalOps.cpp index 49baf51e01..7f78c1afcb 100644 --- a/lib/Dialect/TTMetal/IR/TTMetalOps.cpp +++ b/lib/Dialect/TTMetal/IR/TTMetalOps.cpp @@ -17,7 +17,7 @@ namespace mlir::tt::ttmetal { ::mlir::LogicalResult HostWriteOp::verify() { ::mlir::RankedTensorType outputTy = getOutput().getType(); auto outputLayout = - mlir::dyn_cast_or_null(outputTy.getEncoding()); + mlir::dyn_cast_or_null(outputTy.getEncoding()); if (not outputLayout) { return emitOpError("Input tensor missing layout attribute"); } @@ -30,7 +30,7 @@ ::mlir::LogicalResult HostWriteOp::verify() { ::mlir::LogicalResult HostReadOp::verify() { ::mlir::RankedTensorType outputTy = getOutput().getType(); auto outputLayout = - mlir::dyn_cast_or_null(outputTy.getEncoding()); + mlir::dyn_cast_or_null(outputTy.getEncoding()); if (not outputLayout) { return emitOpError("Input tensor missing layout attribute"); } @@ -41,7 +41,7 @@ ::mlir::LogicalResult HostReadOp::verify() { } ::mlir::LogicalResult AllocOp::verify() { - auto layout = mlir::dyn_cast_or_null( + auto layout = mlir::dyn_cast_or_null( getResult().getType().getEncoding()); if (not layout) { return emitOpError("Result type missing layout attribute"); @@ -76,7 +76,7 @@ ::mlir::LogicalResult AllocOp::verify() { ::mlir::LogicalResult DispatchOp::verify() { // Assert inputs/outputs device memspace for (auto operand : getOperands()) { - auto layout = mlir::dyn_cast_or_null( + auto layout = mlir::dyn_cast_or_null( mlir::cast(operand.getType()).getEncoding()); if (not layout) { return emitOpError("Input tensor missing layout attribute"); diff --git a/lib/Target/TTMetal/TTMetalToFlatbuffer.cpp b/lib/Target/TTMetal/TTMetalToFlatbuffer.cpp index 47e15accf6..e82deaf633 100644 --- a/lib/Target/TTMetal/TTMetalToFlatbuffer.cpp +++ b/lib/Target/TTMetal/TTMetalToFlatbuffer.cpp @@ -62,18 +62,18 @@ memrefAttrToFlatbuffer(FlatbufferObjectCache &cache, MemRefType memref, toFlatbuffer(cache, memLayout), size); } -flatbuffers::Offset<::tt::target::LayoutDesc> -layoutAttrToFlatbuffer(FlatbufferObjectCache &cache, LayoutAttr layoutAttr, - ArrayRef logicalShape, DeviceAttr deviceAttr) { - auto strideInt64 = layoutAttr.getStride(logicalShape); +flatbuffers::Offset<::tt::target::LayoutDesc> metalLayoutAttrToFlatbuffer( + FlatbufferObjectCache &cache, MetalLayoutAttr metalLayoutAttr, + ArrayRef logicalShape, DeviceAttr deviceAttr) { 
+ auto strideInt64 = metalLayoutAttr.getStride(logicalShape); std::vector stride(strideInt64.begin(), strideInt64.end()); - auto coreRangeSet = - toFlatbuffer(cache, layoutAttr.getGrid(), deviceAttr.getWorkerGrid()); + auto coreRangeSet = toFlatbuffer(cache, metalLayoutAttr.getGrid(), + deviceAttr.getWorkerGrid()); return ::tt::target::CreateLayoutDescDirect( - *cache.fbb, &stride, toFlatbuffer(cache, layoutAttr.getOobVal()), + *cache.fbb, &stride, toFlatbuffer(cache, metalLayoutAttr.getOobVal()), &coreRangeSet, - cache.getOrCreate(layoutAttr.getMemref(), memrefAttrToFlatbuffer, - layoutAttr.getMemLayout())); + cache.getOrCreate(metalLayoutAttr.getMemref(), memrefAttrToFlatbuffer, + metalLayoutAttr.getMemLayout())); } } // namespace mlir::tt @@ -277,7 +277,7 @@ static std::shared_ptr translateModuleToFlatbuffer( argumentAllocations[input.getArgNumber()]); assert( argAlloc.getMemorySpace() == - mlir::cast( + mlir::cast( mlir::cast(input.getType()).getEncoding()) .getMemorySpace() && "argument allocation memory space does not match tensor type " diff --git a/python/TTModule.cpp b/python/TTModule.cpp index c70d7df974..b8d543410c 100644 --- a/python/TTModule.cpp +++ b/python/TTModule.cpp @@ -16,14 +16,14 @@ namespace mlir::ttmlir::python { void populateTTModule(py::module &m) { - tt_attribute_class(m, "LayoutAttr") + tt_attribute_class(m, "MetalLayoutAttr") .def_static("get", [](MlirContext ctx, MlirType rankedTensorType, uint32_t memorySpaceValue, MlirAttribute grid, std::vector> collapseIntervals, uint32_t oobValValue, uint32_t memLayoutValue) { - return wrap(tt::LayoutAttr::get( + return wrap(tt::MetalLayoutAttr::get( unwrap(ctx), mlir::cast(unwrap(rankedTensorType)), static_cast(memorySpaceValue), @@ -37,7 +37,7 @@ void populateTTModule(py::module &m) { std::vector> collapseIntervals) { return wrap( - mlir::cast(unwrap(self)) + mlir::cast(unwrap(self)) .withGrid(unwrap(ctx), tensorShape, mlir::cast(unwrap(grid)), collapseIntervals)); @@ -47,7 +47,7 @@ void populateTTModule(py::module &m) { std::vector tensorShape, MlirAttribute grid, std::vector> collapseIntervals) { - return mlir::cast(unwrap(self)) + return mlir::cast(unwrap(self)) .withGrid(unwrap(ctx), tensorShape, mlir::cast(unwrap(grid)), collapseIntervals); @@ -55,13 +55,13 @@ void populateTTModule(py::module &m) { .def_static( "with_element_type", [](MlirContext ctx, MlirAttribute self, MlirType elementType) { - return wrap(mlir::cast(unwrap(self)) + return wrap(mlir::cast(unwrap(self)) .withElementType(unwrap(ctx), unwrap(elementType))); }) .def_static( "with_element_type_", [](MlirContext ctx, MlirAttribute self, MlirType elementType) { - return mlir::cast(unwrap(self)) + return mlir::cast(unwrap(self)) .withElementType(unwrap(ctx), unwrap(elementType)); }) .def("getLayout", @@ -73,38 +73,45 @@ void populateTTModule(py::module &m) { mlir::cast(unwrap(type)); assert(tensor.getEncoding()); // Make sure that this Tensor has an // encoding value - tt::LayoutAttr layout = - mlir::cast(tensor.getEncoding()); + tt::MetalLayoutAttr layout = + mlir::cast(tensor.getEncoding()); return layout; }) - .def("wrapped", [](tt::LayoutAttr const &self) { return wrap(self); }) - .def_property_readonly( - "stride", - [](tt::LayoutAttr const &self, std::vector logicalShape) { - auto stride = self.getStride(logicalShape); - return std::vector(stride.begin(), stride.end()); - }) - .def_property_readonly("oobval", &tt::LayoutAttr::getOobVal) + .def("wrapped", + [](tt::MetalLayoutAttr const &self) { return wrap(self); }) + .def_property_readonly("stride", 
+ [](tt::MetalLayoutAttr const &self, + std::vector logicalShape) { + auto stride = self.getStride(logicalShape); + return std::vector(stride.begin(), + stride.end()); + }) + .def_property_readonly("oobval", &tt::MetalLayoutAttr::getOobVal) .def_property_readonly("oobval_as_int", - [](tt::LayoutAttr la) { + [](tt::MetalLayoutAttr la) { return static_cast(la.getOobVal()); }) - .def_property_readonly("grid_attr", &tt::LayoutAttr::getGrid) + .def_property_readonly("grid_attr", &tt::MetalLayoutAttr::getGrid) .def_property_readonly( - "memref", [](tt::LayoutAttr self) { return wrap(self.getMemref()); }) - .def_property_readonly("memory_space", &tt::LayoutAttr::getMemorySpace) + "memref", + [](tt::MetalLayoutAttr self) { return wrap(self.getMemref()); }) + .def_property_readonly("memory_space", + &tt::MetalLayoutAttr::getMemorySpace) .def_property_readonly("memory_space_as_int", - [](tt::LayoutAttr la) { + [](tt::MetalLayoutAttr la) { return static_cast( la.getMemorySpace()); }) - .def_property_readonly("shard_shape", &tt::LayoutAttr::getShardShape) - .def_property_readonly("memory_layout", &tt::LayoutAttr::getMemLayout) + .def_property_readonly("shard_shape", &tt::MetalLayoutAttr::getShardShape) + .def_property_readonly("memory_layout", + &tt::MetalLayoutAttr::getMemLayout) .def_property_readonly( - "linear", [](tt::LayoutAttr self) { return wrap(self.getLinear()); }) - .def_property_readonly("memory_layout_as_int", [](tt::LayoutAttr la) { - return static_cast(la.getMemLayout()); - }); + "linear", + [](tt::MetalLayoutAttr self) { return wrap(self.getLinear()); }) + .def_property_readonly("memory_layout_as_int", + [](tt::MetalLayoutAttr la) { + return static_cast(la.getMemLayout()); + }); tt_attribute_class(m, "GridAttr") .def_static("get", diff --git a/test/python/tensor_layout.py b/test/python/tensor_layout.py index 39a9a728be..2dbf249e9f 100644 --- a/test/python/tensor_layout.py +++ b/test/python/tensor_layout.py @@ -34,7 +34,7 @@ def createTensorLayout( shape, F32Type.get(ctx), None, Location.unknown(ctx) ) memoryLayout = getTensorMemoryLayout(memorySpace) - layout = tt.ir.LayoutAttr.get( + layout = tt.ir.MetalLayoutAttr.get( ctx, tensorTy, memorySpace, grid, collapseIntervals, oobVal, memoryLayout ) return RankedTensorType.get(shape, F32Type.get(ctx), layout, Location.unknown(ctx)) @@ -42,7 +42,7 @@ def createTensorLayout( def tilize(tensor, dataType, tileShape=[32, 32]): assert len(tileShape) == 2 - return tt.ir.LayoutAttr.with_element_type_( + return tt.ir.MetalLayoutAttr.with_element_type_( ctx, tensor.encoding, tt.ir.TileType.get(ctx, tileShape[0], tileShape[1], dataType), @@ -52,15 +52,15 @@ def tilize(tensor, dataType, tileShape=[32, 32]): def parallelize(tensor, grid, collapseIntervals=[(0, -1)]): if isinstance(grid, list) or isinstance(grid, tuple): grid = tt.ir.GridAttr.get(ctx, list(grid)) - return tt.ir.LayoutAttr.with_grid_( + return tt.ir.MetalLayoutAttr.with_grid_( ctx, tensor.encoding, tensor.shape, grid, collapseIntervals ) t0 = createTensorLayout([2, 3, 64, 128], [2, 4]) -# CHECK: tensor<2x3x64x128xf32, #tt.layout<(d0, d1, d2, d3) -> (d0 * 192 + d1 * 64 + d2, d3), undef, <2x4>, memref<192x32xf32, #tt.memory_space>, interleaved>> +# CHECK: tensor<2x3x64x128xf32, #tt.metal_layout<(d0, d1, d2, d3) -> (d0 * 192 + d1 * 64 + d2, d3), undef, <2x4>, memref<192x32xf32, #tt.memory_space>, interleaved>> print(t0) -# CHECK: #tt.layout<(d0, d1, d2, d3) -> (d0 * 192 + d1 * 64 + d2, d3), undef, <2x4>, memref<6x1x!tt.tile<32x32, bfp_bf8>, #tt.memory_space>, interleaved> +# CHECK: 
#tt.metal_layout<(d0, d1, d2, d3) -> (d0 * 192 + d1 * 64 + d2, d3), undef, <2x4>, memref<6x1x!tt.tile<32x32, bfp_bf8>, #tt.memory_space>, interleaved> print(tilize(t0, tt.DataType.BFP_BFloat8).wrapped()) print(parallelize(t0, [3, 2]).wrapped()) @@ -69,24 +69,24 @@ def parallelize(tensor, grid, collapseIntervals=[(0, -1)]): print(parallelize(t1, [3, 2]).wrapped()) t2 = createTensorLayout([128], [4], collapseIntervals=[(0, -1)]) -# CHECK: tensor<128xf32, #tt.layout<(d0) -> (d0), undef, <4>, memref<32xf32, #tt.memory_space>, interleaved>> +# CHECK: tensor<128xf32, #tt.metal_layout<(d0) -> (d0), undef, <4>, memref<32xf32, #tt.memory_space>, interleaved>> print(t2) -# CHECK: #tt.layout<(d0) -> (d0), undef, <2>, memref<64xf32, #tt.memory_space>, interleaved> +# CHECK: #tt.metal_layout<(d0) -> (d0), undef, <2>, memref<64xf32, #tt.memory_space>, interleaved> print(parallelize(t2, [2]).wrapped()) -# CHECK: #tt.layout<(d0) -> (0, d0), undef, <1x2>, memref<1x64xf32, #tt.memory_space>, interleaved> +# CHECK: #tt.metal_layout<(d0) -> (0, d0), undef, <1x2>, memref<1x64xf32, #tt.memory_space>, interleaved> print(parallelize(t2, [1, 2]).wrapped()) t3 = createTensorLayout([128], [1, 4], collapseIntervals=[(0, -1)]) -# CHECK: tensor<128xf32, #tt.layout<(d0) -> (0, d0), undef, <1x4>, memref<1x32xf32, #tt.memory_space>, interleaved>> +# CHECK: tensor<128xf32, #tt.metal_layout<(d0) -> (0, d0), undef, <1x4>, memref<1x32xf32, #tt.memory_space>, interleaved>> print(t3) -# CHECK: #tt.layout<(d0) -> (0, d0), undef, <1x4>, memref<1x1x!tt.tile<32x32, bfp_bf8>, #tt.memory_space>, interleaved> +# CHECK: #tt.metal_layout<(d0) -> (0, d0), undef, <1x4>, memref<1x1x!tt.tile<32x32, bfp_bf8>, #tt.memory_space>, interleaved> print(tilize(t3, tt.DataType.BFP_BFloat8).wrapped()) t4 = createTensorLayout([128], [1, 2, 4], collapseIntervals=[(0, -1)]) -# CHECK: tensor<128xf32, #tt.layout<(d0) -> (0, 0, d0), undef, <1x2x4>, memref<1x1x32xf32, #tt.memory_space>, interleaved>> +# CHECK: tensor<128xf32, #tt.metal_layout<(d0) -> (0, 0, d0), undef, <1x2x4>, memref<1x1x32xf32, #tt.memory_space>, interleaved>> print(t4) -# CHECK: #tt.layout<(d0) -> (0, 0, d0), undef, <1x2x4>, memref<1x1x1x!tt.tile<32x32, bfp_bf8>, #tt.memory_space>, interleaved> +# CHECK: #tt.metal_layout<(d0) -> (0, 0, d0), undef, <1x2x4>, memref<1x1x1x!tt.tile<32x32, bfp_bf8>, #tt.memory_space>, interleaved> print(tilize(t4, tt.DataType.BFP_BFloat8).wrapped()) -# CHECK: #tt.layout<(d0) -> (0, d0), undef, <1x2>, memref<1x64xf32, #tt.memory_space>, interleaved> +# CHECK: #tt.metal_layout<(d0) -> (0, d0), undef, <1x2>, memref<1x64xf32, #tt.memory_space>, interleaved> print(parallelize(t4, [1, 2]).wrapped()) diff --git a/test/ttmlir/Dialect/TTIR/split_compound_layout.mlir b/test/ttmlir/Dialect/TTIR/split_compound_layout.mlir index 2335fb0df3..42cab3d1f6 100644 --- a/test/ttmlir/Dialect/TTIR/split_compound_layout.mlir +++ b/test/ttmlir/Dialect/TTIR/split_compound_layout.mlir @@ -3,21 +3,21 @@ #dram = #tt.memory_space #l1_ = #tt.memory_space -// CHECK-DAG: #[[row_major1x1:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<64x128xf32, #l1_>, interleaved> -// CHECK-DAG: #[[row_major1x1_T:.*]] = #tt.layout<(d0, d1) -> (d1, d0), undef, <1x1>, memref<64x128xf32, #l1_>, interleaved> -// CHECK-DAG: #[[row_major2x2:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <2x2>, memref<32x64xf32, #l1_>, interleaved> -// CHECK-DAG: #[[tile1x1_f32:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<2x4x!tt.tile<32x32, f32>, #l1_>, interleaved> -// CHECK-DAG: 
#[[tile1x1_bf16:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #l1_>, interleaved> -// CHECK-DAG: #[[tile1x1_f32_dram:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<2x4x!tt.tile<32x32, f32>, #dram>, interleaved> -// CHECK-DAG: #[[tile2x2_f32:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <2x2>, memref<1x2x!tt.tile<32x32, f32>, #l1_>, interleaved> +// CHECK-DAG: #[[row_major1x1:.*]] = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<64x128xf32, #l1_>, interleaved> +// CHECK-DAG: #[[row_major1x1_T:.*]] = #tt.metal_layout<(d0, d1) -> (d1, d0), undef, <1x1>, memref<64x128xf32, #l1_>, interleaved> +// CHECK-DAG: #[[row_major2x2:.*]] = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <2x2>, memref<32x64xf32, #l1_>, interleaved> +// CHECK-DAG: #[[tile1x1_f32:.*]] = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<2x4x!tt.tile<32x32, f32>, #l1_>, interleaved> +// CHECK-DAG: #[[tile1x1_bf16:.*]] = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #l1_>, interleaved> +// CHECK-DAG: #[[tile1x1_f32_dram:.*]] = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<2x4x!tt.tile<32x32, f32>, #dram>, interleaved> +// CHECK-DAG: #[[tile2x2_f32:.*]] = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <2x2>, memref<1x2x!tt.tile<32x32, f32>, #l1_>, interleaved> -#row_major1x1 = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<64x128xf32, #l1_>, interleaved> -#row_major1x1_T = #tt.layout<(d0, d1) -> (d1, d0), undef, <1x1>, memref<64x128xf32, #l1_>, interleaved> -#row_major2x2 = #tt.layout<(d0, d1) -> (d0, d1), undef, <2x2>, memref<32x64xf32, #l1_>, interleaved> -#tile1x1_f32 = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<2x4x!tt.tile<32x32, f32>, #l1_>, interleaved> -#tile1x1_bf16 = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #l1_>, interleaved> -#tile1x1_f32_dram = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<2x4x!tt.tile<32x32, f32>, #dram>, interleaved> -#tile2x2_f32 = #tt.layout<(d0, d1) -> (d0, d1), undef, <2x2>, memref<1x2x!tt.tile<32x32, f32>, #l1_>, interleaved> +#row_major1x1 = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<64x128xf32, #l1_>, interleaved> +#row_major1x1_T = #tt.metal_layout<(d0, d1) -> (d1, d0), undef, <1x1>, memref<64x128xf32, #l1_>, interleaved> +#row_major2x2 = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <2x2>, memref<32x64xf32, #l1_>, interleaved> +#tile1x1_f32 = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<2x4x!tt.tile<32x32, f32>, #l1_>, interleaved> +#tile1x1_bf16 = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #l1_>, interleaved> +#tile1x1_f32_dram = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<2x4x!tt.tile<32x32, f32>, #dram>, interleaved> +#tile2x2_f32 = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <2x2>, memref<1x2x!tt.tile<32x32, f32>, #l1_>, interleaved> func.func @noncompound_linear(%in: tensor<64x128xf32, #row_major1x1>) -> tensor<64x128xf32, #row_major1x1_T> { %out = tensor.empty() : tensor<64x128xf32, #row_major1x1_T> diff --git a/test/ttmlir/Dialect/TTIR/test_allocate.mlir b/test/ttmlir/Dialect/TTIR/test_allocate.mlir index a80a8c1c91..5888cf3f62 100644 --- a/test/ttmlir/Dialect/TTIR/test_allocate.mlir +++ b/test/ttmlir/Dialect/TTIR/test_allocate.mlir @@ -1,7 +1,7 @@ // RUN: ttmlir-opt --ttir-load-system-desc --ttir-implicit-device --ttir-allocate %s | FileCheck %s #any_device = #tt.operand_constraint #l1_ = 
#tt.memory_space -#layout = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<64x128xf32, #l1_>, interleaved> +#layout = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<64x128xf32, #l1_>, interleaved> module attributes {} { func.func @forward(%arg0: tensor<64x128xf32, #layout>, %arg1: tensor<64x128xf32, #layout>) -> tensor<64x128xf32, #layout> { // CHECK: %[[C:.*]] = "ttir.alloc"[[C:.*]] diff --git a/test/ttmlir/Silicon/TTMetal/simple_reduce.mlir b/test/ttmlir/Silicon/TTMetal/simple_reduce.mlir index 1674ae0d32..cdde621c2a 100644 --- a/test/ttmlir/Silicon/TTMetal/simple_reduce.mlir +++ b/test/ttmlir/Silicon/TTMetal/simple_reduce.mlir @@ -1,8 +1,8 @@ // RUN: ttmlir-opt --ttir-to-ttmetal-backend-pipeline="system-desc-path=%system_desc_path%" %s | FileCheck %s #any_device = #tt.operand_constraint #l1_ = #tt.memory_space -#layout1 = #tt.layout<(d0, d1) -> (d0, d1), undef, <4x4>, memref<64x96xf32, #l1_>> -#layout2 = #tt.layout<(d0, d1) -> (d0, d1), undef, <4x1>, memref<64x32xf32, #l1_>> +#layout1 = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <4x4>, memref<64x96xf32, #l1_>> +#layout2 = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <4x1>, memref<64x32xf32, #l1_>> func.func @reduceW(%arg0: tensor<256x384xf32, #layout1>) -> tensor<256x32xf32, #layout2> { %0 = tensor.empty() : tensor<256x32xf32, #layout2> @@ -15,7 +15,7 @@ func.func @reduceW(%arg0: tensor<256x384xf32, #layout1>) -> tensor<256x32xf32, # return %1 : tensor<256x32xf32, #layout2> } -#layout3 = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x4>, memref<32x96xf32, #l1_>> +#layout3 = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <1x4>, memref<32x96xf32, #l1_>> func.func @reduceH(%arg0: tensor<256x384xf32, #layout1>) -> tensor<32x384xf32, #layout3> { %0 = tensor.empty() : tensor<32x384xf32, #layout3> // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] @@ -27,7 +27,7 @@ func.func @reduceH(%arg0: tensor<256x384xf32, #layout1>) -> tensor<32x384xf32, # return %1 : tensor<32x384xf32, #layout3> } -#layout4 = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<32x32xf32, #l1_>> +#layout4 = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<32x32xf32, #l1_>> func.func @reduceWH(%arg0: tensor<256x384xf32, #layout1>) -> tensor<32x32xf32, #layout4> { %0 = tensor.empty() : tensor<32x32xf32, #layout4> // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] diff --git a/test/ttmlir/Silicon/TTMetal/tiled_reblock.mlir b/test/ttmlir/Silicon/TTMetal/tiled_reblock.mlir index 64cf5f57a6..d7d3cea1dd 100644 --- a/test/ttmlir/Silicon/TTMetal/tiled_reblock.mlir +++ b/test/ttmlir/Silicon/TTMetal/tiled_reblock.mlir @@ -4,10 +4,10 @@ #l1_ = #tt.memory_space -#untilized = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<64x128xf32, #l1_>> -#tilized = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<2x4x!tt.tile<32 x 32, f32>, #l1_>> -#tilized2x2 = #tt.layout<(d0, d1) -> (d0, d1), undef, <2x2>, memref<1x2x!tt.tile<32 x 32, f32>, #l1_>> -#untilized2x2 = #tt.layout<(d0, d1) -> (d0, d1), undef, <2x2>, memref<32x64xf32, #l1_>> +#untilized = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<64x128xf32, #l1_>> +#tilized = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<2x4x!tt.tile<32 x 32, f32>, #l1_>> +#tilized2x2 = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <2x2>, memref<1x2x!tt.tile<32 x 32, f32>, #l1_>> +#untilized2x2 = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <2x2>, memref<32x64xf32, #l1_>> func.func @tilize_reblock_2D(%arg0: tensor<64x128xf32, #untilized>) -> tensor<64x128xf32, #untilized2x2> { // CHECK: %[[C:.*]] 
= "ttmetal.alloc"[[C:.*]] %0 = tensor.empty() : tensor<64x128xf32, #tilized> @@ -25,10 +25,10 @@ func.func @tilize_reblock_2D(%arg0: tensor<64x128xf32, #untilized>) -> tensor<64 } -#untilized4D = #tt.layout<(d0, d1, d2, d3) -> (d0 * 192 + d1 * 64 + d2, d3), undef, <1x1>, memref<384x128xf32, #l1_>> -#tilized4D = #tt.layout<(d0, d1, d2, d3) -> (d0 * 192 + d1 * 64 + d2, d3), undef, <1x1>, memref<12x4x!tt.tile<32 x 32, f32>, #l1_>> -#tilized4D_2x2 = #tt.layout<(d0, d1, d2, d3) -> (d0 * 192 + d1 * 64 + d2, d3), undef, <2x2>, memref<6x2x!tt.tile<32 x 32, f32>, #l1_>> -#untilized4D_2x2 = #tt.layout<(d0, d1, d2, d3) -> (d0 * 192 + d1 * 64 + d2, d3), undef, <2x2>, memref<192x64xf32, #l1_>> +#untilized4D = #tt.metal_layout<(d0, d1, d2, d3) -> (d0 * 192 + d1 * 64 + d2, d3), undef, <1x1>, memref<384x128xf32, #l1_>> +#tilized4D = #tt.metal_layout<(d0, d1, d2, d3) -> (d0 * 192 + d1 * 64 + d2, d3), undef, <1x1>, memref<12x4x!tt.tile<32 x 32, f32>, #l1_>> +#tilized4D_2x2 = #tt.metal_layout<(d0, d1, d2, d3) -> (d0 * 192 + d1 * 64 + d2, d3), undef, <2x2>, memref<6x2x!tt.tile<32 x 32, f32>, #l1_>> +#untilized4D_2x2 = #tt.metal_layout<(d0, d1, d2, d3) -> (d0 * 192 + d1 * 64 + d2, d3), undef, <2x2>, memref<192x64xf32, #l1_>> func.func @tilize_reblock_4D(%arg0: tensor<2x3x64x128xf32, #untilized4D>) -> tensor<2x3x64x128xf32, #untilized4D_2x2> { // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %0 = tensor.empty() : tensor<2x3x64x128xf32, #tilized4D> @@ -48,10 +48,10 @@ func.func @tilize_reblock_4D(%arg0: tensor<2x3x64x128xf32, #untilized4D>) -> ten return %5 : tensor<2x3x64x128xf32, #untilized4D_2x2> } -#untilized_big = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<96x192xf32, #l1_>> -#tilized_big = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<3x6x!tt.tile<32 x 32, f32>, #l1_>> -#tilized_big_3x2 = #tt.layout<(d0, d1) -> (d0, d1), undef, <3x2>, memref<1x3x!tt.tile<32 x 32, f32>, #l1_>> -#tilized_big_3x6 = #tt.layout<(d0, d1) -> (d0, d1), undef, <3x6>, memref<1x1x!tt.tile<32 x 32, f32>, #l1_>> +#untilized_big = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<96x192xf32, #l1_>> +#tilized_big = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<3x6x!tt.tile<32 x 32, f32>, #l1_>> +#tilized_big_3x2 = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <3x2>, memref<1x3x!tt.tile<32 x 32, f32>, #l1_>> +#tilized_big_3x6 = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <3x6>, memref<1x1x!tt.tile<32 x 32, f32>, #l1_>> func.func @tilize_reblock_big(%arg0: tensor<96x192xf32, #untilized_big>) -> tensor<96x192xf32, #untilized_big> { // move to tilized 1x1 // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] diff --git a/test/ttmlir/Silicon/TTMetal/to_layout.mlir b/test/ttmlir/Silicon/TTMetal/to_layout.mlir index 015e651750..e5318c6c1d 100644 --- a/test/ttmlir/Silicon/TTMetal/to_layout.mlir +++ b/test/ttmlir/Silicon/TTMetal/to_layout.mlir @@ -5,8 +5,8 @@ #l1_ = #tt.memory_space #dram = #tt.memory_space -#layout = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<4x16xf32, #l1_>> -#layout1 = #tt.layout<(d0, d1) -> (d0, d1), undef, <2x2>, memref<2x8xf32, #l1_>> +#layout = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<4x16xf32, #l1_>> +#layout1 = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <2x2>, memref<2x8xf32, #l1_>> func.func @simple(%arg0: tensor<4x16xf32, #layout>) -> tensor<4x16xf32, #layout1> { %0 = tensor.empty() : tensor<4x16xf32, #layout1> // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] @@ -14,8 +14,8 @@ func.func @simple(%arg0: tensor<4x16xf32, #layout>) -> tensor<4x16xf32, 
#layout1 return %1 : tensor<4x16xf32, #layout1> } -#untilized = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<64x128xf32, #l1_>> -#tilized = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<2x4x!tt.tile<32 x 32, f32>, #l1_>> +#untilized = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<64x128xf32, #l1_>> +#tilized = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<2x4x!tt.tile<32 x 32, f32>, #l1_>> func.func @tilize(%arg0: tensor<64x128xf32, #untilized>) -> tensor<64x128xf32, #untilized> { %0 = tensor.empty() : tensor<64x128xf32, #tilized> // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] @@ -26,11 +26,11 @@ func.func @tilize(%arg0: tensor<64x128xf32, #untilized>) -> tensor<64x128xf32, # return %3 : tensor<64x128xf32, #untilized> } -#untilized_dram = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<16x64xf32, #dram>> -#untilized_l1 = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<16x64xf32, #l1_>> -#untilized2x2_dram = #tt.layout<(d0, d1) -> (d0, d1), undef, <2x2>, memref<8x32xf32, #dram>> -#untilized2x2_l1 = #tt.layout<(d0, d1) -> (d0, d1), undef, <2x2>, memref<8x32xf32, #l1_>> -#untilized1x4_l1 = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x4>, memref<16x16xf32, #l1_>> +#untilized_dram = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<16x64xf32, #dram>> +#untilized_l1 = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<16x64xf32, #l1_>> +#untilized2x2_dram = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <2x2>, memref<8x32xf32, #dram>> +#untilized2x2_l1 = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <2x2>, memref<8x32xf32, #l1_>> +#untilized1x4_l1 = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <1x4>, memref<16x16xf32, #l1_>> func.func @dram_to_l1(%arg0: tensor<16x64xf32, #untilized_dram>) -> tensor<16x64xf32, #untilized_l1> { %0 = tensor.empty() : tensor<16x64xf32, #untilized_l1> // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] diff --git a/test/ttmlir/Silicon/TTNN/perf_unit/mnist.mlir b/test/ttmlir/Silicon/TTNN/perf_unit/mnist.mlir index ba995925d5..0193ec36b1 100644 --- a/test/ttmlir/Silicon/TTNN/perf_unit/mnist.mlir +++ b/test/ttmlir/Silicon/TTNN/perf_unit/mnist.mlir @@ -4,8 +4,8 @@ #loc = loc("MNISTLinear":4294967295:0) module @"tt-forge-graph" attributes {} { func.func @main(%arg0: tensor<1x784xf32> loc("MNISTLinear":4294967295:0), %arg1: tensor<1x10xf32> loc("MNISTLinear":4294967295:0), %arg2: tensor<256x10xf32> loc("MNISTLinear":4294967295:0), %arg3: tensor<1x256xf32> loc("MNISTLinear":4294967295:0), %arg4: tensor<784x256xf32> loc("MNISTLinear":4294967295:0)) -> tensor<1x10xf32> { - // CHECK: #[[LAYOUT_10:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x8>, memref<1x32xf32, #l1_>, block_sharded> - // CHECK: #[[LAYOUT_11:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<1x10xf32, #l1_>, block_sharded> + // CHECK: #[[LAYOUT_10:.*]] = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <1x8>, memref<1x32xf32, #l1_>, block_sharded> + // CHECK: #[[LAYOUT_11:.*]] = #tt.metal_layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<1x10xf32, #l1_>, block_sharded> %0 = tensor.empty() : tensor<1x256xf32> loc(#loc8) // CHECK: %[[C:.*]] = "ttnn.matmul"[[C:.*]] -> tensor<1x256xf32, #[[LAYOUT_10]]> %1 = "ttir.matmul"(%arg0, %arg4, %0) <{operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<1x784xf32>, tensor<784x256xf32>, tensor<1x256xf32>) -> tensor<1x256xf32> loc(#loc8) diff --git a/tools/explorer/tt_adapter/src/tt_adapter/mlir.py b/tools/explorer/tt_adapter/src/tt_adapter/mlir.py index 
5233e844c2..b9ae471ca5 100644 --- a/tools/explorer/tt_adapter/src/tt_adapter/mlir.py +++ b/tools/explorer/tt_adapter/src/tt_adapter/mlir.py @@ -348,7 +348,7 @@ def parse_dimension(attr): @AttrHandler.register_handler("tt.layout") def parse_tt_layout(attr): - layout = tt.ir.LayoutAttr.maybe_downcast(attr) + layout = tt.ir.MetalLayoutAttr.maybe_downcast(attr) result = [] result.append(graph_builder.KeyValue(key="linear", value=str(layout.linear))) result.append( From b5dfa741c10fc65a189c9f6e86eb2e05f63debb6 Mon Sep 17 00:00:00 2001 From: Sterling Taylor <166402033+staylorTT@users.noreply.github.com> Date: Wed, 27 Nov 2024 08:11:48 -0600 Subject: [PATCH 28/84] Update issue-last-updated.yml (#1405) replace null value with set value --- .github/workflows/issue-last-updated.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/issue-last-updated.yml b/.github/workflows/issue-last-updated.yml index 5fed3d1882..f79d16c2c5 100644 --- a/.github/workflows/issue-last-updated.yml +++ b/.github/workflows/issue-last-updated.yml @@ -66,7 +66,7 @@ jobs: # Construct JSON payload using jq for proper formatting JSON_PAYLOAD=$(jq -n \ --arg query "$QUERY" \ - --arg projectId "$PROJECT_ID" \ + --arg projectId "${{ env.project_id }}" \ --arg cursor "$CURSOR" \ '{ query: $query, variables: { projectId: $projectId, cursor: $cursor }}') From f8121bf34c1194733a8eae7342df351555ff7b83 Mon Sep 17 00:00:00 2001 From: Muhammad Asif Manzoor Date: Wed, 27 Nov 2024 09:23:44 -0500 Subject: [PATCH 29/84] Update hard coded path for ttrt (#1421) * Update path for hard coded ttmlir build directory to 'TTMLIR_BINARY_DIR'. --- runtime/tools/python/CMakeLists.txt | 1 + runtime/tools/python/setup.py | 47 ++++++++++++++++------------- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/runtime/tools/python/CMakeLists.txt b/runtime/tools/python/CMakeLists.txt index a4c7a51916..353ebbe7df 100644 --- a/runtime/tools/python/CMakeLists.txt +++ b/runtime/tools/python/CMakeLists.txt @@ -12,6 +12,7 @@ add_custom_target(ttrt TT_RUNTIME_ENABLE_PERF_TRACE=${TT_RUNTIME_ENABLE_PERF_TRACE} TT_RUNTIME_DEBUG=${TT_RUNTIME_DEBUG} TT_RUNTIME_WORKAROUNDS=${TT_RUNTIME_WORKAROUNDS} + TTMLIR_BINARY_DIR=${TTMLIR_BINARY_DIR} TTMLIR_VERSION_MAJOR=${TTMLIR_VERSION_MAJOR} TTMLIR_VERSION_MINOR=${TTMLIR_VERSION_MINOR} TTMLIR_VERSION_PATCH=${TTMLIR_VERSION_PATCH} diff --git a/runtime/tools/python/setup.py b/runtime/tools/python/setup.py index ddbe3da9fe..f5d148578b 100644 --- a/runtime/tools/python/setup.py +++ b/runtime/tools/python/setup.py @@ -18,6 +18,11 @@ "SOURCE_ROOT", os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "..", ".."), ) +# Use 'src_dir/build' as default location if TTMLIR_BINARY_DIR env variable is not available. 
+ttmlir_build_dir = os.environ.get( + "TTMLIR_BINARY_DIR", + os.path.join(src_dir, "build"), +) toolchain = os.environ.get("TTMLIR_TOOLCHAIN_DIR", "/opt/ttmlir-toolchain") metaldir = f"{src_dir}/third_party/tt-metal/src/tt-metal-build" ttmetalhome = os.environ.get("TT_METAL_HOME", "") @@ -37,12 +42,12 @@ include_dirs=[ f"{toolchain}/include", f"{src_dir}/runtime/include", - f"{src_dir}/build/include", - f"{src_dir}/build/include/ttmlir/Target/Common", + f"{ttmlir_build_dir}/include", + f"{ttmlir_build_dir}/include/ttmlir/Target/Common", ], libraries=["TTBinary", "flatbuffers"], library_dirs=[ - f"{src_dir}/build/runtime/lib", + f"{ttmlir_build_dir}/runtime/lib", f"{toolchain}/lib", ], define_macros=[("VERSION_INFO", __version__)], @@ -80,13 +85,13 @@ for dylib in runlibs: shutil.copy( f"{metaldir}/lib/{dylib}", - f"{src_dir}/build/runtime/tools/python/ttrt/runtime", + f"{ttmlir_build_dir}/runtime/tools/python/ttrt/runtime", ) command = [ "patchelf", "--set-rpath", "$ORIGIN", - f"{src_dir}/build/runtime/tools/python/ttrt/runtime/{dylib}", + f"{ttmlir_build_dir}/runtime/tools/python/ttrt/runtime/{dylib}", ] try: @@ -103,7 +108,7 @@ for dylib in perflibs: shutil.copy( f"{metaldir}/tools/profiler/bin/{dylib}", - f"{src_dir}/build/runtime/tools/python/ttrt/runtime", + f"{ttmlir_build_dir}/runtime/tools/python/ttrt/runtime", ) shutil.copy( f"{metaldir}/tools/profiler/bin/{dylib}", @@ -169,7 +174,7 @@ def tt_metal_ignore_folders(folder, contents): # copy metal dir folder shutil.copytree( f"{ttmetalhome}/tt_metal", - f"{src_dir}/build/runtime/tools/python/ttrt/runtime/tt_metal", + f"{ttmlir_build_dir}/runtime/tools/python/ttrt/runtime/tt_metal", dirs_exist_ok=True, ignore=tt_metal_ignore_folders, ) @@ -177,14 +182,14 @@ def tt_metal_ignore_folders(folder, contents): # copy runtime dir folder shutil.copytree( f"{ttmetalhome}/runtime", - f"{src_dir}/build/runtime/tools/python/ttrt/runtime/runtime", + f"{ttmlir_build_dir}/runtime/tools/python/ttrt/runtime/runtime", dirs_exist_ok=True, ) # copy kernels shutil.copytree( f"{ttmetalhome}/ttnn", - f"{src_dir}/build/runtime/tools/python/ttrt/runtime/ttnn", + f"{ttmlir_build_dir}/runtime/tools/python/ttrt/runtime/ttnn", dirs_exist_ok=True, ) @@ -198,16 +203,16 @@ def package_files(directory): return paths extra_files_tt_metal = package_files( - f"{src_dir}/build/runtime/tools/python/ttrt/runtime/tt_metal/" + f"{ttmlir_build_dir}/runtime/tools/python/ttrt/runtime/tt_metal/" ) extra_files_runtime = package_files( - f"{src_dir}/build/runtime/tools/python/ttrt/runtime/runtime/" + f"{ttmlir_build_dir}/runtime/tools/python/ttrt/runtime/runtime/" ) extra_files_ttnn = package_files( - f"{src_dir}/build/runtime/tools/python/ttrt/runtime/ttnn/" + f"{ttmlir_build_dir}/runtime/tools/python/ttrt/runtime/ttnn/" ) extra_files_tests = package_files( - f"{src_dir}/build/runtime/tools/python/ttrt/runtime/tests/" + f"{ttmlir_build_dir}/runtime/tools/python/ttrt/runtime/tests/" ) metallibs += extra_files_tt_metal @@ -222,18 +227,18 @@ def package_files(directory): include_dirs=[ f"{toolchain}/include", f"{src_dir}/runtime/include", - f"{src_dir}/build/include", - f"{src_dir}/build/include/ttmlir/Target/Common", + f"{ttmlir_build_dir}/include", + f"{ttmlir_build_dir}/include/ttmlir/Target/Common", ], libraries=["TTRuntime"] + linklibs + ["flatbuffers"], library_dirs=[ - f"{src_dir}/build/runtime/lib", - f"{src_dir}/build/runtime/lib/common", - f"{src_dir}/build/runtime/lib/ttnn", - f"{src_dir}/build/runtime/lib/ttnn/operations", - f"{src_dir}/build/runtime/lib/ttmetal", + 
f"{ttmlir_build_dir}/runtime/lib", + f"{ttmlir_build_dir}/runtime/lib/common", + f"{ttmlir_build_dir}/runtime/lib/ttnn", + f"{ttmlir_build_dir}/runtime/lib/ttnn/operations", + f"{ttmlir_build_dir}/runtime/lib/ttmetal", f"{toolchain}/lib", - f"{src_dir}/build/runtime/tools/python/ttrt/runtime", + f"{ttmlir_build_dir}/runtime/tools/python/ttrt/runtime", f"{metaldir}/lib", ], define_macros=[ From ae987d53acbdd3051634d944e92b249c1fddbd5c Mon Sep 17 00:00:00 2001 From: Marko Bezulj <156311081+mbezuljTT@users.noreply.github.com> Date: Thu, 28 Nov 2024 14:08:41 +0100 Subject: [PATCH 30/84] ttnn ops backend metal wrapper lib (#1230) TTNNOpModelLib initial version. To be used for op model interface (constraints, l1, perf). builds with -DTTMLIR_ENABLE_OPMODEL=ON. --- .github/workflows/build-and-test.yml | 13 +- CMakeLists.txt | 6 + include/ttmlir/OpModel/TTNN/TTNNOpModel.h | 24 +++ lib/CMakeLists.txt | 1 + lib/Dialect/TTNN/IR/CMakeLists.txt | 2 + lib/Dialect/TTNN/IR/TTNNOpModelInterface.cpp | 13 +- lib/OpModel/CMakeLists.txt | 1 + lib/OpModel/TTNN/CMakeLists.txt | 40 ++++ lib/OpModel/TTNN/TTNNOpModelLib.cpp | 183 +++++++++++++++++++ lib/OpModel/TTNN/TTNNOpModelLib_Impl.h | 60 ++++++ 10 files changed, 334 insertions(+), 9 deletions(-) create mode 100644 include/ttmlir/OpModel/TTNN/TTNNOpModel.h create mode 100644 lib/OpModel/CMakeLists.txt create mode 100644 lib/OpModel/TTNN/CMakeLists.txt create mode 100644 lib/OpModel/TTNN/TTNNOpModelLib.cpp create mode 100644 lib/OpModel/TTNN/TTNNOpModelLib_Impl.h diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index ade377c06a..32e4eee3d6 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -47,8 +47,9 @@ jobs: fail-fast: false matrix: build: [ - {runs-on: ubuntu-latest, enable_perf: OFF, name: "run", ttrt_flags: ""}, - {runs-on: ubuntu-latest, enable_perf: ON, name: "perf", ttrt_flags: ""}, + {runs-on: ubuntu-latest, enable_perf: OFF, enable_op_model: OFF, name: "run", ttrt_flags: ""}, + {runs-on: ubuntu-latest, enable_perf: ON, enable_op_model: OFF, name: "perf", ttrt_flags: ""}, + {runs-on: ubuntu-latest, enable_perf: OFF, enable_op_model: ON, name: "op_model" , ttrt_flags: ""} ] name: Build tt-mlir @@ -78,7 +79,7 @@ jobs: uses: hendrikmuhs/ccache-action@v1.2 with: create-symlink: true - key: ${{ matrix.build.runs-on }}-run-ON-perf-${{ matrix.build.enable_perf }}-${{ env.SDK_VERSION }} + key: ${{ matrix.build.runs-on }}-run-ON-perf-${{ matrix.build.enable_perf }}-op_model-${{ matrix.build.enable_op_model }}-${{ env.SDK_VERSION }} # Build project @@ -97,6 +98,7 @@ jobs: -DTTMLIR_ENABLE_RUNTIME_TESTS=ON \ -DTT_RUNTIME_ENABLE_PERF_TRACE=${{ matrix.build.enable_perf }} \ -DTTMLIR_ENABLE_STABLEHLO=ON \ + -DTTMLIR_ENABLE_OP_MODEL=${{ matrix.build.enable_op_model }} \ -S ${{ steps.strings.outputs.work-dir }} - name: Build @@ -147,7 +149,7 @@ jobs: - name: Upload Test Report uses: actions/upload-artifact@v4 with: - name: test-reports-${{ matrix.build.runs-on }}-perf-${{ matrix.build.enable_perf }} + name: test-reports-${{ matrix.build.runs-on }}-perf-${{ matrix.build.enable_perf }}-op_model-${{ matrix.build.enable_op_model }} path: build/test/report.xml - name: Show Test Report @@ -480,7 +482,7 @@ jobs: uses: hendrikmuhs/ccache-action@v1.2 with: create-symlink: true - key: ${{ matrix.build.runs-on }}-run-ON-perf-${{ matrix.build.enable_perf }}-${{ env.SDK_VERSION }} + key: ${{ matrix.build.runs-on }}-run-ON-perf-${{ matrix.build.enable_perf }}-op_model-${{ matrix.build.enable_op_model 
}}-${{ env.SDK_VERSION }} - name: Configure CMake shell: bash @@ -496,6 +498,7 @@ jobs: -DTTMLIR_ENABLE_RUNTIME_TESTS=OFF \ -DTT_RUNTIME_ENABLE_PERF_TRACE=${{ matrix.build.enable_perf }} \ -DTTMLIR_ENABLE_STABLEHLO=OFF \ + -DTTMLIR_ENABLE_OP_MODEL=${{ matrix.build.enable_op_model }} \ -S ${{ steps.strings.outputs.work-dir }} - name: Build tt-explorer diff --git a/CMakeLists.txt b/CMakeLists.txt index 54fcc89d47..2927fb5602 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,6 +8,7 @@ endif() option(TT_RUNTIME_ENABLE_PERF_TRACE "Enable performance mode" OFF) option(TTMLIR_ENABLE_RUNTIME "Enable runtime" OFF) option(TTMLIR_ENABLE_STABLEHLO "Enable StableHLO support" OFF) +option(TTMLIR_ENABLE_OP_MODEL "Enable OpModel support" OFF) if (TTMLIR_ENABLE_STABLEHLO) add_compile_definitions(TTMLIR_ENABLE_STABLEHLO) @@ -20,6 +21,11 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(TTMLIR_ENABLE_BINDINGS_PYTHON ON CACHE BOOL "Enable Python bindings") +if (APPLE) + set(TTMLIR_ENABLE_OP_MODEL OFF) + message(WARNING "TTNNOpModelLib is disabled on Apple platforms. Optimizer will not get true performance.") +endif() + list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake/modules) if (TT_RUNTIME_ENABLE_PERF_TRACE) diff --git a/include/ttmlir/OpModel/TTNN/TTNNOpModel.h b/include/ttmlir/OpModel/TTNN/TTNNOpModel.h new file mode 100644 index 0000000000..31ac149849 --- /dev/null +++ b/include/ttmlir/OpModel/TTNN/TTNNOpModel.h @@ -0,0 +1,24 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef TTMLIR_OPMODEL_TTNN_TTNNOPMODEL_H +#define TTMLIR_OPMODEL_TTNN_TTNNOPMODEL_H + +#include "ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.h" + +#include + +namespace mlir::tt::op_model::ttnn { + +struct ReluOpInterface { + static bool isLegal(const mlir::tt::ttnn::TTNNLayoutAttr &inputLayout, + const mlir::tt::ttnn::TTNNLayoutAttr &outputLayout); + + static std::tuple + getOpL1Usage(const mlir::tt::ttnn::TTNNLayoutAttr &inputLayout, + const mlir::tt::ttnn::TTNNLayoutAttr &outputLayout); +}; + +} // namespace mlir::tt::op_model::ttnn +#endif // TTMLIR_OPMODEL_TTNN_TTNNOPMODEL_H diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index c3dc3a4b71..881d6545dc 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -1,6 +1,7 @@ include_directories(${TTMLIR_TOOLCHAIN_DIR}/src/stablehlo) include_directories(${TTMLIR_TOOLCHAIN_DIR}/src/stablehlo-build) +add_subdirectory(OpModel) add_subdirectory(CAPI) add_subdirectory(Conversion) add_subdirectory(Dialect) diff --git a/lib/Dialect/TTNN/IR/CMakeLists.txt b/lib/Dialect/TTNN/IR/CMakeLists.txt index 1620e96b5c..4b7804a5fd 100644 --- a/lib/Dialect/TTNN/IR/CMakeLists.txt +++ b/lib/Dialect/TTNN/IR/CMakeLists.txt @@ -11,10 +11,12 @@ add_mlir_dialect_library(MLIRTTNNDialect DEPENDS MLIRTTNNOpsIncGen MLIRTTOpsIncGen + TTNNOpModelLib LINK_LIBS PUBLIC TTMLIRTTNNUtils MLIRSCFToEmitC MLIRLinalgDialect MLIRMLProgramDialect + TTNNOpModelLib ) diff --git a/lib/Dialect/TTNN/IR/TTNNOpModelInterface.cpp b/lib/Dialect/TTNN/IR/TTNNOpModelInterface.cpp index 9079a60194..344a4a4831 100644 --- a/lib/Dialect/TTNN/IR/TTNNOpModelInterface.cpp +++ b/lib/Dialect/TTNN/IR/TTNNOpModelInterface.cpp @@ -5,6 +5,9 @@ #include "ttmlir/Dialect/TTNN/IR/TTNNOps.h" #include "ttmlir/Dialect/TTNN/IR/TTNNOpModelInterface.cpp.inc" +#include "ttmlir/OpModel/TTNN/TTNNOpModel.h" + +#include #include namespace mlir::tt::ttnn { @@ -22,14 +25,16 @@ size_t ReluOp::getOpPerfCycles(const std::vector &input_layouts, std::tuple ReluOp::getOpL1Usage(const std::vector 
&input_layouts, const TTNNLayoutAttr &output_layout) { - // TODO(mbezulj) wire to tt-metal once we have API - return std::make_tuple(1024, 2048, 1024); + assert(input_layouts.size() == 1); + return op_model::ttnn::ReluOpInterface::getOpL1Usage(input_layouts[0], + output_layout); } bool ReluOp::isOpLegal(const std::vector &input_layouts, const TTNNLayoutAttr &output_layout) { - // TODO(mbezulj) wire to tt-metal once we have API - return true; + assert(input_layouts.size() == 1); + return op_model::ttnn::ReluOpInterface::isLegal(input_layouts[0], + output_layout); } } // namespace mlir::tt::ttnn diff --git a/lib/OpModel/CMakeLists.txt b/lib/OpModel/CMakeLists.txt new file mode 100644 index 0000000000..9c34667d09 --- /dev/null +++ b/lib/OpModel/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(TTNN) diff --git a/lib/OpModel/TTNN/CMakeLists.txt b/lib/OpModel/TTNN/CMakeLists.txt new file mode 100644 index 0000000000..094b9f1ddd --- /dev/null +++ b/lib/OpModel/TTNN/CMakeLists.txt @@ -0,0 +1,40 @@ +set(LIB_NAME TTNNOpModelLib) + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +set(SOURCES + TTNNOpModelLib.cpp +) +add_library(${LIB_NAME} STATIC ${SOURCES}) + +message(STATUS "TTMLIR_ENABLE_OP_MODEL[${TTMLIR_ENABLE_OP_MODEL}]") +if (TTMLIR_ENABLE_OPMODEL) + # Link to tt-metal libs and include directories + target_include_directories(${LIB_NAME} PUBLIC "$") + target_link_libraries(${LIB_NAME} PUBLIC TTNN_LIBRARY TTMETAL_LIBRARY) + target_compile_definitions(${LIB_NAME} PUBLIC TTMLIR_ENABLE_OPMODEL) +else() + # link stubs implementation when op model library is disabled + message(WARNING "TTNNOpModelLib is disabled. The optimizer will not achieve optimal performance.") +endif() + +# Specify the include directories for the library +target_include_directories(${LIB_NAME} + PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/ + ${PROJECT_SOURCE_DIR}/include/ttmlir/OpModel/TTNN/) + + +# Add TTNNOpModelLib to the export set +install(TARGETS ${LIB_NAME} + EXPORT TTNNOpModelLibTargets + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib + RUNTIME DESTINATION bin + INCLUDES DESTINATION include) + +# Export the targets +export(EXPORT TTNNOpModelLibTargets + FILE "${CMAKE_CURRENT_BINARY_DIR}/TTNNOpModelLibTargets.cmake" + NAMESPACE TTNN::) diff --git a/lib/OpModel/TTNN/TTNNOpModelLib.cpp b/lib/OpModel/TTNN/TTNNOpModelLib.cpp new file mode 100644 index 0000000000..87bfc04150 --- /dev/null +++ b/lib/OpModel/TTNN/TTNNOpModelLib.cpp @@ -0,0 +1,183 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "TTNNOpModel.h" + +#ifdef TTMLIR_ENABLE_OPMODEL +#include "TTNNOpModelLib_Impl.h" +#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h" + +#include +#include + +#include +#include +#endif // TTMLIR_ENABLE_OPMODEL + +namespace mlir::tt::op_model::ttnn { + +#ifdef TTMLIR_ENABLE_OPMODEL +// alias to a common tt_metal types +using DataType = ::tt::tt_metal::DataType; +using Layout = ::tt::tt_metal::Layout; +using CoreRange = ::tt::tt_metal::CoreRange; +using CoreRangeSet = ::tt::tt_metal::CoreRangeSet; +using CoreCoord = ::tt::tt_metal::CoreCoord; +using ShardSpec = ::tt::tt_metal::ShardSpec; +using ShardOrientation = ::tt::tt_metal::ShardOrientation; +using TensorMemoryLayout = ::tt::tt_metal::TensorMemoryLayout; +using MemoryConfig = ::tt::tt_metal::MemoryConfig; + +namespace detail { + +DataType getDataType(const mlir::MemRefType &memref) { + + auto dataType = elementTypeToDataType(memref.getElementType()); + + switch (dataType) { + case tt::DataType::Float32: + 
return DataType::FLOAT32; + case tt::DataType::BFloat16: + return DataType::BFLOAT16; + case tt::DataType::BFP_BFloat8: + return DataType::BFLOAT8_B; + case tt::DataType::BFP_BFloat4: + return DataType::BFLOAT4_B; + case tt::DataType::UInt32: + return DataType::UINT32; + case tt::DataType::UInt16: + return DataType::UINT16; + case tt::DataType::UInt8: + return DataType::UINT8; + default: + throw std::runtime_error("Invalid element type"); + } +} + +::ttnn::SimpleShape getTensorShape(const mlir::MemRefType &memref) { + ::tt::tt_metal::SmallVector small_vector_shape( + memref.getShape().begin(), memref.getShape().end()); + return ::ttnn::SimpleShape(small_vector_shape); +} + +const std::array +getShardShape(const mlir::tt::ttnn::TTNNLayoutAttr &layout) { + const auto layoutShardTile = layout.getShardShape(); + + if (layoutShardTile.size() != 2) { + llvm::errs() << "ERROR: layout_shard_tile.size() != 2\n"; + return {0, 0}; + } + + std::array shardShape; + shardShape[0] = layoutShardTile[0]; + shardShape[1] = layoutShardTile[1]; + return shardShape; +} + +Layout getTensorLayout(const mlir::tt::ttnn::TTNNLayoutAttr &layout) { + return layout.isTiled() ? Layout::TILE : Layout::ROW_MAJOR; +} + +CoreRangeSet getCoreRangeSet(const mlir::tt::ttnn::TTNNLayoutAttr &layout) { + // TODO(mbezulj): handle more complex grid shapes + // assuming grid shape is one rect starting at (0,0) + + const auto layoutGrid = layout.getGrid(); + + const auto layoutGridShape = layoutGrid.getShape(); + if (layoutGridShape.size() != 2) { + llvm::errs() << "ERROR: layout_grid.getShape().size() == 2\n"; + return {}; + } + + return CoreRangeSet(CoreRange(CoreCoord(0, layoutGridShape[0]), + CoreCoord(0, layoutGridShape[1]))); +} + +std::optional +layout_get_shard_spec(const mlir::tt::ttnn::TTNNLayoutAttr &layout) { + // tt_ShardOrientation is not part of ttnn::TTNNLayoutAttr; + // defaulting to ROW_MAJOR. TODO: figure out if we need to expose this + return isShardedMemoryLayout(layout.getMemLayout()) + ? 
std::make_optional(ShardSpec(getCoreRangeSet(layout), + getShardShape(layout), + ShardOrientation::ROW_MAJOR, false)) + : std::nullopt; +} + +::tt::tt_metal::BufferType getBufferType(const mlir::MemRefType &memref) { + auto memorySpace = + mlir::cast(memref.getMemorySpace()).getValue(); + + switch (memorySpace) { + case tt::MemorySpace::DeviceDRAM: + return ::tt::tt_metal::BufferType::DRAM; + case tt::MemorySpace::DeviceL1: + return ::tt::tt_metal::BufferType::L1; + default: // TODO(mbezulj): handle other memory spaces + throw std::runtime_error("Unsupported memory space"); + } +} + +::tt::tt_metal::TensorMemoryLayout +getTensorMemoryLayout(const mlir::tt::ttnn::TTNNLayoutAttr &layout) { + auto tensorMemoryLayout = layout.getMemLayout(); + + switch (tensorMemoryLayout) { + case mlir::tt::ttnn::TensorMemoryLayout::Interleaved: + return ::tt::tt_metal::TensorMemoryLayout::INTERLEAVED; + case mlir::tt::ttnn::TensorMemoryLayout::SingleBank: + return ::tt::tt_metal::TensorMemoryLayout::SINGLE_BANK; + case mlir::tt::ttnn::TensorMemoryLayout::HeightSharded: + return ::tt::tt_metal::TensorMemoryLayout::HEIGHT_SHARDED; + case mlir::tt::ttnn::TensorMemoryLayout::WidthSharded: + return ::tt::tt_metal::TensorMemoryLayout::WIDTH_SHARDED; + case mlir::tt::ttnn::TensorMemoryLayout::BlockSharded: + return ::tt::tt_metal::TensorMemoryLayout::BLOCK_SHARDED; + default: + throw std::runtime_error("Unsupported tensor memory layout"); + } +} + +::tt::tt_metal::MemoryConfig +getMemoryConfig(const mlir::tt::ttnn::TTNNLayoutAttr &layout) { + + auto tensorMemoryLayout = getTensorMemoryLayout(layout); + auto bufferType = getBufferType(layout.getMemref()); + + auto shardSpec = layout_get_shard_spec(layout); + return ::tt::tt_metal::MemoryConfig(tensorMemoryLayout, bufferType, + shardSpec); +} + +} // namespace detail +#endif // TTMLIR_ENABLE_OPMODEL + +//===----------------------------------------------------------------------===// +// ReluOp +//===----------------------------------------------------------------------===// + +bool ReluOpInterface::isLegal( + const mlir::tt::ttnn::TTNNLayoutAttr &inputLayout, + const mlir::tt::ttnn::TTNNLayoutAttr &outputLayout) { + +#ifdef TTMLIR_ENABLE_OPMODEL + return true; // to wire into tt-metal with the next uplift +#else + return true; +#endif // TTMLIR_ENABLE_OPMODEL +} + +std::tuple ReluOpInterface::getOpL1Usage( + const mlir::tt::ttnn::TTNNLayoutAttr &inputLayout, + const mlir::tt::ttnn::TTNNLayoutAttr &outputLayout) { +#ifdef TTMLIR_ENABLE_OPMODEL + return std::make_tuple(0, 0, 0); // to wire into tt-metal with the next uplift +#else + return std::make_tuple(0, 0, 0); +#endif // TTMLIR_ENABLE_OPMODEL +} + +} // namespace mlir::tt::op_model::ttnn diff --git a/lib/OpModel/TTNN/TTNNOpModelLib_Impl.h b/lib/OpModel/TTNN/TTNNOpModelLib_Impl.h new file mode 100644 index 0000000000..ed39d881a9 --- /dev/null +++ b/lib/OpModel/TTNN/TTNNOpModelLib_Impl.h @@ -0,0 +1,60 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef TTMLIR_OPMODEL_TTNN_TTNNOPMODELLIB_IMPL_H +#define TTMLIR_OPMODEL_TTNN_TTNNOPMODELLIB_IMPL_H + +// This header resolves tt-metal warnings that would otherwise be treated as +// errors in the MLIR build. Ensure that this is the only place where tt-metal +// headers are included. 
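// --- [Editorial sketch, not part of patch 30/84] ---------------------------
// A minimal illustration of how a caller (for example the optimizer) might
// consume the op model interface wired up in this patch. It assumes the
// ReluOp interface methods take std::vector<TTNNLayoutAttr> and return
// std::tuple<size_t, size_t, size_t>, as the stub implementations above
// suggest; the function and variable names here (estimateReluPeakL1, reluOp,
// inLayout, outLayout) are hypothetical and only for illustration.
// #include "ttmlir/Dialect/TTNN/IR/TTNNOps.h"
// #include <vector>
size_t estimateReluPeakL1(mlir::tt::ttnn::ReluOp reluOp,
                          mlir::tt::ttnn::TTNNLayoutAttr inLayout,
                          mlir::tt::ttnn::TTNNLayoutAttr outLayout) {
  std::vector<mlir::tt::ttnn::TTNNLayoutAttr> inputLayouts = {inLayout};
  if (!reluOp.isOpLegal(inputLayouts, outLayout)) {
    return 0; // layout combination rejected by the op model
  }
  // Returns (cb size, peak size, output size); with the current stubs these
  // are placeholder values until the tt-metal wiring lands in a follow-up.
  auto [cbSize, peakSize, outputSize] =
      reluOp.getOpL1Usage(inputLayouts, outLayout);
  (void)cbSize;
  (void)outputSize;
  return peakSize;
}
// ---------------------------------------------------------------------------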
+ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wcast-qual" +#pragma clang diagnostic ignored "-Wctad-maybe-unsupported" +#pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments" +#pragma clang diagnostic ignored "-Wignored-qualifiers" +#pragma clang diagnostic ignored "-Wvla-extension" +#pragma clang diagnostic ignored "-Wcovered-switch-default" +#pragma clang diagnostic ignored "-Wsign-compare" +#pragma clang diagnostic ignored "-Wc++20-extensions" +#pragma clang diagnostic ignored "-Wc++20-designator" +#pragma clang diagnostic ignored "-Wnon-virtual-dtor" +#pragma clang diagnostic ignored "-Wunused-variable" +#pragma clang diagnostic ignored "-Wunknown-warning-option" +#pragma clang diagnostic ignored "-Wsuggest-override" +#pragma clang diagnostic ignored "-Wgnu-anonymous-struct" +#pragma clang diagnostic ignored "-Wnested-anon-types" +#pragma clang diagnostic ignored "-Wreorder-ctor" +#pragma clang diagnostic ignored "-Wmismatched-tags" +#pragma clang diagnostic ignored "-Wunused-lambda-capture" +#pragma clang diagnostic ignored "-Wmissing-field-initializers" +#pragma clang diagnostic ignored "-Wunused-private-field" +#pragma clang diagnostic ignored "-Wimplicit-fallthrough" +#pragma clang diagnostic ignored "-Wstring-conversion" +#pragma clang diagnostic ignored "-Wunneeded-internal-declaration" +#pragma clang diagnostic ignored "-Wunused-local-typedef" +#pragma clang diagnostic ignored "-Wunused-function" +#pragma clang diagnostic ignored "-Wpessimizing-move" +#pragma clang diagnostic ignored "-Wparentheses" +#pragma clang diagnostic ignored "-Wdeprecated-volatile" +#pragma clang diagnostic ignored "-Wdeprecated-this-capture" +#pragma clang diagnostic ignored "-Wc++23-extensions" +#pragma clang diagnostic ignored "-Wunused-but-set-variable" +#pragma clang diagnostic ignored "-Wlogical-op-parentheses" +#pragma clang diagnostic ignored "-Wundefined-inline" +#pragma clang diagnostic ignored "-Wc99-extensions" +#pragma clang diagnostic ignored "-Wc++11-narrowing" +#pragma clang diagnostic ignored "-Wzero-length-array" +#pragma clang diagnostic ignored "-Wdeprecated-declarations" + +#define FMT_HEADER_ONLY + +#include "tt_metal/common/core_coord.hpp" +#include "tt_metal/impl/buffers/buffer.hpp" +#include "ttnn/tensor/tensor.hpp" +#include "ttnn/tensor/types.hpp" + +#pragma clang diagnostic pop + +#endif // TTMLIR_OPMODEL_TTNN_TTNNOPMODELLIB_IMPL_H From 0585abd15448b836c7e1d4c4ddf06cc0fd7e0525 Mon Sep 17 00:00:00 2001 From: Kristijan Mitrovic Date: Thu, 28 Nov 2024 14:32:12 +0100 Subject: [PATCH 31/84] Removed explicit values/enumeration in fbs enums (#1426) --- include/ttmlir/Target/Common/types.fbs | 94 ++++++++++++----------- include/ttmlir/Target/TTMetal/program.fbs | 28 +++---- include/ttmlir/Target/TTNN/program.fbs | 86 ++++++++++----------- 3 files changed, 105 insertions(+), 103 deletions(-) diff --git a/include/ttmlir/Target/Common/types.fbs b/include/ttmlir/Target/Common/types.fbs index 2d67ee1d1c..3e7ed425f7 100644 --- a/include/ttmlir/Target/Common/types.fbs +++ b/include/ttmlir/Target/Common/types.fbs @@ -11,67 +11,67 @@ struct Dim2dRange { } enum Arch: uint { - Grayskull = 0, - Wormhole_b0 = 1, - Blackhole = 2, + Grayskull, + Wormhole_b0, + Blackhole } enum DataType: uint16 { - Float32 = 0, - Float16 = 1, - BFloat16 = 2, - BFP_Float8 = 3, - BFP_BFloat8 = 4, - BFP_Float4 = 5, - BFP_BFloat4 = 6, - BFP_Float2 = 7, - BFP_BFloat2 = 8, - UInt32 = 9, - UInt16 = 10, - UInt8 = 11, + Float32, + Float16, + BFloat16, + BFP_Float8, + BFP_BFloat8, + 
BFP_Float4, + BFP_BFloat4, + BFP_Float2, + BFP_BFloat2, + UInt32, + UInt16, + UInt8, } enum OOBVal: ushort { - Undef = 0, - Zero = 1, - One = 2, - Inf = 3, - NegInf = 4, + Undef, + Zero, + One, + Inf, + NegInf, } enum MemorySpace: ushort { - System = 0, - SystemMMIO = 1, - DeviceDRAM = 2, - DeviceL1 = 3, + System, + SystemMMIO, + DeviceDRAM, + DeviceL1, } enum ChipCapability: uint32 (bit_flags) { - PCIE = 0, - HostMMIO = 1, + PCIE, + HostMMIO, } enum TensorMemoryLayout: ushort { - None = 0, - Interleaved = 1, - SingleBank = 2, - HeightSharded = 3, - WidthSharded = 4, - BlockSharded = 5, + None, + Interleaved, + SingleBank, + HeightSharded, + WidthSharded, + BlockSharded, } enum TensorLayout: ushort { - RowMajor = 0, - Tile = 1, - Invalid = 2, + RowMajor, + Tile, + Invalid, } enum BufferType: ushort { - DRAM = 0, - L1 = 1, - SystemMemory = 2, - L1Small = 3, - Trace = 4, + DRAM, + L1, + SystemMemory, + L1Small, + Trace, } // TODO (#620): Add other fields like core_ranges, shard orientation etc. @@ -197,8 +197,8 @@ table ChipPhysicalCores { enum CPURole: uint8 { - Host = 0, - Device = 1, + Host, + Device, } table CPUDesc { @@ -223,9 +223,11 @@ table EventRef { global_id: uint32; } +// Explicit non-sequential enumeration copied over from tt-metal definition of +// `enum class MathFidelity`. enum MathFidelity : uint8 { - LoFi = 0, - HiFi2 = 2, - HiFi3 = 3, - HiFi4 = 4, + LoFi = 0, + HiFi2 = 2, + HiFi3 = 3, + HiFi4 = 4, } diff --git a/include/ttmlir/Target/TTMetal/program.fbs b/include/ttmlir/Target/TTMetal/program.fbs index 4fcf966020..52451234b1 100644 --- a/include/ttmlir/Target/TTMetal/program.fbs +++ b/include/ttmlir/Target/TTMetal/program.fbs @@ -3,18 +3,18 @@ include "Common/types.fbs"; namespace tt.target.metal; enum NocIndex : ushort { - Noc0 = 0, - Noc1 = 1, + Noc0, + Noc1, } enum EthType : ushort { - Sender = 0, - Receiver = 1, + Sender, + Receiver, } enum UnpackToDestMode : uint8 { - UnpackToDestFp32 = 0, - Default = 1, + UnpackToDestFp32, + Default, } table NocConfig { @@ -45,17 +45,17 @@ table KernelSource { } enum BinaryType : ushort { - BRISC = 0, - NCRISC = 1, - TRISC0 = 2, - TRISC1 = 3, - TRISC2 = 4, - ERISC = 5, + BRISC, + NCRISC, + TRISC0, + TRISC1, + TRISC2, + ERISC, } enum CoreType : ushort { - WORKER = 0, - ETH = 1, + WORKER, + ETH, } table KernelBinary { diff --git a/include/ttmlir/Target/TTNN/program.fbs b/include/ttmlir/Target/TTNN/program.fbs index 5f486bac93..39535e2f0b 100644 --- a/include/ttmlir/Target/TTNN/program.fbs +++ b/include/ttmlir/Target/TTNN/program.fbs @@ -72,46 +72,46 @@ table ArangeOp { } enum EltwiseOpType: uint32 { - Add = 0, - Multiply = 1, - Subtract = 2, - Relu = 3, - GreaterEqual = 4, - Sqrt = 5, - Div = 6, - Sigmoid = 7, - Reciprocal = 8, - Exp = 9, - Maximum = 10, - Abs = 11, - Neg = 12, - Rsqrt = 13, - Typecast = 14, - Equal = 15, - NotEqual = 16, - LessEqual = 17, - LessThan = 18, - GreaterThan = 19, - LogicalAnd = 20, - LogicalOr = 21, - LogicalNot = 22, - Cbrt = 23, - Minimum = 24, - Ceil = 25, - Sin = 26, - Cos = 27, - Log = 28, - Log1p = 29, - Expm1 = 30, - Sign = 31, - Remainder = 32, - IsFinite = 33, - Floor = 34, - Where = 35, - Gelu = 36, - LogicalXor = 37, - Clamp = 38, - LeakyRelu = 39, + Add, + Multiply, + Subtract, + Relu, + GreaterEqual, + Sqrt, + Div, + Sigmoid, + Reciprocal, + Exp, + Maximum, + Abs, + Neg, + Rsqrt, + Typecast, + Equal, + NotEqual, + LessEqual, + LessThan, + GreaterThan, + LogicalAnd, + LogicalOr, + LogicalNot, + Cbrt, + Minimum, + Ceil, + Sin, + Cos, + Log, + Log1p, + Expm1, + Sign, + Remainder, + IsFinite, + 
Floor, + Where, + Gelu, + LogicalXor, + Clamp, + LeakyRelu, } table ClampOpParams { @@ -136,9 +136,9 @@ table EltwiseOp { } enum ReductionOpType: uint32 { - Sum = 0, - Mean = 1, - Max = 2, + Sum, + Mean, + Max, } table ReductionOp { From 87cdd07da1eb16ba2c8528a8c3c837fe0b370a5e Mon Sep 17 00:00:00 2001 From: Vladimir Milosevic <157983820+vmilosevic@users.noreply.github.com> Date: Thu, 28 Nov 2024 17:30:33 +0100 Subject: [PATCH 32/84] Dissable uplift auto-merge (#1362) Disable auto-merge for now until we are more confident that uplift won't break the downstream projects --- .github/workflows/nightly-uplift.yml | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/workflows/nightly-uplift.yml b/.github/workflows/nightly-uplift.yml index b8dbf3d05c..54dd758aed 100644 --- a/.github/workflows/nightly-uplift.yml +++ b/.github/workflows/nightly-uplift.yml @@ -62,8 +62,11 @@ jobs: echo "Pull Request URL - ${{ steps.create-pr.outputs.pull-request-url }}" gh pr review ${{ steps.create-pr.outputs.pull-request-number }} --approve - - name: Enable Pull Request Automerge - if: ${{ steps.create-pr.outputs.pull-request-number }} - run: gh pr merge --squash --auto "${{ steps.create-pr.outputs.pull-request-number }}" - env: - GH_TOKEN: ${{ secrets.GH_TOKEN }} + # Note: Dissable auto-merge for now until we are more confident + # that uplift won't break the downstream projects + # + # - name: Enable Pull Request Automerge + # if: ${{ steps.create-pr.outputs.pull-request-number }} + # run: gh pr merge --squash --auto "${{ steps.create-pr.outputs.pull-request-number }}" + # env: + # GH_TOKEN: ${{ secrets.GH_TOKEN }} From 5272015b77d525daaf86565f5eac2f2221bd39b8 Mon Sep 17 00:00:00 2001 From: Vladimir Milosevic <157983820+vmilosevic@users.noreply.github.com> Date: Fri, 29 Nov 2024 00:32:44 +0100 Subject: [PATCH 33/84] Uplift third_party/tt-metal to efc6f706bf943e581e45d3b05e1129fbc857ba94 2024-11-28 (#1387) * Uplift third_party/tt-metal to efc6f706bf943e581e45d3b05e1129fbc857ba94 2024-11-28 * Set new field enable_channels_padding=false on ::ttnn::operations::conv::conv2d::determine_parallel_config() * Uplift third_party/tt-metal to d183d61dc5 2024-11-27 (+ 2 cherry picks) * Uplift third_party/tt-metal to ed6dda9bd5 2024-11-27 (+ 2 cherry picks) - https://github.com/tenstorrent/tt-metal/issues/15510 (CB size assert) - https://github.com/tenstorrent/tt-metal/issues/15297 (GS/BH 1x1 MeshDevice) --------- Co-authored-by: kmitrovicTT <169657397+kmitrovicTT@users.noreply.github.com> Co-authored-by: Kyle Mabee --- runtime/lib/ttnn/operations/pool/maxpool2d.cpp | 5 ++++- third_party/CMakeLists.txt | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/runtime/lib/ttnn/operations/pool/maxpool2d.cpp b/runtime/lib/ttnn/operations/pool/maxpool2d.cpp index dfd8b9375e..4fc6fca87f 100644 --- a/runtime/lib/ttnn/operations/pool/maxpool2d.cpp +++ b/runtime/lib/ttnn/operations/pool/maxpool2d.cpp @@ -31,11 +31,14 @@ preshardForMaxPool2d(const ::tt::target::ttnn::MaxPool2dOp *op, op->dilation_width() * (op->kernel_width() - 1) - 1) / op->stride_width(); + constexpr bool en_ch_padding = false; + auto parallel_config = ::ttnn::operations::conv::conv2d::determine_parallel_config( ::ttnn::TensorMemoryLayout::HEIGHT_SHARDED, op->batch_size(), op->channels(), output_height, output_width, op->channels(), - device.compute_with_storage_grid_size(), ShardOrientation::ROW_MAJOR); + device.compute_with_storage_grid_size(), ShardOrientation::ROW_MAJOR, + en_ch_padding); auto sharded_memory_config = 
::ttnn::operations::conv::conv2d:: create_sharded_memory_config_from_parallel_config(inputShape, parallel_config, 1); diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt index c9ff431bf1..e033913e24 100644 --- a/third_party/CMakeLists.txt +++ b/third_party/CMakeLists.txt @@ -1,6 +1,6 @@ include(ExternalProject) -set(TT_METAL_VERSION "69870bdeaf1c9270e325810249def6a3e9f38fb4") +set(TT_METAL_VERSION "82ba2cbad64d1e36cad446d1f2f9bd266883ae74") if ("$ENV{ARCH_NAME}" STREQUAL "grayskull") set(ARCH_NAME "grayskull") From 3eca3d82086c4aea361db346765d845da0fbcbb8 Mon Sep 17 00:00:00 2001 From: Sanja Djukic Date: Fri, 29 Nov 2024 12:07:15 +0100 Subject: [PATCH 34/84] TOSA to TTIR refactor: split pass and patterns into separate files (#1418) * tosa to ttir refactor: split pass and patterns into separate files * removed unused #include directives * fixed the order of include directives, changed the name of the default dsp pattern * added virtual checkConversionLegality and separated multiply op conversion pattern * refactor for cleaner code: using the constructor of the base class for mulop pattern * fixed formatting with pre-commit run --- .../ttmlir/Conversion/TosaToTTIR/TosaToTTIR.h | 6 +- lib/Conversion/TosaToTTIR/CMakeLists.txt | 3 +- lib/Conversion/TosaToTTIR/TosaToTTIR.cpp | 122 ----------------- lib/Conversion/TosaToTTIR/TosaToTTIRPass.cpp | 74 ++++++++++ .../TosaToTTIR/TosaToTTIRPatterns.cpp | 126 ++++++++++++++++++ 5 files changed, 207 insertions(+), 124 deletions(-) delete mode 100644 lib/Conversion/TosaToTTIR/TosaToTTIR.cpp create mode 100644 lib/Conversion/TosaToTTIR/TosaToTTIRPass.cpp create mode 100644 lib/Conversion/TosaToTTIR/TosaToTTIRPatterns.cpp diff --git a/include/ttmlir/Conversion/TosaToTTIR/TosaToTTIR.h b/include/ttmlir/Conversion/TosaToTTIR/TosaToTTIR.h index acd5373c90..5f1feb08b2 100644 --- a/include/ttmlir/Conversion/TosaToTTIR/TosaToTTIR.h +++ b/include/ttmlir/Conversion/TosaToTTIR/TosaToTTIR.h @@ -7,11 +7,15 @@ #include "mlir/IR/BuiltinOps.h" #include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" namespace mlir::tt { +void populateTosaToTTIRPatterns(MLIRContext *ctx, RewritePatternSet &patterns, + TypeConverter &typeConverter); + std::unique_ptr> createConvertTosaToTTIRPass(); } // namespace mlir::tt -#endif +#endif // TTMLIR_CONVERSION_TOSATOTTIR_TOSATOTTIR_H diff --git a/lib/Conversion/TosaToTTIR/CMakeLists.txt b/lib/Conversion/TosaToTTIR/CMakeLists.txt index 41baf75c67..56000eb652 100644 --- a/lib/Conversion/TosaToTTIR/CMakeLists.txt +++ b/lib/Conversion/TosaToTTIR/CMakeLists.txt @@ -1,5 +1,6 @@ add_mlir_library(TTMLIRTosaToTTIR - TosaToTTIR.cpp + TosaToTTIRPass.cpp + TosaToTTIRPatterns.cpp ADDITIONAL_HEADER_DIRS ${PROJECT_SOURCE_DIR}/include/ttmlir/Conversion/TosaToTTIR diff --git a/lib/Conversion/TosaToTTIR/TosaToTTIR.cpp b/lib/Conversion/TosaToTTIR/TosaToTTIR.cpp deleted file mode 100644 index 6c6a7faf56..0000000000 --- a/lib/Conversion/TosaToTTIR/TosaToTTIR.cpp +++ /dev/null @@ -1,122 +0,0 @@ -// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 - -#include "ttmlir/Conversion/TosaToTTIR/TosaToTTIR.h" -#include "ttmlir/Dialect/TT/IR/TT.h" -#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h" -#include "ttmlir/Dialect/TTIR/IR/TTIR.h" -#include "ttmlir/Dialect/TTIR/IR/TTIROps.h" - -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/Func/Transforms/FuncConversions.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/Dialect/Tosa/IR/TosaOps.h" -#include 
"mlir/IR/BuiltinAttributes.h" -#include "mlir/IR/BuiltinOps.h" -#include "mlir/IR/Dialect.h" -#include "mlir/IR/PatternMatch.h" -#include "mlir/IR/ValueRange.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Support/LogicalResult.h" -#include "mlir/Transforms/DialectConversion.h" - -using namespace mlir; -using namespace tt; - -namespace mlir::tt::ttir { - -#define GEN_PASS_DEF_CONVERTTOSATOTTIR -#include "ttmlir/Conversion/Passes.h.inc" - -} // namespace mlir::tt::ttir - -namespace { - -template -class TosaToTTIROpConversionPattern : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - -public: - LogicalResult - matchAndRewrite(SrcOp srcOp, Adaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - if constexpr (std::is_same::value) { - assert(srcOp.getShift() == 0); - } - - auto outputType = mlir::cast(srcOp.getResult().getType()); - auto outputTensor = rewriter.create( - srcOp.getLoc(), outputType.getShape(), outputType.getElementType()); - rewriter.replaceOpWithNewOp( - srcOp, TypeRange(outputTensor.getType()), adaptor.getOperands(), - ValueRange(outputTensor), - rewriter.getArrayAttr( - SmallVector(adaptor.getOperands().size() + 1, - rewriter.getAttr( - OperandConstraint::AnyDeviceTile)))); - return success(); - } -}; - -struct ConvertTosaToTTIRPass - : public ttir::impl::ConvertTosaToTTIRBase { - void runOnOperation() override { - mlir::ConversionTarget target(getContext()); - - target.addIllegalDialect(); - - target.addLegalDialect(); - target.addLegalOp(); - target.addLegalOp(); - target.addLegalOp(); - target.addLegalOp(); - - // For now keep the same type assuming tosa ops operate on builtin tensor. - TypeConverter typeConverter; - typeConverter.addConversion([](Type type) { - assert(isa(type) && - "only ranked tensor type supported"); - return type; - }); - RewritePatternSet patterns(&getContext()); - - // Add conversion patterns. - patterns - .add>( - typeConverter, &getContext()); - patterns - .add>( - typeConverter, &getContext()); - patterns.add< - TosaToTTIROpConversionPattern>( - typeConverter, &getContext()); - patterns.add< - TosaToTTIROpConversionPattern>( - typeConverter, &getContext()); - patterns.add< - TosaToTTIROpConversionPattern>( - typeConverter, &getContext()); - patterns.add>( - typeConverter, &getContext()); - - // Apply conversion. 
- if (failed( - applyFullConversion(getOperation(), target, std::move(patterns)))) { - signalPassFailure(); - return; - } - } -}; - -} // namespace - -namespace mlir::tt { - -std::unique_ptr> createConvertTosaToTTIRPass() { - return std::make_unique(); -} - -} // namespace mlir::tt diff --git a/lib/Conversion/TosaToTTIR/TosaToTTIRPass.cpp b/lib/Conversion/TosaToTTIR/TosaToTTIRPass.cpp new file mode 100644 index 0000000000..183d58ccaa --- /dev/null +++ b/lib/Conversion/TosaToTTIR/TosaToTTIRPass.cpp @@ -0,0 +1,74 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/Func/Transforms/FuncConversions.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/Tosa/IR/TosaOps.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/Dialect.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LogicalResult.h" +#include "mlir/Transforms/DialectConversion.h" + +#include "ttmlir/Conversion/TosaToTTIR/TosaToTTIR.h" +#include "ttmlir/Dialect/TTIR/IR/TTIR.h" + +using namespace mlir; +using namespace mlir::tt; + +namespace mlir::tt::ttir { + +#define GEN_PASS_DEF_CONVERTTOSATOTTIR +#include "ttmlir/Conversion/Passes.h.inc" + +} // namespace mlir::tt::ttir + +namespace { + +struct ConvertTosaToTTIRPass + : public ttir::impl::ConvertTosaToTTIRBase { + void runOnOperation() override { + mlir::ConversionTarget target(getContext()); + + target.addIllegalDialect(); + + target.addLegalDialect(); + target.addLegalOp(); + target.addLegalOp(); + target.addLegalOp(); + target.addLegalOp(); + + // For now keep the same type assuming tosa ops operate on builtin tensor. + TypeConverter typeConverter; + typeConverter.addConversion([](Type type) { + assert(isa(type) && + "only ranked tensor type supported"); + return type; + }); + RewritePatternSet patterns(&getContext()); + + // Add conversion patterns. + populateTosaToTTIRPatterns(&getContext(), patterns, typeConverter); + + // Apply conversion. 
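// --- [Editorial sketch, not part of patch 34/84] ---------------------------
// Splitting the patterns out of the pass means other pipelines can reuse
// them without instantiating ConvertTosaToTTIRPass. A minimal sketch, under
// the assumption that only populateTosaToTTIRPatterns() (declared in
// TosaToTTIR.h in this patch) is needed; the name buildCombinedPatterns is
// hypothetical.
// #include "ttmlir/Conversion/TosaToTTIR/TosaToTTIR.h"
void buildCombinedPatterns(mlir::MLIRContext *ctx,
                           mlir::TypeConverter &typeConverter,
                           mlir::RewritePatternSet &patterns) {
  // TOSA -> TTIR patterns from this refactor, to which a caller can append
  // any other pattern sets before a single applyFullConversion() run.
  mlir::tt::populateTosaToTTIRPatterns(ctx, patterns, typeConverter);
}
// ---------------------------------------------------------------------------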
+ if (failed( + applyFullConversion(getOperation(), target, std::move(patterns)))) { + signalPassFailure(); + return; + } + } +}; + +} // namespace + +namespace mlir::tt { + +std::unique_ptr> createConvertTosaToTTIRPass() { + return std::make_unique(); +} + +} // namespace mlir::tt diff --git a/lib/Conversion/TosaToTTIR/TosaToTTIRPatterns.cpp b/lib/Conversion/TosaToTTIR/TosaToTTIRPatterns.cpp new file mode 100644 index 0000000000..46eadb7899 --- /dev/null +++ b/lib/Conversion/TosaToTTIR/TosaToTTIRPatterns.cpp @@ -0,0 +1,126 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "mlir/Dialect/Func/Transforms/FuncConversions.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/Tosa/IR/TosaOps.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/ValueRange.h" +#include "mlir/Support/LogicalResult.h" +#include "mlir/Transforms/DialectConversion.h" + +#include "ttmlir/Conversion/TosaToTTIR/TosaToTTIR.h" +#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h" +#include "ttmlir/Dialect/TTIR/IR/TTIROps.h" + +using namespace mlir; +using namespace mlir::tt; + +namespace { + +// TODO(sdjukic): extract this pattern into separate file and use it for both +// TOSA and StableHLO + +template +class TosaToTTIRDefaultDPSOpConversionPattern + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + +public: + LogicalResult + matchAndRewrite(SrcOp srcOp, Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + LogicalResult legalityResult = + checkConversionLegality(srcOp, adaptor, rewriter); + if (!legalityResult.succeeded()) { + return legalityResult; + } + + RankedTensorType outputType = + mlir::cast(srcOp.getResult().getType()); + tensor::EmptyOp outputTensor = rewriter.create( + srcOp.getLoc(), outputType.getShape(), outputType.getElementType()); + rewriter.replaceOpWithNewOp( + srcOp, TypeRange(outputTensor.getType()), adaptor.getOperands(), + ValueRange(outputTensor), + rewriter.getArrayAttr( + SmallVector(adaptor.getOperands().size() + 1, + rewriter.getAttr( + OperandConstraint::AnyDeviceTile)))); + return success(); + } + +private: + virtual LogicalResult + checkConversionLegality(SrcOp srcOp, Adaptor adaptor, + ConversionPatternRewriter &rewriter) const { + return success(); + } +}; + +class TosaToTTIRMultiplyOpConversionPattern + : public TosaToTTIRDefaultDPSOpConversionPattern< + tosa::MulOp, mlir::tt::ttir::MultiplyOp> { + using TosaToTTIRDefaultDPSOpConversionPattern< + tosa::MulOp, + mlir::tt::ttir::MultiplyOp>::TosaToTTIRDefaultDPSOpConversionPattern; + +private: + LogicalResult + checkConversionLegality(tosa::MulOp srcOp, tosa::MulOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + if (srcOp.getShift() != 0) { + return rewriter.notifyMatchFailure( + srcOp, "TTIR MultiplyOp doesn't support shifted multiply."); + } + return success(); + } +}; + +void addElementwiseUnaryOpsConversionPatterns(MLIRContext *ctx, + RewritePatternSet &patterns, + TypeConverter &typeConverter) { + + patterns.add>( + typeConverter, ctx); + patterns.add>( + typeConverter, ctx); +} + +void addElementwiseBinaryOpsConversionPatterns(MLIRContext *ctx, + RewritePatternSet &patterns, + TypeConverter &typeConverter) { + patterns.add>( + typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add>(typeConverter, ctx); +} + +void addCompareOpsConversionPatterns(MLIRContext *ctx, + RewritePatternSet &patterns, + TypeConverter 
&typeConverter) { + patterns.add>(typeConverter, + ctx); +} + +} // namespace + +namespace mlir::tt { + +void populateTosaToTTIRPatterns(MLIRContext *ctx, RewritePatternSet &patterns, + TypeConverter &typeConverter) { + addElementwiseUnaryOpsConversionPatterns(ctx, patterns, typeConverter); + addElementwiseBinaryOpsConversionPatterns(ctx, patterns, typeConverter); + addCompareOpsConversionPatterns(ctx, patterns, typeConverter); +} + +} // namespace mlir::tt From 99331c74676abf251ee807e74c2e45319b715ed4 Mon Sep 17 00:00:00 2001 From: Vladimir Canic <133228576+vcanicTT@users.noreply.github.com> Date: Fri, 29 Nov 2024 17:18:34 +0100 Subject: [PATCH 35/84] Override mechanism (#1417) * Add mechanism for overriding with corresponding unittests. * Add mechanism for overriding with corresponding unittests. * Add mechanism for overriding with corresponding unittests. * Add mechanism for overriding with corresponding unittests. * Add mechanism for overriding with corresponding unittests. * Add mechanism for overriding with corresponding unittests. * Add mechanism for overriding with corresponding unittests. * Add mechanism for overriding with corresponding unittests. * Add mechanism for overriding with corresponding unittests. * Add mechanism for overriding with corresponding unittests. * Add mechanism for overriding with corresponding unittests. * Add mechanism for overriding with corresponding unittests. * Add mechanism for overriding with corresponding unittests. * Add mechanism for overriding with corresponding unittests. --- .../TT/Utils/MemoryLayoutAnalysisParams.h | 17 +- .../Dialect/TTNN/Pipelines/TTNNPipelines.h | 3 +- .../Dialect/TTNN/Utils/OptimizerOverrides.h | 116 +++-- .../ttmlir/Dialect/TTNN/Utils/PassOverrides.h | 91 ++++ include/ttmlir/Dialect/TTNN/Utils/Utils.h | 2 + lib/Dialect/TTNN/Utils/CMakeLists.txt | 1 + lib/Dialect/TTNN/Utils/OptimizerOverrides.cpp | 316 ++++++------- lib/Dialect/TTNN/Utils/PassOverrides.cpp | 206 +++++++++ test/unittests/Optimizer/CMakeLists.txt | 1 + .../Optimizer/TestOptimizerOverrides.cpp | 433 ++++++++++++++++++ 10 files changed, 980 insertions(+), 206 deletions(-) create mode 100644 include/ttmlir/Dialect/TTNN/Utils/PassOverrides.h create mode 100644 lib/Dialect/TTNN/Utils/PassOverrides.cpp create mode 100644 test/unittests/Optimizer/TestOptimizerOverrides.cpp diff --git a/include/ttmlir/Dialect/TT/Utils/MemoryLayoutAnalysisParams.h b/include/ttmlir/Dialect/TT/Utils/MemoryLayoutAnalysisParams.h index 16fafe551a..4a44e883da 100644 --- a/include/ttmlir/Dialect/TT/Utils/MemoryLayoutAnalysisParams.h +++ b/include/ttmlir/Dialect/TT/Utils/MemoryLayoutAnalysisParams.h @@ -27,18 +27,23 @@ struct MemoryLayoutAnalysisPolicyTypeParser return false; } - static void print(llvm::raw_ostream &os, - const MemoryLayoutAnalysisPolicyType &value) { - llvm::StringRef policy; + static std::string toString(const MemoryLayoutAnalysisPolicyType &value) { + std::string res; switch (value) { case MemoryLayoutAnalysisPolicyType::DFSharding: - policy = "DFSharding"; + res += "DFSharding"; break; case MemoryLayoutAnalysisPolicyType::L1Interleaved: - policy = "L1Interleaved"; + res += "L1Interleaved"; break; } - os << "memory-layout-analysis-policy=" << policy << "\n"; + return res; + } + + static void print(llvm::raw_ostream &os, + const MemoryLayoutAnalysisPolicyType &value) { + os << "memory-layout-analysis-policy=" + << MemoryLayoutAnalysisPolicyTypeParser::toString(value) << "\n"; } }; diff --git a/include/ttmlir/Dialect/TTNN/Pipelines/TTNNPipelines.h 
b/include/ttmlir/Dialect/TTNN/Pipelines/TTNNPipelines.h index 48c723e1cd..636d5f6238 100644 --- a/include/ttmlir/Dialect/TTNN/Pipelines/TTNNPipelines.h +++ b/include/ttmlir/Dialect/TTNN/Pipelines/TTNNPipelines.h @@ -6,7 +6,8 @@ #define TTMLIR_DIALECT_TTNN_PIPELINES_TTNNPIPELINES_H #include "ttmlir/Dialect/TT/Utils/MemoryLayoutAnalysisParams.h" -#include "ttmlir/Dialect/TTNN/Utils/OptimizerOverrides.h" +#include "ttmlir/Dialect/TTNN/Utils/PassOverrides.h" +#include "ttmlir/Dialect/TTNN/Utils/Utils.h" #include "mlir/Pass/PassOptions.h" diff --git a/include/ttmlir/Dialect/TTNN/Utils/OptimizerOverrides.h b/include/ttmlir/Dialect/TTNN/Utils/OptimizerOverrides.h index db24eeb287..c474106e3a 100644 --- a/include/ttmlir/Dialect/TTNN/Utils/OptimizerOverrides.h +++ b/include/ttmlir/Dialect/TTNN/Utils/OptimizerOverrides.h @@ -5,50 +5,98 @@ #ifndef TTMLIR_DIALECT_TTNN_UTILS_OPTIMIZEROVERRIDES_H #define TTMLIR_DIALECT_TTNN_UTILS_OPTIMIZEROVERRIDES_H -#include - -#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h" -#include "ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.h" +#include "ttmlir/Dialect/TT/Utils/MemoryLayoutAnalysisParams.h" +#include "ttmlir/Dialect/TTNN/Pipelines/TTNNPipelines.h" +#include "ttmlir/Dialect/TTNN/Utils/PassOverrides.h" namespace mlir::tt::ttnn { -struct OutputLayoutOverrideParams { - SmallVector grid; - BufferType bufferType; - TensorMemoryLayout tensorMemoryLayout; // INTERLEAVED / SHARDED etc... - Layout memoryLayout; // ROW_MAJOR / TILE - tt::DataType dataType; -}; +class OptimizerOverridesHandler { +public: + OptimizerOverridesHandler() {}; + ~OptimizerOverridesHandler() {}; -struct InputLayoutOverrideParams { - SmallVector operandIdxes; -}; + // Setters for the overrides + // These are used to enable/disable the optimizer passes + void setEnableOptimizer(bool); + // These are used to enable/disable the memory configurations + void setMemoryReconfig(bool); + void setEnableMemoryLayoutAnalysis(bool); + void setEnableMemoryLayoutAnalysisPolicy(bool); + void setMemoryLayoutAnalysisPolicy(MemoryLayoutAnalysisPolicyType); + // These are used to set the input/output layout overrides + void setInputLayoutOverrides(llvm::StringMap &); + void setOutputLayoutOverrides(llvm::StringMap &); + // These are used to add system descriptor path + void setSystemDescPath(std::string); + // These are used to set the maximum number of legal layouts for grid analysis + void setMaxLegalLayouts(int64_t); + // These are used to set the mesh shape + void setMeshShape(std::vector); -struct OutputLayoutOverrideParser - : public llvm::cl::parser> { -public: - OutputLayoutOverrideParser(llvm::cl::Option &opt) - : llvm::cl::parser>(opt) {} + // Getters for the overrides + // These are used to get the current state of the optimizer passes + bool getEnableOptimizer() const; + // These are used to get the current state of the memory configurations + bool getMemoryReconfig() const; + bool getEnableMemoryLayoutAnalysis() const; + bool getEnableMemoryLayoutAnalysisPolicy() const; + MemoryLayoutAnalysisPolicyType getMemoryLayoutAnalysisPolicy() const; + // These are used to get the current input/output layout overrides + llvm::StringMap getInputLayoutOverrides() const; + llvm::StringMap getOutputLayoutOverrides() const; + // These are used to get the current system descriptor path + std::string getSystemDescPath() const; + // These are used to get the current maximum number of legal layouts for grid + // analysis + int64_t getMaxLegalLayouts() const; + // These are used to get the current mesh shape + std::vector 
getMeshShape() const; - bool parse(llvm::cl::Option &opt, StringRef argName, StringRef arg, - llvm::StringMap &value); + // Method that converts the overrides to a string + std::string toString() const; - static void print(llvm::raw_ostream &os, - const llvm::StringMap &value); -}; + // Fill input/output layout overrides maps. + // This is used from tt-forge frontend where we define and compile the models. + void addInputLayoutOverride(StringRef, InputLayoutOverrideParams); + void addInputLayoutOverride(StringRef, SmallVector &); + void addOutputLayoutOverride(StringRef, OutputLayoutOverrideParams); + void addOutputLayoutOverride(StringRef, SmallVector &, BufferType, + TensorMemoryLayout, tt::ttnn::Layout, + tt::DataType); -struct InputLayoutOverrideParser - : public llvm::cl::parser> { -public: - InputLayoutOverrideParser(llvm::cl::Option &opt) - : llvm::cl::parser>(opt) {} +private: + // Options for the TTIR to TTNN backend pipeline, + // we use them to extract the names and the deafulat values. + TTIRToTTNNBackendPipelineOptions pipelineOptions; + + // Flags for enabling/disabling the optimizer passes + bool enableOptimizer = false; + + // Flags for enabling/disabling the memory configurations + bool enableMemoryReconfig = true; + bool enableMemoryLayoutAnalysis = false; + + // Input layout overrides + llvm::StringMap inputLayoutOverrides; + + // Output layout overrides + llvm::StringMap outputLayoutOverrides; + + // Memory layout analysis policy + bool enableMemoryLayoutAnalysisPolicy = false; + MemoryLayoutAnalysisPolicyType memoryLayoutAnalysisPolicy; + + // System descriptor path + std::string systemDescPath; + + // Maximum number of legal layouts for grid analysis + int64_t maxLegalLayouts = 0; - bool parse(llvm::cl::Option &opt, StringRef argName, StringRef arg, - llvm::StringMap &value); + // Mesh shape + std::vector meshShape; - static void print(llvm::raw_ostream &os, - const llvm::StringMap &value); -}; +}; // class OptimizerOverridesHandler } // namespace mlir::tt::ttnn diff --git a/include/ttmlir/Dialect/TTNN/Utils/PassOverrides.h b/include/ttmlir/Dialect/TTNN/Utils/PassOverrides.h new file mode 100644 index 0000000000..09e587c9c3 --- /dev/null +++ b/include/ttmlir/Dialect/TTNN/Utils/PassOverrides.h @@ -0,0 +1,91 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef TTMLIR_DIALECT_TTNN_UTILS_PASSOVERRIDES_H +#define TTMLIR_DIALECT_TTNN_UTILS_PASSOVERRIDES_H + +#include + +#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h" +#include "ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.h" +#include "ttmlir/Dialect/TTNN/IR/TTNNOpsTypes.h" + +namespace mlir::tt::ttnn { + +struct OutputLayoutOverrideParams { + + SmallVector grid; + BufferType bufferType; + TensorMemoryLayout tensorMemoryLayout; // INTERLEAVED / SHARDED etc... 
+ Layout memoryLayout; // ROW_MAJOR / TILE + mlir::tt::DataType dataType; + + bool operator==(const OutputLayoutOverrideParams rhs) const { + return grid[0] == rhs.grid[0] && grid[1] == rhs.grid[1] && + bufferType == rhs.bufferType && + tensorMemoryLayout == rhs.tensorMemoryLayout && + memoryLayout == rhs.memoryLayout && dataType == rhs.dataType; + } + + bool operator!=(const OutputLayoutOverrideParams &rhs) const { + return !(*this == rhs); + } +}; + +struct InputLayoutOverrideParams { + + SmallVector operandIdxes; + + bool operator==(const InputLayoutOverrideParams &rhs) const { + if (operandIdxes.size() != rhs.operandIdxes.size()) { + return false; + } + for (std::size_t i = 0; i < operandIdxes.size(); i++) { + if (operandIdxes[i] != rhs.operandIdxes[i]) { + return false; + } + } + return true; + } + + bool operator!=(const InputLayoutOverrideParams &rhs) const { + return !(*this == rhs); + } +}; + +struct OutputLayoutOverrideParser + : public llvm::cl::parser> { +public: + OutputLayoutOverrideParser(llvm::cl::Option &opt) + : llvm::cl::parser>(opt) {} + + bool parse(llvm::cl::Option &opt, StringRef argName, StringRef arg, + llvm::StringMap &value); + + static std::string + toString(const llvm::StringMap &); + + static void print(llvm::raw_ostream &os, + const llvm::StringMap &value); +}; + +struct InputLayoutOverrideParser + : public llvm::cl::parser> { +public: + InputLayoutOverrideParser(llvm::cl::Option &opt) + : llvm::cl::parser>(opt) {} + + bool parse(llvm::cl::Option &opt, StringRef argName, StringRef arg, + llvm::StringMap &value); + + static std::string + toString(const llvm::StringMap &); + + static void print(llvm::raw_ostream &os, + const llvm::StringMap &value); +}; + +} // namespace mlir::tt::ttnn + +#endif // TTMLIR_DIALECT_TTNN_UTILS_PASSOVERRIDES_H diff --git a/include/ttmlir/Dialect/TTNN/Utils/Utils.h b/include/ttmlir/Dialect/TTNN/Utils/Utils.h index 533235a610..d7d8fbdd30 100644 --- a/include/ttmlir/Dialect/TTNN/Utils/Utils.h +++ b/include/ttmlir/Dialect/TTNN/Utils/Utils.h @@ -5,6 +5,8 @@ #ifndef TTMLIR_DIALECT_TTNN_UTILS_UTILS_H #define TTMLIR_DIALECT_TTNN_UTILS_UTILS_H +#include + #include "ttmlir/Dialect/TT/IR/TTOpsTypes.h" #include "ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.h" #include "ttmlir/Dialect/TTNN/IR/TTNNOpsTypes.h" diff --git a/lib/Dialect/TTNN/Utils/CMakeLists.txt b/lib/Dialect/TTNN/Utils/CMakeLists.txt index f49f829e6f..f78f418642 100644 --- a/lib/Dialect/TTNN/Utils/CMakeLists.txt +++ b/lib/Dialect/TTNN/Utils/CMakeLists.txt @@ -1,6 +1,7 @@ add_mlir_dialect_library(TTMLIRTTNNUtils Utils.cpp OptimizerOverrides.cpp + PassOverrides.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/TTNN diff --git a/lib/Dialect/TTNN/Utils/OptimizerOverrides.cpp b/lib/Dialect/TTNN/Utils/OptimizerOverrides.cpp index 5ef306cdb0..bbc456948e 100644 --- a/lib/Dialect/TTNN/Utils/OptimizerOverrides.cpp +++ b/lib/Dialect/TTNN/Utils/OptimizerOverrides.cpp @@ -6,187 +6,173 @@ namespace mlir::tt::ttnn { -bool OutputLayoutOverrideParser::parse( - llvm::cl::Option &opt, StringRef argName, StringRef arg, +void OptimizerOverridesHandler::setEnableOptimizer(bool value) { + enableOptimizer = value; +} + +void OptimizerOverridesHandler::setMemoryReconfig(bool value) { + enableMemoryReconfig = value; +} +void OptimizerOverridesHandler::setEnableMemoryLayoutAnalysis(bool value) { + enableMemoryLayoutAnalysis = value; +} +void OptimizerOverridesHandler::setEnableMemoryLayoutAnalysisPolicy( + bool value) { + enableMemoryLayoutAnalysisPolicy = value; +} +void 
OptimizerOverridesHandler::setMemoryLayoutAnalysisPolicy( + MemoryLayoutAnalysisPolicyType value) { + memoryLayoutAnalysisPolicy = value; +} + +void OptimizerOverridesHandler::setInputLayoutOverrides( + llvm::StringMap &value) { + inputLayoutOverrides = value; +} +void OptimizerOverridesHandler::setOutputLayoutOverrides( llvm::StringMap &value) { - SmallVector opOverrideList; - constexpr size_t kMaxGridSize = 2; - constexpr size_t kvPairSize = 2; - constexpr size_t kMaxLayoutOverrideParams = 5; - constexpr size_t iOpName = 0; - constexpr size_t iLayoutOverrideParams = 1; - constexpr size_t iGrid = 0; - constexpr size_t iMemorySpace = 1; - constexpr size_t iTensorMemoryLayout = 2; - constexpr size_t iMemoryLayout = 3; - constexpr size_t iDataType = 4; - constexpr char opSeparator = ','; - constexpr char opNameSeparator = '='; - constexpr char paramSepataor = ':'; - constexpr char gridSeparator = 'x'; - - arg.split(opOverrideList, opSeparator); - for (const StringRef override : opOverrideList) { - SmallVector opOverrideParts; - override.split(opOverrideParts, opNameSeparator); - if (opOverrideParts.size() != kvPairSize) { - opt.error("Invalid format for override grid sizes: " + override); - return true; - } + outputLayoutOverrides = value; +} - SmallVector layoutParamParts; - // Split into layout parameters. - opOverrideParts[iLayoutOverrideParams].split(layoutParamParts, - paramSepataor); - if (layoutParamParts.size() != kMaxLayoutOverrideParams) { - opt.error("Invalid number of layout parameters: " + - std::to_string(layoutParamParts.size())); - return true; - } +void OptimizerOverridesHandler::setSystemDescPath(std::string value) { + systemDescPath = value; +} +void OptimizerOverridesHandler::setMaxLegalLayouts(int64_t value) { + maxLegalLayouts = value; +} +void OptimizerOverridesHandler::setMeshShape(std::vector value) { + meshShape = value; +} - // Parse grid. - SmallVector grid; - SmallVector gridParts; - layoutParamParts[iGrid].split(gridParts, gridSeparator); - for (const StringRef gridPart : gridParts) { - int64_t gridValue; - if (gridPart.getAsInteger(10 /*Radix*/, gridValue)) { - opt.error("Invalid grid size: " + gridPart); - return true; - } - grid.push_back(gridValue); - } +bool OptimizerOverridesHandler::getEnableOptimizer() const { + return enableOptimizer; +} - // Parse memory space. - std::optional bufferType = - symbolizeBufferType(layoutParamParts[iMemorySpace]); - if (!bufferType.has_value()) { - opt.error("Invalid memory space: " + layoutParamParts[iMemorySpace]); - return true; - } +bool OptimizerOverridesHandler::getMemoryReconfig() const { + return enableMemoryReconfig; +} +bool OptimizerOverridesHandler::getEnableMemoryLayoutAnalysis() const { + return enableMemoryLayoutAnalysis; +} +bool OptimizerOverridesHandler::getEnableMemoryLayoutAnalysisPolicy() const { + return enableMemoryLayoutAnalysisPolicy; +} +MemoryLayoutAnalysisPolicyType +OptimizerOverridesHandler::getMemoryLayoutAnalysisPolicy() const { + return memoryLayoutAnalysisPolicy; +} - // Parse tensor memory layout. 
- std::optional tensorMemoryLayout = - symbolizeTensorMemoryLayout(layoutParamParts[iTensorMemoryLayout]); - if (!tensorMemoryLayout.has_value()) { - opt.error("Invalid tensor memory layout: " + - layoutParamParts[iTensorMemoryLayout]); - return true; - } +std::string OptimizerOverridesHandler::getSystemDescPath() const { + return systemDescPath; +} +int64_t OptimizerOverridesHandler::getMaxLegalLayouts() const { + return maxLegalLayouts; +} +std::vector OptimizerOverridesHandler::getMeshShape() const { + return meshShape; +} - // Parse memory layout. - std::optional memoryLayout = - mlir::tt::ttnn::symbolizeLayout(layoutParamParts[iMemoryLayout]); - if (!memoryLayout.has_value()) { - opt.error("Invalid memory layout: " + layoutParamParts[iMemoryLayout]); - return true; - } +llvm::StringMap +OptimizerOverridesHandler::getInputLayoutOverrides() const { + return inputLayoutOverrides; +} +llvm::StringMap +OptimizerOverridesHandler::getOutputLayoutOverrides() const { + return outputLayoutOverrides; +} - // Parse data type. - std::optional dataType = - mlir::tt::DataTypeStringToEnum(layoutParamParts[iDataType]); - if (!dataType.has_value()) { - opt.error("Invalid data type: " + layoutParamParts[iDataType]); - return true; - } +std::string OptimizerOverridesHandler::toString() const { - // Set parsed op overrides. - value[opOverrideParts[iOpName]] = OutputLayoutOverrideParams{ - std::move(grid), bufferType.value(), tensorMemoryLayout.value(), - memoryLayout.value(), dataType.value()}; + std::string options = ""; + + if (enableOptimizer) { + options += std::string(pipelineOptions.optimizerPassEnabled.getArgStr()) + + "=true "; } - return false; -} - -void OutputLayoutOverrideParser::print( - llvm::raw_ostream &os, - const llvm::StringMap &value) { - os << "override-output-layout="; - size_t count = 0; - for (const auto &entry : value) { - os << entry.getKey() << "="; - const OutputLayoutOverrideParams ¶ms = entry.getValue(); - // Print grid values - for (size_t i = 0; i < params.grid.size(); ++i) { - os << params.grid[i]; - if (i < params.grid.size() - 1) { - os << "x"; - } - } - // Print memory space and memory layout - os << ":" << mlir::tt::ttnn::stringifyBufferType(params.bufferType); - os << ":" - << mlir::tt::ttnn::stringifyTensorMemoryLayout( - params.tensorMemoryLayout); - os << ":" << mlir::tt::ttnn::stringifyLayout(params.memoryLayout); - os << ":" << mlir::tt::DataTypeEnumToString(params.dataType); - if (++count < value.size()) { - os << ","; - } + + if (enableMemoryReconfig) { + options += + std::string(pipelineOptions.memReconfigEnabled.getArgStr()) + "=true "; } - os << "\n"; -} -bool InputLayoutOverrideParser::parse( - llvm::cl::Option &opt, StringRef argName, StringRef arg, - llvm::StringMap &value) { - SmallVector opOverrideList; - constexpr size_t kvPairSize = 2; - constexpr size_t iOpName = 0; - constexpr size_t iOperands = 1; - constexpr char opSeparator = ','; - constexpr char opNameSeparator = '='; - constexpr char opParamSeparator = ':'; - - arg.split(opOverrideList, opSeparator); - for (const StringRef override : opOverrideList) { - SmallVector opOverrideParts; - override.split(opOverrideParts, opNameSeparator); - if (opOverrideParts.size() != kvPairSize) { - opt.error("Invalid format for input layouts override: " + override); - return true; - } + if (enableMemoryLayoutAnalysis) { + options += + std::string(pipelineOptions.memoryLayoutAnalysisEnabled.getArgStr()) + + "=true "; + } - SmallVector operandIndexes; - SmallVector operandIndexParts; - - // Parse operand indexes. 
- opOverrideParts[iOperands].split(operandIndexParts, opParamSeparator); - for (const StringRef operandIndexPart : operandIndexParts) { - int64_t operandIndexValue; - if (operandIndexPart.getAsInteger(10 /*Radix*/, operandIndexValue)) { - opt.error("Invalid operand index: " + operandIndexPart); - return true; - } - operandIndexes.push_back(operandIndexValue); - } + if (enableMemoryLayoutAnalysisPolicy) { + options += + std::string(pipelineOptions.memoryLayoutAnalysisPolicy.getArgStr()) + + MemoryLayoutAnalysisPolicyTypeParser::toString( + memoryLayoutAnalysisPolicy) + + " "; + } - // Set parsed op overrides. - value[opOverrideParts[iOpName]] = - InputLayoutOverrideParams{std::move(operandIndexes)}; + // Create input layout overrides. + // Example: insert-memreconfig=input0=0:1,input1=0,input2=0:1:2 + if (inputLayoutOverrides.size() > 0) { + options += std::string(pipelineOptions.overrideInputLayout.getArgStr()) + + "=" + InputLayoutOverrideParser::toString(inputLayoutOverrides) + + " "; } - return false; -} - -void InputLayoutOverrideParser::print( - llvm::raw_ostream &os, - const llvm::StringMap &value) { - os << "insert-memreconfig="; - size_t count = 0; - for (const auto &entry : value) { - os << entry.getKey() << "="; - const InputLayoutOverrideParams ¶ms = entry.getValue(); - for (int64_t operandIdx : params.operandIdxes) { - os << operandIdx - << (operandIdx < static_cast(params.operandIdxes.size()) - 1 - ? ':' - : char()); - } - if (++count < value.size()) { - os << ","; + + // Create output layout overrides. + // Example: + // override-output-layout=op1=2x2:dram:interleaved:tile:fp32,op2=4x4:l1:block_sharded:row_major:fp16 + // Example: + // override-output-layout=add_1_2=1x1:dram:interleaved:row_major:f32" + if (outputLayoutOverrides.size() > 0) { + options += + std::string(pipelineOptions.overrideOutputLayout.getArgStr()) + "=" + + OutputLayoutOverrideParser::toString(outputLayoutOverrides) + " "; + } + + if (systemDescPath.size() > 0) { + options += std::string(pipelineOptions.systemDescPath.getArgStr()) + + systemDescPath + " "; + } + + if (maxLegalLayouts > 0) { + options += std::string(pipelineOptions.maxLegalLayouts.getArgStr()) + + std::to_string(maxLegalLayouts) + " "; + } + + if (meshShape.size() > 0) { + options += std::string(pipelineOptions.meshShape.getArgStr()) + "="; + for (int64_t meshShapeValue : meshShape) { + options += std::to_string(meshShapeValue) + ","; } + // Remove the last comma. 
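// --- [Editorial sketch, not part of patch 35/84] ---------------------------
// How a frontend (e.g. tt-forge) might drive the OptimizerOverridesHandler
// defined in this file, producing a pipeline options string via toString().
// The op names and override values below are made up for illustration, and
// the exact enum spellings (BufferType::DRAM, Layout::Tile, ...) are assumed
// from the surrounding dialect code rather than confirmed by this patch.
// #include "ttmlir/Dialect/TTNN/Utils/OptimizerOverrides.h"
void buildOverrideOptions() {
  mlir::tt::ttnn::OptimizerOverridesHandler handler;
  handler.setEnableOptimizer(true);
  handler.setEnableMemoryLayoutAnalysis(true);

  // Request a memory reconfig on operand 0 of a hypothetical "add_0_1" op.
  llvm::SmallVector<int64_t> operands = {0};
  handler.addInputLayoutOverride("add_0_1", operands);

  // Pin the output layout of a hypothetical "matmul_1" op to a 2x2 grid in
  // DRAM, interleaved, tiled, f32.
  llvm::SmallVector<int64_t> grid = {2, 2};
  handler.addOutputLayoutOverride(
      "matmul_1", grid, mlir::tt::ttnn::BufferType::DRAM,
      mlir::tt::ttnn::TensorMemoryLayout::Interleaved,
      mlir::tt::ttnn::Layout::Tile, mlir::tt::DataType::Float32);

  // The resulting string contains fragments in the formats documented above,
  // e.g. "insert-memreconfig=add_0_1=0" and
  // "override-output-layout=matmul_1=2x2:dram:interleaved:tile:f32",
  // alongside the enabled optimizer/analysis pass options.
  std::string options = handler.toString();
  (void)options;
}
// ---------------------------------------------------------------------------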
+ options.pop_back(); + } + + if (options[options.size() - 1] == ' ') { + options.pop_back(); } - os << "\n"; + + return options; +} + +void OptimizerOverridesHandler::addInputLayoutOverride( + StringRef opName, InputLayoutOverrideParams params) { + inputLayoutOverrides[opName] = params; +} +void OptimizerOverridesHandler::addInputLayoutOverride( + StringRef opName, SmallVector &operandIdxes) { + inputLayoutOverrides[opName] = + InputLayoutOverrideParams{std::move(operandIdxes)}; +} +void OptimizerOverridesHandler::addOutputLayoutOverride( + StringRef opName, OutputLayoutOverrideParams params) { + outputLayoutOverrides[opName] = params; +} +void OptimizerOverridesHandler::addOutputLayoutOverride( + StringRef opName, SmallVector &grid, BufferType bufferType, + TensorMemoryLayout tensorMemoryLayout, tt::ttnn::Layout memoryLayout, + tt::DataType dataType) { + outputLayoutOverrides[opName] = OutputLayoutOverrideParams{ + std::move(grid), bufferType, tensorMemoryLayout, memoryLayout, dataType}; } } // namespace mlir::tt::ttnn diff --git a/lib/Dialect/TTNN/Utils/PassOverrides.cpp b/lib/Dialect/TTNN/Utils/PassOverrides.cpp new file mode 100644 index 0000000000..9c8ef2be1f --- /dev/null +++ b/lib/Dialect/TTNN/Utils/PassOverrides.cpp @@ -0,0 +1,206 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "ttmlir/Dialect/TTNN/Utils/PassOverrides.h" + +namespace mlir::tt::ttnn { + +bool OutputLayoutOverrideParser::parse( + llvm::cl::Option &opt, StringRef argName, StringRef arg, + llvm::StringMap &value) { + SmallVector opOverrideList; + constexpr size_t kMaxGridSize = 2; + constexpr size_t kvPairSize = 2; + constexpr size_t kMaxLayoutOverrideParams = 5; + constexpr size_t iOpName = 0; + constexpr size_t iLayoutOverrideParams = 1; + constexpr size_t iGrid = 0; + constexpr size_t iMemorySpace = 1; + constexpr size_t iTensorMemoryLayout = 2; + constexpr size_t iMemoryLayout = 3; + constexpr size_t iDataType = 4; + constexpr char opSeparator = ','; + constexpr char opNameSeparator = '='; + constexpr char paramSepataor = ':'; + constexpr char gridSeparator = 'x'; + + arg.split(opOverrideList, opSeparator); + for (const StringRef override : opOverrideList) { + SmallVector opOverrideParts; + override.split(opOverrideParts, opNameSeparator); + if (opOverrideParts.size() != kvPairSize) { + opt.error("Invalid format for override grid sizes: " + override); + return true; + } + + SmallVector layoutParamParts; + // Split into layout parameters. + opOverrideParts[iLayoutOverrideParams].split(layoutParamParts, + paramSepataor); + if (layoutParamParts.size() != kMaxLayoutOverrideParams) { + opt.error("Invalid number of layout parameters: " + + std::to_string(layoutParamParts.size())); + return true; + } + + // Parse grid. + SmallVector grid; + SmallVector gridParts; + layoutParamParts[iGrid].split(gridParts, gridSeparator); + for (const StringRef gridPart : gridParts) { + int64_t gridValue; + if (gridPart.getAsInteger(10 /*Radix*/, gridValue)) { + opt.error("Invalid grid size: " + gridPart); + return true; + } + grid.push_back(gridValue); + } + + // Parse memory space. + std::optional bufferType = + symbolizeBufferType(layoutParamParts[iMemorySpace]); + if (!bufferType.has_value()) { + opt.error("Invalid memory space: " + layoutParamParts[iMemorySpace]); + return true; + } + + // Parse tensor memory layout. 
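+    // (For example, in an override entry such as
+    // "op1=2x2:dram:interleaved:tile:fp32" this is the "interleaved" field;
+    // the accepted spellings are whatever symbolizeTensorMemoryLayout
+    // recognizes.)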
+ std::optional tensorMemoryLayout = + symbolizeTensorMemoryLayout(layoutParamParts[iTensorMemoryLayout]); + if (!tensorMemoryLayout.has_value()) { + opt.error("Invalid tensor memory layout: " + + layoutParamParts[iTensorMemoryLayout]); + return true; + } + + // Parse memory layout. + std::optional memoryLayout = + mlir::tt::ttnn::symbolizeLayout(layoutParamParts[iMemoryLayout]); + if (!memoryLayout.has_value()) { + opt.error("Invalid memory layout: " + layoutParamParts[iMemoryLayout]); + return true; + } + + // Parse data type. + std::optional dataType = + mlir::tt::DataTypeStringToEnum(layoutParamParts[iDataType]); + if (!dataType.has_value()) { + opt.error("Invalid data type: " + layoutParamParts[iDataType]); + return true; + } + + // Set parsed op overrides. + value[opOverrideParts[iOpName]] = OutputLayoutOverrideParams{ + std::move(grid), bufferType.value(), tensorMemoryLayout.value(), + memoryLayout.value(), dataType.value()}; + } + return false; +} + +std::string OutputLayoutOverrideParser::toString( + const llvm::StringMap &value) { + std::string res; + size_t count = 0; + for (const auto &entry : value) { + res += std::string(entry.getKey()) + "="; + const OutputLayoutOverrideParams ¶ms = entry.getValue(); + // Print grid values + for (size_t i = 0; i < params.grid.size(); ++i) { + res += std::to_string(params.grid[i]); + if (i < params.grid.size() - 1) { + res += "x"; + } + } + // Print memory space and memory layout + res += ":" + + std::string(mlir::tt::ttnn::stringifyBufferType(params.bufferType)); + res += ":" + std::string(mlir::tt::ttnn::stringifyTensorMemoryLayout( + params.tensorMemoryLayout)); + res += + ":" + std::string(mlir::tt::ttnn::stringifyLayout(params.memoryLayout)); + res += ":" + std::string(mlir::tt::DataTypeEnumToString(params.dataType)); + if (++count < value.size()) { + res += ","; + } + } + return res; +} + +void OutputLayoutOverrideParser::print( + llvm::raw_ostream &os, + const llvm::StringMap &value) { + os << "override-output-layout="; + os << OutputLayoutOverrideParser::toString(value); + os << "\n"; +} + +bool InputLayoutOverrideParser::parse( + llvm::cl::Option &opt, StringRef argName, StringRef arg, + llvm::StringMap &value) { + SmallVector opOverrideList; + constexpr size_t kvPairSize = 2; + constexpr size_t iOpName = 0; + constexpr size_t iOperands = 1; + constexpr char opSeparator = ','; + constexpr char opNameSeparator = '='; + constexpr char opParamSeparator = ':'; + + arg.split(opOverrideList, opSeparator); + for (const StringRef override : opOverrideList) { + SmallVector opOverrideParts; + override.split(opOverrideParts, opNameSeparator); + if (opOverrideParts.size() != kvPairSize) { + opt.error("Invalid format for input layouts override: " + override); + return true; + } + + SmallVector operandIndexes; + SmallVector operandIndexParts; + + // Parse operand indexes. + opOverrideParts[iOperands].split(operandIndexParts, opParamSeparator); + for (const StringRef operandIndexPart : operandIndexParts) { + int64_t operandIndexValue; + if (operandIndexPart.getAsInteger(10 /*Radix*/, operandIndexValue)) { + opt.error("Invalid operand index: " + operandIndexPart); + return true; + } + operandIndexes.push_back(operandIndexValue); + } + + // Set parsed op overrides. 
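+    // (For example, the entry "input0=0:1" from
+    // "insert-memreconfig=input0=0:1,input1=0" maps op "input0" to operand
+    // indexes {0, 1}.)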
+ value[opOverrideParts[iOpName]] = + InputLayoutOverrideParams{std::move(operandIndexes)}; + } + return false; +} + +std::string InputLayoutOverrideParser::toString( + const llvm::StringMap &value) { + std::string res; + size_t count = 0; + for (const auto &entry : value) { + res += std::string(entry.getKey()) + "="; + const InputLayoutOverrideParams ¶ms = entry.getValue(); + for (int64_t operandIdx : params.operandIdxes) { + res += std::to_string(operandIdx) + ":"; + } + // Remove the last colon. + res.pop_back(); + if (++count < value.size()) { + res += ","; + } + } + return res; +} + +void InputLayoutOverrideParser::print( + llvm::raw_ostream &os, + const llvm::StringMap &value) { + os << "insert-memreconfig="; + os << InputLayoutOverrideParser::toString(value); + os << "\n"; +} + +} // namespace mlir::tt::ttnn diff --git a/test/unittests/Optimizer/CMakeLists.txt b/test/unittests/Optimizer/CMakeLists.txt index 681d78ff0e..4e6ee799a7 100644 --- a/test/unittests/Optimizer/CMakeLists.txt +++ b/test/unittests/Optimizer/CMakeLists.txt @@ -1,5 +1,6 @@ add_mlir_unittest(OptimizerTests TestShardSolver.cpp + TestOptimizerOverrides.cpp ) target_link_libraries(OptimizerTests diff --git a/test/unittests/Optimizer/TestOptimizerOverrides.cpp b/test/unittests/Optimizer/TestOptimizerOverrides.cpp new file mode 100644 index 0000000000..c75fde21f9 --- /dev/null +++ b/test/unittests/Optimizer/TestOptimizerOverrides.cpp @@ -0,0 +1,433 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "ttmlir/Dialect/TTNN/Utils/OptimizerOverrides.h" + +using namespace mlir::tt::ttnn; + +class TestOptimizerOverrides : public ::testing::Test { + +public: + OptimizerOverridesHandler optimizerOverridesHandler; + + void SetUp() override {} + + llvm::StringMap createInputLayoutOverrides() { + + // struct InputLayoutOverrideParams { + // SmallVector operandIdxes; + // }; + + llvm::StringMap inputLayoutOverrides; + + // Create input layout overrides for 3 input overrides. + inputLayoutOverrides["input0"] = createInputLayoutOverrideParams(); + inputLayoutOverrides["input1"] = createInputLayoutOverrideParams(); + inputLayoutOverrides["input2"] = createInputLayoutOverrideParams(); + + return inputLayoutOverrides; + } + + InputLayoutOverrideParams createInputLayoutOverrideParams() { + + InputLayoutOverrideParams inputLayoutOverrideParams; + + // Create input layout override params for 2 operands. + // Their operand indexes are 0 and 1, respectively. + inputLayoutOverrideParams.operandIdxes.push_back(0); + inputLayoutOverrideParams.operandIdxes.push_back(1); + + return inputLayoutOverrideParams; + } + + llvm::StringMap createOutputLayoutOverrides() { + + llvm::StringMap outputLayoutOverrides; + + // Create output layout overrides for 3 output overrides. + outputLayoutOverrides["output0"] = createOutputLayoutOverrideParams_0(); + outputLayoutOverrides["output1"] = createOutputLayoutOverrideParams_1(); + outputLayoutOverrides["output2"] = createOutputLayoutOverrideParams_2(); + + return outputLayoutOverrides; + } + + OutputLayoutOverrideParams createOutputLayoutOverrideParams_0() { + + // struct OutputLayoutOverrideParams { + // SmallVector grid; + // BufferType; + // TensorMemoryLayout tensorMemoryLayout; // INTERLEAVED / SHARDED etc... 
+ // Layout memoryLayout; // ROW_MAJOR / TILE + // mlir::tt::DataType dataType; + // }; + + OutputLayoutOverrideParams outputLayoutOverrideParams; + + // Output 0 has + // - grid size 2x2, + // - buffer type dram + // - tensor memory layout interleaved + // - memory layout tile + // - data type fp16. + outputLayoutOverrideParams.grid.push_back(2); + outputLayoutOverrideParams.grid.push_back(2); + outputLayoutOverrideParams.bufferType = BufferType::DRAM; + outputLayoutOverrideParams.tensorMemoryLayout = + TensorMemoryLayout::Interleaved; + outputLayoutOverrideParams.memoryLayout = Layout::Tile; + outputLayoutOverrideParams.dataType = mlir::tt::DataType::Float16; + + return outputLayoutOverrideParams; + } + + OutputLayoutOverrideParams createOutputLayoutOverrideParams_1() { + + // struct OutputLayoutOverrideParams { + // SmallVector grid; + // BufferType; + // TensorMemoryLayout tensorMemoryLayout; // INTERLEAVED / SHARDED etc... + // Layout memoryLayout; // ROW_MAJOR / TILE + // mlir::tt::DataType dataType; + // }; + + OutputLayoutOverrideParams outputLayoutOverrideParams; + + // Output 1 has + // - grid size 8x4, + // - buffer type l1 + // - tensor memory layout block_sharded + // - memory layout row_major + // - data type fp16. + outputLayoutOverrideParams.grid.push_back(8); + outputLayoutOverrideParams.grid.push_back(4); + outputLayoutOverrideParams.bufferType = BufferType::L1; + outputLayoutOverrideParams.tensorMemoryLayout = + TensorMemoryLayout::BlockSharded; + outputLayoutOverrideParams.memoryLayout = Layout::RowMajor; + outputLayoutOverrideParams.dataType = mlir::tt::DataType::Float16; + + return outputLayoutOverrideParams; + } + + OutputLayoutOverrideParams createOutputLayoutOverrideParams_2() { + + // struct OutputLayoutOverrideParams { + // SmallVector grid; + // BufferType; + // TensorMemoryLayout tensorMemoryLayout; // INTERLEAVED / SHARDED etc... + // Layout memoryLayout; // ROW_MAJOR / TILE + // mlir::tt::DataType dataType; + // }; + + OutputLayoutOverrideParams outputLayoutOverrideParams; + + // Output 2 has + // - grid size 3x6, + // - buffer type system + // - tensor memory layout height_sharded + // - memory layout tile + // - data type fp16. + outputLayoutOverrideParams.grid.push_back(3); + outputLayoutOverrideParams.grid.push_back(6); + outputLayoutOverrideParams.bufferType = BufferType::SystemMemory; + outputLayoutOverrideParams.tensorMemoryLayout = + TensorMemoryLayout::HeightSharded; + outputLayoutOverrideParams.memoryLayout = Layout::Tile; + outputLayoutOverrideParams.dataType = mlir::tt::DataType::Float16; + + return outputLayoutOverrideParams; + } + + bool + compareInputLayoutOverrides(llvm::StringMap in1, + llvm::StringMap in2) { + // Check if the sizes of the two input layout overrides are the same. + if (in1.size() != in2.size()) { + return false; + } + llvm::StringMap::iterator it1; + for (it1 = in1.begin(); it1 != in1.end(); it1++) { + // Check if the two input layout overrides have the same keys. + llvm::StringMap::iterator it2 = + in2.find(it1->getKey()); + if (it2 == in2.end()) { + return false; + } + // Check if the two input layout overrides have the same values. + // The structure InputLayoutOverrideParams has overloaded operators for == + // and !=, so we can compare the objects in this way. + if (it1->getValue() != it2->getValue()) { + return false; + } + } + return true; + } + + bool compareOutputLayoutOverrides( + llvm::StringMap out1, + llvm::StringMap out2) { + // Check if the sizes of the two output layout overrides are the same. 
+ if (out1.size() != out2.size()) { + return false; + } + llvm::StringMap::iterator it1; + for (it1 = out1.begin(); it1 != out1.end(); it1++) { + // Check if the two output layout overrides have the same keys. + llvm::StringMap::iterator it2 = + out2.find(it1->getKey()); + if (it2 == out2.end()) { + return false; + } + // Check if the two output layout overrides have the same values. + // The structure OutputLayoutOverrideParams has overloaded operators for + // == and !=, so we can compare the objects in this way. + if (it1->getValue() != it2->getValue()) { + return false; + } + } + return true; + } + + void TearDown() override {} +}; + +// Test the setEnableOptimizer method +TEST_F(TestOptimizerOverrides, TestSetOptimizerPass) { + + optimizerOverridesHandler.setEnableOptimizer(true); + ASSERT_TRUE(optimizerOverridesHandler.getEnableOptimizer()); + + optimizerOverridesHandler.setEnableOptimizer(false); + ASSERT_FALSE(optimizerOverridesHandler.getEnableOptimizer()); +} + +// Test the setMemoryConfig method +TEST_F(TestOptimizerOverrides, TestSetMemoryConfig) { + + optimizerOverridesHandler.setMemoryReconfig(true); + ASSERT_TRUE(optimizerOverridesHandler.getMemoryReconfig()); + + optimizerOverridesHandler.setMemoryReconfig(false); + ASSERT_FALSE(optimizerOverridesHandler.getMemoryReconfig()); +} + +// Test the setMemoryLayoutAnalysis method +TEST_F(TestOptimizerOverrides, TestSetMemoryLayoutAnalysis) { + + optimizerOverridesHandler.setEnableMemoryLayoutAnalysis(true); + ASSERT_TRUE(optimizerOverridesHandler.getEnableMemoryLayoutAnalysis()); + + optimizerOverridesHandler.setEnableMemoryLayoutAnalysis(false); + ASSERT_FALSE(optimizerOverridesHandler.getEnableMemoryLayoutAnalysis()); +} + +// Test the setEnableMemoryLayoutAnalysisPolicy method +TEST_F(TestOptimizerOverrides, TestSetEnableMemoryLayoutAnalysisPolicy) { + + optimizerOverridesHandler.setEnableMemoryLayoutAnalysisPolicy(true); + ASSERT_TRUE(optimizerOverridesHandler.getEnableMemoryLayoutAnalysisPolicy()); + + optimizerOverridesHandler.setEnableMemoryLayoutAnalysisPolicy(false); + ASSERT_FALSE(optimizerOverridesHandler.getEnableMemoryLayoutAnalysisPolicy()); +} + +// Test the setMemoryLayoutAnalysisPolicy method +TEST_F(TestOptimizerOverrides, TestSetMemoryLayoutAnalysisPolicy) { + + optimizerOverridesHandler.setMemoryLayoutAnalysisPolicy( + mlir::tt::MemoryLayoutAnalysisPolicyType::DFSharding); + ASSERT_EQ(optimizerOverridesHandler.getMemoryLayoutAnalysisPolicy(), + mlir::tt::MemoryLayoutAnalysisPolicyType::DFSharding); + + optimizerOverridesHandler.setMemoryLayoutAnalysisPolicy( + mlir::tt::MemoryLayoutAnalysisPolicyType::L1Interleaved); + ASSERT_EQ(optimizerOverridesHandler.getMemoryLayoutAnalysisPolicy(), + mlir::tt::MemoryLayoutAnalysisPolicyType::L1Interleaved); +} + +// Test the setInputLayoutOverrides method +TEST_F(TestOptimizerOverrides, TestSetInputLayoutOverrides) { + + llvm::StringMap inputLayoutOverrides = + createInputLayoutOverrides(); + + optimizerOverridesHandler.setInputLayoutOverrides(inputLayoutOverrides); + ASSERT_TRUE(compareInputLayoutOverrides( + optimizerOverridesHandler.getInputLayoutOverrides(), + inputLayoutOverrides)); +} + +// Test the setOutputLayoutOverrides method +TEST_F(TestOptimizerOverrides, TestSetOutputLayoutOverrides) { + + llvm::StringMap outputLayoutOverrides = + createOutputLayoutOverrides(); + + optimizerOverridesHandler.setOutputLayoutOverrides(outputLayoutOverrides); + ASSERT_TRUE(compareOutputLayoutOverrides( + optimizerOverridesHandler.getOutputLayoutOverrides(), + 
outputLayoutOverrides)); +} + +// Test the addInputLayoutOverride method passing the whole object +TEST_F(TestOptimizerOverrides, TestAddInputLayoutOverrideObject) { + + // This method is implemented across two functions in the + // OptimizerOverridesHandler class. The first function takes the whole object + // as a parameter, while the second function takes the individual parameters. + + // Here, we test the first function, which takes the whole object as a + // parameter. + + llvm::StringMap inputLayoutOverrides = + createInputLayoutOverrides(); + + optimizerOverridesHandler.addInputLayoutOverride( + "input0", createInputLayoutOverrideParams()); + optimizerOverridesHandler.addInputLayoutOverride( + "input1", createInputLayoutOverrideParams()); + optimizerOverridesHandler.addInputLayoutOverride( + "input2", createInputLayoutOverrideParams()); + + ASSERT_TRUE(compareInputLayoutOverrides( + optimizerOverridesHandler.getInputLayoutOverrides(), + inputLayoutOverrides)); +} + +// Test the addInputLayoutOverride method passing the individual parameters +TEST_F(TestOptimizerOverrides, TestAddInputLayoutOverrideParams) { + + // This method is implemented across two functions in the + // OptimizerOverridesHandler class. The first function takes the whole object + // as a parameter, while the second function takes the individual parameters. + + // Here, we test the second function, which takes the individual parameters. + + llvm::StringMap inputLayoutOverrides = + createInputLayoutOverrides(); + + llvm::SmallVector operandIdxes1 = {0, 1}; + llvm::SmallVector operandIdxes2 = {0, 1}; + llvm::SmallVector operandIdxes3 = {0, 1}; + + optimizerOverridesHandler.addInputLayoutOverride("input0", operandIdxes1); + optimizerOverridesHandler.addInputLayoutOverride("input1", operandIdxes2); + optimizerOverridesHandler.addInputLayoutOverride("input2", operandIdxes3); + + ASSERT_TRUE(compareInputLayoutOverrides( + optimizerOverridesHandler.getInputLayoutOverrides(), + inputLayoutOverrides)); +} + +// Test the addOutputLayoutOverride method passing the whole object +TEST_F(TestOptimizerOverrides, TestAddOutputLayoutOverrideObject) { + + // This method is implemented across two functions in the + // OptimizerOverridesHandler class. The first function takes the whole object + // as a parameter, while the second function takes the individual parameters. + + // Here, we test the first function, which takes the whole object as a + // parameter. + + llvm::StringMap outputLayoutOverrides = + createOutputLayoutOverrides(); + + optimizerOverridesHandler.addOutputLayoutOverride( + "output0", createOutputLayoutOverrideParams_0()); + optimizerOverridesHandler.addOutputLayoutOverride( + "output1", createOutputLayoutOverrideParams_1()); + optimizerOverridesHandler.addOutputLayoutOverride( + "output2", createOutputLayoutOverrideParams_2()); + + ASSERT_TRUE(compareOutputLayoutOverrides( + optimizerOverridesHandler.getOutputLayoutOverrides(), + outputLayoutOverrides)); +} + +// Test the addOutputLayoutOverride method passing the individual parameters +TEST_F(TestOptimizerOverrides, TestAddOutputLayoutOverrideParams) { + + // This method is implemented across two functions in the + // OptimizerOverridesHandler class. The first function takes the whole object + // as a parameter, while the second function takes the individual parameters. + + // Here, we test the second function, which takes the individual parameters. 
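+  // The grids, buffer types, layouts and data types passed below are chosen
+  // to mirror createOutputLayoutOverrideParams_0/1/2, so the handler's map
+  // should compare equal to the reference map built by
+  // createOutputLayoutOverrides().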
+ + llvm::StringMap outputLayoutOverrides = + createOutputLayoutOverrides(); + + llvm::SmallVector grid1 = {2, 2}; + llvm::SmallVector grid2 = {8, 4}; + llvm::SmallVector grid3 = {3, 6}; + + optimizerOverridesHandler.addOutputLayoutOverride( + "output0", grid1, BufferType::DRAM, TensorMemoryLayout::Interleaved, + Layout::Tile, mlir::tt::DataType::Float16); + optimizerOverridesHandler.addOutputLayoutOverride( + "output1", grid2, BufferType::L1, TensorMemoryLayout::BlockSharded, + Layout::RowMajor, mlir::tt::DataType::Float16); + optimizerOverridesHandler.addOutputLayoutOverride( + "output2", grid3, BufferType::SystemMemory, + TensorMemoryLayout::HeightSharded, Layout::Tile, + mlir::tt::DataType::Float16); + + ASSERT_TRUE(compareOutputLayoutOverrides( + optimizerOverridesHandler.getOutputLayoutOverrides(), + outputLayoutOverrides)); +} + +// Test the setSystemDescPath method +TEST_F(TestOptimizerOverrides, TestSetSystemDescPath) { + + optimizerOverridesHandler.setSystemDescPath("system_desc_path"); + ASSERT_EQ(optimizerOverridesHandler.getSystemDescPath(), "system_desc_path"); +} + +// Test the setMaxLegalLayouts method +TEST_F(TestOptimizerOverrides, TestSetMaxLegalLayouts) { + + optimizerOverridesHandler.setMaxLegalLayouts(10); + ASSERT_EQ(optimizerOverridesHandler.getMaxLegalLayouts(), 10); +} + +// Test the setMeshShape method +TEST_F(TestOptimizerOverrides, TestSetMeshShape) { + + std::vector meshShape; + meshShape.push_back(1); + meshShape.push_back(2); + + optimizerOverridesHandler.setMeshShape(meshShape); + ASSERT_EQ(optimizerOverridesHandler.getMeshShape()[0], meshShape[0]); + ASSERT_EQ(optimizerOverridesHandler.getMeshShape()[1], meshShape[1]); +} + +// Test the toString method +TEST_F(TestOptimizerOverrides, TestToString) { + + std::string options; + options += + "enable-optimizer=true "; // The optimizer pass is enabled by default. 
+ options += "memreconfig-enabled=true "; + options += "memory-layout-analysis-enabled=true "; + options += "insert-memreconfig=add_0_1_2=0 "; + options += + "override-output-layout=add_1_2=1x1:dram:interleaved:row_major:f32"; + + llvm::SmallVector operandIdxes = {0}; + llvm::SmallVector grid = {1, 1}; + + optimizerOverridesHandler.setEnableOptimizer(true); + optimizerOverridesHandler.setEnableMemoryLayoutAnalysis(true); + optimizerOverridesHandler.setMemoryReconfig(true); + optimizerOverridesHandler.addInputLayoutOverride("add_0_1_2", operandIdxes); + optimizerOverridesHandler.addOutputLayoutOverride( + "add_1_2", grid, BufferType::DRAM, TensorMemoryLayout::Interleaved, + Layout::RowMajor, mlir::tt::DataType::Float32); + + ASSERT_EQ(optimizerOverridesHandler.toString(), options); +} From c4b3dffcc449c9fc318a0620ba0158a7c6d8d635 Mon Sep 17 00:00:00 2001 From: Andrej Jakovljevic Date: Mon, 2 Dec 2024 08:25:33 +0100 Subject: [PATCH 36/84] Added support for scatter op (#1279) --- include/ttmlir/Dialect/TTIR/IR/TTIROps.td | 34 ++++ include/ttmlir/Dialect/TTNN/IR/TTNNOps.td | 7 + include/ttmlir/Target/TTNN/program.fbs | 1 + .../StableHLOToTTIRPatterns.cpp | 145 ++++++++++++++++++ lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp | 18 ++- lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp | 1 + lib/Dialect/TTIR/IR/TTIROps.cpp | 62 ++++++++ lib/Dialect/TTNN/IR/TTNNOps.cpp | 8 + lib/Target/TTNN/TTNNToFlatbuffer.cpp | 6 + .../eltwise/binary/binary_composite.cpp | 4 + .../eltwise/binary/binary_composite.h | 1 + .../ttnn/operations/eltwise/binary/utils.cpp | 12 +- .../StableHLOToTTIR/scatter_op.mlir | 16 ++ test/ttmlir/Dialect/TTNN/simple_scatter.mlir | 16 ++ test/ttmlir/Silicon/TTNN/simple_eltwise.mlir | 10 ++ 15 files changed, 338 insertions(+), 3 deletions(-) create mode 100644 test/ttmlir/Conversion/StableHLOToTTIR/scatter_op.mlir create mode 100644 test/ttmlir/Dialect/TTNN/simple_scatter.mlir diff --git a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td index f5e284078a..8908e470e0 100644 --- a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td +++ b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td @@ -1244,6 +1244,40 @@ def TTIR_DivOp : TTIR_GenericElementwiseBinaryOp<"div"> { }]; } +def TTIR_ScatterOp: TTIR_DPSOp<"scatter"> { + let summary = "Scatter operation"; + let description = [{ + Produces a 'result' tensor which are equal to `input` tensor except that + several slices specified by `scatter_indices` are updated with the values + `updates`. 
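+
+    For example (illustrative value names; the dimension-number attributes and
+    operand constraints are omitted here for brevity), writing a 1x3x32x32
+    `update` tensor into a 1x3x320x320 `input` at the position selected by
+    `scatter_indices` looks like:
+
+      %result = "ttir.scatter"(%input, %scatter_indices, %update, %output) ({
+        ^bb0(%lhs: tensor<1xf32>, %rhs: tensor<1xf32>):
+          "ttir.yield"(%rhs) : (tensor<1xf32>) -> ()
+      }) : (tensor<1x3x320x320xf32>, tensor<1x1xi32>, tensor<1x3x32x32xf32>,
+            tensor<1x3x320x320xf32>) -> tensor<1x3x320x320xf32>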
+ }]; + + let arguments = (ins AnyRankedTensor:$input, + AnyRankedTensor:$scatter_indices, + AnyRankedTensor:$update, + DenseI32ArrayAttr:$update_window_dims, + DenseI32ArrayAttr:$inserted_window_dims, + DenseI32ArrayAttr:$input_batching_dims, + DenseI32ArrayAttr:$scatter_indices_batching_dims, + DenseI32ArrayAttr:$scatter_dims_to_operand_dims, + I32Attr:$index_vector_dim, + BoolAttr:$indices_are_sorted, + BoolAttr:$unique_indices, + AnyRankedTensor:$output, + TT_OperandConstraintArrayAttr:$operand_constraints); + + let regions = (region SizedRegion<1>:$update_computation); + + let results = (outs AnyRankedTensor:$result); + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + MutableOperandRange getDpsInitsMutable() { return getOutputMutable(); } + }]; + +} + //===----------------------------------------------------------------------===// // TTIR region ops (ops that may appear inside of ttir.generic region) //===----------------------------------------------------------------------===// diff --git a/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td b/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td index 21eb704cf7..57383c007d 100644 --- a/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td +++ b/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td @@ -860,6 +860,13 @@ def TTNN_AllGatherOp: TTNN_Op<"all_gather"> { let hasVerifier = 1; } +def TTNN_ScatterOp: TTNN_ElementwiseBinaryOp<"scatter"> { + let summary = "Scatter op."; + let description = [{ + Embeds the values of the 'update' tensor into 'input' at the given index and puts the value in the 'output' tensor. + }]; +} + def TTNN_ReduceScatterOp: TTNN_Op<"reduce_scatter"> { let summary = "Reduce scatter op."; let description = [{ diff --git a/include/ttmlir/Target/TTNN/program.fbs b/include/ttmlir/Target/TTNN/program.fbs index 39535e2f0b..f145aaf657 100644 --- a/include/ttmlir/Target/TTNN/program.fbs +++ b/include/ttmlir/Target/TTNN/program.fbs @@ -112,6 +112,7 @@ enum EltwiseOpType: uint32 { LogicalXor, Clamp, LeakyRelu, + Scatter } table ClampOpParams { diff --git a/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp b/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp index ccf21ff275..d81b6e2149 100644 --- a/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp +++ b/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp @@ -1666,6 +1666,137 @@ class StableHLOToTTIROpIotaOpConversionPattern } }; +class StableHLOToTTIRScatterOpConversionPattern + : public OpConversionPattern { + + using OpConversionPattern::OpConversionPattern; + +public: + LogicalResult + matchAndRewrite(mlir::stablehlo::ScatterOp srcOp, + mlir::stablehlo::ScatterOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + auto outputType = mlir::cast( + this->getTypeConverter()->convertType(srcOp.getResults()[0].getType())); + tensor::EmptyOp outputTensor = rewriter.create( + srcOp.getLoc(), outputType.getShape(), outputType.getElementType()); + Value operand = srcOp.getInputs()[0]; + Value scatterIndices = srcOp.getScatterIndices(); + Value update = srcOp.getUpdates()[0]; + mlir::ArrayAttr binaryConstraints = rewriter.getArrayAttr( + SmallVector(4, rewriter.getAttr( + OperandConstraint::AnyDeviceTile))); + auto updateWindowsDims = + adaptor.getScatterDimensionNumbers().getUpdateWindowDims(); + auto insertedWindowDims = + adaptor.getScatterDimensionNumbers().getInsertedWindowDims(); + auto inputBatchingDims = + adaptor.getScatterDimensionNumbers().getInputBatchingDims(); + auto scatterIndicesBatchingDims = + 
adaptor.getScatterDimensionNumbers().getScatterIndicesBatchingDims(); + auto scatterDimsToOperandDims = + adaptor.getScatterDimensionNumbers().getScatterDimsToOperandDims(); + auto indexVectorDim = + adaptor.getScatterDimensionNumbers().getIndexVectorDim(); + auto indicesAreSorted = adaptor.getIndicesAreSorted(); + auto uniqueIndices = adaptor.getUniqueIndices(); + + auto newScatterOp = rewriter.create( + srcOp.getLoc(), outputType, operand, scatterIndices, update, + llvm::ArrayRef( + convertArrayRefToInt32vector(updateWindowsDims)), + llvm::ArrayRef( + convertArrayRefToInt32vector(insertedWindowDims)), + llvm::ArrayRef( + convertArrayRefToInt32vector(inputBatchingDims)), + llvm::ArrayRef( + convertArrayRefToInt32vector(scatterIndicesBatchingDims)), + llvm::ArrayRef( + convertArrayRefToInt32vector(scatterDimsToOperandDims)), + indexVectorDim, indicesAreSorted, uniqueIndices, outputTensor, + binaryConstraints); + + // Replaces with different types do not work and will fail silently, so we + // manually set the second operand, since the type changes there from i32 to + // i64. + newScatterOp.setOperand( + 1, adaptor.getScatterIndices().getDefiningOp()->getResult(0)); + + newScatterOp->getRegion(0).takeBody(adaptor.getUpdateComputation()); + changeRegionTypes(newScatterOp->getRegion(0), *getTypeConverter(), + rewriter); + + rewriter.replaceOp(srcOp, newScatterOp); + + return success(); + } + +private: + std::vector + convertArrayRefToInt32vector(const llvm::ArrayRef &source) const { + std::vector converted; + converted.reserve(source.size()); + + for (int64_t value : source) { + converted.push_back(static_cast(value)); + } + + return converted; + } + + void changeRegionTypes(mlir::Region ®ion, + const mlir::TypeConverter &typeConverter, + mlir::PatternRewriter &rewriter) const { + Block &block = *region.getBlocks().begin(); + llvm::SmallVector oldArguments( + block.getArguments().begin(), block.getArguments().end()); + llvm::SmallVector newArguments; + + // Add new arguments with updated types to the block. 
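+    // (For the scatter update region converted here, this means the scalar
+    // tensor arguments coming from StableHLO, typically tensor<f32>, are
+    // re-typed to their converted form, e.g. tensor<1xf32> as in the scatter
+    // tests added in this change.)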
+ for (auto arg : oldArguments) { + if (auto newType = typeConverter.convertType(arg.getType())) { + mlir::BlockArgument newArg = block.addArgument(newType, arg.getLoc()); + newArguments.push_back(newArg); + } else { + newArguments.push_back(arg); // Type didn't change + } + } + + for (auto it : llvm::zip(oldArguments, newArguments)) { + mlir::BlockArgument oldArg = std::get<0>(it); + mlir::Value newArg = std::get<1>(it); + if (oldArg != newArg) { + oldArg.replaceAllUsesWith(newArg); + } + } + + for (auto arg : oldArguments) { + if (!llvm::is_contained(newArguments, arg)) { + block.eraseArgument(arg.getArgNumber()); + } + } + } +}; + +class StableHLOToTTIRReturnOpConversionPattern + : public OpConversionPattern { + + using OpConversionPattern::OpConversionPattern; + +public: + LogicalResult + matchAndRewrite(mlir::stablehlo::ReturnOp srcOp, + mlir::stablehlo::ReturnOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + rewriter.replaceOpWithNewOp(srcOp, + srcOp.getResults()); + + return success(); + } +}; + void addElementwiseUnaryOpsConversionPatterns(MLIRContext *ctx, RewritePatternSet &patterns, TypeConverter &typeConverter) { @@ -1846,6 +1977,18 @@ void addIotaOpConversionPattern(MLIRContext *ctx, RewritePatternSet &patterns, typeConverter, ctx); } +void addScatterOpConversionPatterns(MLIRContext *ctx, + RewritePatternSet &patterns, + TypeConverter &typeConverter) { + patterns.add(typeConverter, ctx); +} + +void addReturnOpConversionPatterns(MLIRContext *ctx, + RewritePatternSet &patterns, + TypeConverter &typeConverter) { + patterns.add(typeConverter, ctx); +} + } // namespace namespace mlir::tt { @@ -1872,6 +2015,8 @@ void populateStableHLOToTTIRPatterns(MLIRContext *ctx, addClampOpConversionPattern(ctx, patterns, typeConverter); addGatherOpConversionPattern(ctx, patterns, typeConverter); addIotaOpConversionPattern(ctx, patterns, typeConverter); + addScatterOpConversionPatterns(ctx, patterns, typeConverter); + addReturnOpConversionPatterns(ctx, patterns, typeConverter); } } // namespace mlir::tt diff --git a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp index 3241928f45..18efb982e5 100644 --- a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp +++ b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp @@ -953,6 +953,21 @@ class ArangeOpConversionPattern : public OpConversionPattern { } }; +class ScatterOpConversionPattern : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(ttir::ScatterOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + // The ttnn interface has the inverse inputs of the TTIR dialect op (which + // matches torch ops). 
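+    // Roughly, with illustrative value names:
+    //   "ttir.scatter"(%input, %scatter_indices, %update, %output)
+    // becomes a TTNN scatter whose first operand is the update tensor and
+    // which drops the indices operand:
+    //   "ttnn.scatter"(%update, %input, %output)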
+ rewriter.replaceOpWithNewOp( + op, adaptor.getUpdate(), adaptor.getInput(), adaptor.getOutput()); + + return success(); + } +}; } // namespace namespace mlir::tt { @@ -1022,7 +1037,8 @@ void populateTTIRToTTNNPatterns(MLIRContext *ctx, RewritePatternSet &patterns, MaxPool2dOpConversionPattern, SubtractOpConversionPattern, AllGatherOpConversionPattern, - ArangeOpConversionPattern + ArangeOpConversionPattern, + ScatterOpConversionPattern >(typeConverter, ctx); // ANCHOR_END: op_rewriter_pattern_set // clang-format on diff --git a/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp b/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp index c5ab71b235..f04d5566b9 100644 --- a/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp +++ b/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp @@ -713,6 +713,7 @@ void populateTTNNToEmitCPatterns(mlir::MLIRContext *ctx, DefaultOpConversionPattern, DefaultOpConversionPattern, DefaultOpConversionPattern, + DefaultOpConversionPattern, DefaultOpConversionPattern>(typeConverter, ctx); diff --git a/lib/Dialect/TTIR/IR/TTIROps.cpp b/lib/Dialect/TTIR/IR/TTIROps.cpp index 11cfbb8fbb..aacb2a43de 100644 --- a/lib/Dialect/TTIR/IR/TTIROps.cpp +++ b/lib/Dialect/TTIR/IR/TTIROps.cpp @@ -1323,6 +1323,68 @@ ::mlir::LogicalResult mlir::tt::ttir::MeshShardOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// ScatterOp +//===----------------------------------------------------------------------===// + +bool matchSimpleBlock(mlir::Region ®ion) { + if (!region.hasOneBlock()) { + return false; + } + mlir::Block &block = region.front(); + if (block.getNumArguments() != 2) { + return false; + } + auto argType1 = + mlir::cast(block.getArgument(0).getType()); + auto argType2 = + mlir::cast(block.getArgument(1).getType()); + if (!argType1 || !argType2) { + return false; + } + if (block.getOperations().size() != 1) { + return false; + } + mlir::tt::ttir::YieldOp returnOp = + mlir::cast(&block.front()); + if (!returnOp) { + return false; + } + if (returnOp.getNumOperands() != 1 || + returnOp.getOperand(0) != block.getArgument(1)) { + return false; + } + return true; +} + +::mlir::LogicalResult mlir::tt::ttir::ScatterOp::verify() { + + ArrayRef inputShape = + mlir::cast(getInput().getType()).getShape(); + + if (getUpdateWindowDims().size() + getInsertedWindowDims().size() != + inputShape.size()) { + return emitOpError("Batching currently not supported"); + } + + for (uint64_t insertedWindowDims : getInsertedWindowDims()) { + if (inputShape[insertedWindowDims] != 1) { + return emitOpError("Dimension size to slice into must be 1"); + } + } + + // We currently do not support custom functions in the scatter function, + // which is a possbility in StableHLO dialect. 
See issue: + // https://github.com/tenstorrent/tt-mlir/issues/1278 + if (!matchSimpleBlock(getUpdateComputation())) { + return emitOpError( + "Currently not supporting custom scatter function in TTNN " + "dialect and TT-metal."); + } + + return success(); +} + //===----------------------------------------------------------------------===// // GenericOp //===----------------------------------------------------------------------===// diff --git a/lib/Dialect/TTNN/IR/TTNNOps.cpp b/lib/Dialect/TTNN/IR/TTNNOps.cpp index cd2746aadf..8e41368cbb 100644 --- a/lib/Dialect/TTNN/IR/TTNNOps.cpp +++ b/lib/Dialect/TTNN/IR/TTNNOps.cpp @@ -950,6 +950,10 @@ ::mlir::LogicalResult mlir::tt::ttnn::SoftmaxOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// AllGatherOp +//===----------------------------------------------------------------------===// + ::mlir::LogicalResult AllGatherOp::verify() { ::mlir::RankedTensorType inputType = getInput().getType(); int32_t dim = getDim(); @@ -961,6 +965,10 @@ ::mlir::LogicalResult AllGatherOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// ReduceScatterOp +//===----------------------------------------------------------------------===// + ::mlir::LogicalResult ReduceScatterOp::verify() { // TODO(gfengTT) return success(); diff --git a/lib/Target/TTNN/TTNNToFlatbuffer.cpp b/lib/Target/TTNN/TTNNToFlatbuffer.cpp index e7df85956f..34a0c4725d 100644 --- a/lib/Target/TTNN/TTNNToFlatbuffer.cpp +++ b/lib/Target/TTNN/TTNNToFlatbuffer.cpp @@ -526,6 +526,8 @@ createEltwiseOp(FlatbufferObjectCache &cache, EltwiseOp op) { type = ::tt::target::ttnn::EltwiseOpType::Div; } else if constexpr (std::is_same_v) { type = ::tt::target::ttnn::EltwiseOpType::Sigmoid; + } else if constexpr (std::is_same_v) { + type = ::tt::target::ttnn::EltwiseOpType::Scatter; } else if constexpr (std::is_same_v) { type = ::tt::target::ttnn::EltwiseOpType::Log1p; } else if constexpr (std::is_same_v) { @@ -819,6 +821,10 @@ emitTTNNOperation(FlatbufferObjectCache &cache, Operation *op, if (auto log1pOp = dyn_cast(op); log1pOp) { return createOperation(cache, createEltwiseOp(cache, log1pOp), debugString); } + if (auto scatterOp = dyn_cast(op); scatterOp) { + return createOperation(cache, createEltwiseOp(cache, scatterOp), + debugString); + } if (auto reciprocalOp = dyn_cast(op); reciprocalOp) { return createOperation(cache, createEltwiseOp(cache, reciprocalOp), debugString); diff --git a/runtime/lib/ttnn/operations/eltwise/binary/binary_composite.cpp b/runtime/lib/ttnn/operations/eltwise/binary/binary_composite.cpp index 2a05d6246f..5c1d056f99 100644 --- a/runtime/lib/ttnn/operations/eltwise/binary/binary_composite.cpp +++ b/runtime/lib/ttnn/operations/eltwise/binary/binary_composite.cpp @@ -41,6 +41,10 @@ void run(const ::tt::target::ttnn::EltwiseOp *op, ProgramContext &context) { runEltwiseBinaryCompositeOp(op, tensorPool, ::ttnn::remainder); break; } + case ::tt::target::ttnn::EltwiseOpType::Scatter: { + runEltwiseBinaryCompositeOp(op, tensorPool, ::ttnn::scatter); + break; + } default: LOG_FATAL("Unsupported Eltwise Binary Composite operation"); } diff --git a/runtime/lib/ttnn/operations/eltwise/binary/binary_composite.h b/runtime/lib/ttnn/operations/eltwise/binary/binary_composite.h index 9be8bc6b7e..bd497fe98c 100644 --- a/runtime/lib/ttnn/operations/eltwise/binary/binary_composite.h +++ b/runtime/lib/ttnn/operations/eltwise/binary/binary_composite.h @@ -15,6 +15,7 @@ inline 
bool isBinaryCompositeOp(const ::tt::target::ttnn::EltwiseOp *op) { case ::tt::target::ttnn::EltwiseOpType::Maximum: case ::tt::target::ttnn::EltwiseOpType::Minimum: case ::tt::target::ttnn::EltwiseOpType::Remainder: + case ::tt::target::ttnn::EltwiseOpType::Scatter: return true; default: return false; diff --git a/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/eltwise/binary/utils.cpp b/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/eltwise/binary/utils.cpp index a54777ab28..f97f71e403 100644 --- a/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/eltwise/binary/utils.cpp +++ b/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/eltwise/binary/utils.cpp @@ -7,6 +7,15 @@ namespace tt::runtime::ttnn::operations::binary { +bool shouldSwapBinaryOperands(const ::tt::target::ttnn::EltwiseOp *op, + ::ttnn::Tensor **lhs, ::ttnn::Tensor **rhs) { + // For scatter, we expect the left-hand side operator to be lesser or equal in + // volume to the right hand side, so we omit the swap. + return (op->type() != ::tt::target::ttnn::EltwiseOpType::Scatter && + workaround::Env::get().swapBinaryOperands && + (*lhs)->volume() < (*rhs)->volume()); +} + void getEltwiseBinaryOpInputTensors(const ::tt::target::ttnn::EltwiseOp *op, ProgramTensorPool &tensorPool, ::ttnn::Tensor **lhs, @@ -21,8 +30,7 @@ void getEltwiseBinaryOpInputTensors(const ::tt::target::ttnn::EltwiseOp *op, // TODO(bug #1124): We're currently swapping the operands for binary ops // in runtime if the lhs operand is smaller (and requires broadcast onto the // rhs operand). We should add this check in the compiler. - if (workaround::Env::get().swapBinaryOperands && - (*lhs)->volume() < (*rhs)->volume()) { + if (shouldSwapBinaryOperands(op, lhs, rhs)) { std::swap(*lhs, *rhs); } } diff --git a/test/ttmlir/Conversion/StableHLOToTTIR/scatter_op.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/scatter_op.mlir new file mode 100644 index 0000000000..92cd8895fd --- /dev/null +++ b/test/ttmlir/Conversion/StableHLOToTTIR/scatter_op.mlir @@ -0,0 +1,16 @@ +// REQUIRES: stablehlo +// RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | FileCheck %s +#any_device = #tt.operand_constraint +module @jit_scatter attributes {} { + func.func public @test_scatter(%arg0: tensor<1x3x320x320xf32>, %arg1: tensor<1x1xi64>, %arg2: tensor<1x3x32x32xf32>) -> tensor<1x3x320x320xf32> { + // CHECK: [[VAL0:%[0-9]+]] = tensor.empty() : [[TENSOR_SIZE1:tensor<[0-9]+x[0-9]+x[0-9]+x[0-9]+xf[0-9]+>]] + %result = "stablehlo.scatter"(%arg0, %arg1, %arg2) <{indices_are_sorted = false, scatter_dimension_numbers = #stablehlo.scatter, unique_indices = false}> ({ + ^bb0(%arg3: tensor, %arg4: tensor): + stablehlo.return %arg4 : tensor + }) : (tensor<1x3x320x320xf32>, tensor<1x1xi64>, tensor<1x3x32x32xf32>) -> tensor<1x3x320x320xf32> + // CHECK: [[VAL1:%[0-9]+]] = "ttir.scatter"(%arg0, %arg1, %arg2, [[VAL0]]) <{index_vector_dim = 1 : i32, indices_are_sorted = false, input_batching_dims = array, inserted_window_dims = array, operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile, #any_device_tile], scatter_dims_to_operand_dims = array, scatter_indices_batching_dims = array, unique_indices = false, update_window_dims = array} + // CHECK: ([[TENSOR_SIZE1]], tensor<1x1xi32>, tensor<1x3x32x32xf32>, [[TENSOR_SIZE1]]) -> tensor<1x3x320x320xf32> + return %result : tensor<1x3x320x320xf32> + // CHECK: return [[VAL1]] : [[TENSOR_SIZE1]] + } +} diff --git a/test/ttmlir/Dialect/TTNN/simple_scatter.mlir 
b/test/ttmlir/Dialect/TTNN/simple_scatter.mlir new file mode 100644 index 0000000000..5991efeabe --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/simple_scatter.mlir @@ -0,0 +1,16 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline %s | FileCheck %s +#any_device_tile = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<1x3x320x320xf32>, %arg1: tensor<1x3x32x32xf32>) -> tensor<1x3x320x320xf32> { + %0 = tensor.empty() : tensor<1x3x320x320xf32> + %1 = tensor.empty() : tensor<1x1xi32> + // CHECK: [[VAL0:%[0-9]+]] = "ttnn.empty"(%{{[0-9]+}}) <{dtype = {{.*}}, layout = {{.*}}, memory_config = {{.*}}, shape = #ttnn.shape<[[TENSOR_SHAPE0:[0-9]+x[0-9]+x[0-9]+x[0-9]+]]>}> : (!tt.device<#device>) -> tensor<[[TENSOR_SHAPE1:[0-9]+x[0-9]+x[0-9]+x[0-9]+xf[0-9]+]], {{.*}}> + %2 = "ttir.scatter"(%arg0, %1, %arg1, %0) <{index_vector_dim = 1 : i32, indices_are_sorted = false, input_batching_dims = array, inserted_window_dims = array, operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile, #any_device_tile], scatter_dims_to_operand_dims = array, scatter_indices_batching_dims = array, unique_indices = false, update_window_dims = array}> ({ + ^bb0(%arg3: tensor<1xf32>, %arg4: tensor<1xf32>): + "ttir.yield"(%arg4) : (tensor<1xf32>) -> () + }) : (tensor<1x3x320x320xf32>, tensor<1x1xi32>, tensor<1x3x32x32xf32>, tensor<1x3x320x320xf32>) -> tensor<1x3x320x320xf32> + // CHECK: {{[0-9]+}} = "ttnn.scatter"(%4, %2, %5) <{operandSegmentSizes = array}> : (tensor<1x3x32x32xf32, {{.*}}>, tensor<[[TENSOR_SHAPE1]], {{.*}}>, tensor<[[TENSOR_SHAPE1]], {{.*}}>) -> tensor<[[TENSOR_SHAPE1]], {{.*}}> + return %2 : tensor<1x3x320x320xf32> + // CHECK: return %{{[0-9]+}} : tensor<[[TENSOR_SHAPE1]], {{.*}}> + } +} diff --git a/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir b/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir index 976f2867db..b7912d4c19 100644 --- a/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir +++ b/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir @@ -306,3 +306,13 @@ func.func @addint32(%arg0: tensor<64x128xi32>, %arg1: tensor<64x128xi32>) -> ten %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x128xi32>, tensor<64x128xi32>, tensor<64x128xi32>) -> tensor<64x128xi32> return %1 : tensor<64x128xi32> } + +func.func @scatter(%arg0: tensor<1x3x320x320xf32>, %arg1: tensor<1x3x32x32xf32>) -> tensor<1x3x320x320xf32> { + %0 = tensor.empty() : tensor<1x3x320x320xf32> + %1 = tensor.empty() : tensor<1x1xi32> + %2 = "ttir.scatter"(%arg0, %1, %arg1, %0) <{index_vector_dim = 1 : i32, indices_are_sorted = false, input_batching_dims = array, inserted_window_dims = array, operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile, #any_device_tile], scatter_dims_to_operand_dims = array, scatter_indices_batching_dims = array, unique_indices = false, update_window_dims = array}> ({ + ^bb0(%arg3: tensor<1xf32>, %arg4: tensor<1xf32>): + "ttir.yield"(%arg4) : (tensor<1xf32>) -> () + }) : (tensor<1x3x320x320xf32>, tensor<1x1xi32>, tensor<1x3x32x32xf32>, tensor<1x3x320x320xf32>) -> tensor<1x3x320x320xf32> + return %2 : tensor<1x3x320x320xf32> +} From cfbc6a1c96ba5eaabd61ae5c9d2aa7431b607d2b Mon Sep 17 00:00:00 2001 From: Vladimir Milosevic <157983820+vmilosevic@users.noreply.github.com> Date: Mon, 2 Dec 2024 12:02:46 +0100 Subject: [PATCH 37/84] Generate test reports xml (#1420) Generate test reports xml generate reports in xml format from test runs upload XML reports to artifacts use job_id in 
report name --- .github/Dockerfile.base | 6 +- .github/workflows/build-and-test.yml | 92 ++++++++++++++++++++-------- 2 files changed, 70 insertions(+), 28 deletions(-) diff --git a/.github/Dockerfile.base b/.github/Dockerfile.base index c0a01e6d69..e6fc33757c 100644 --- a/.github/Dockerfile.base +++ b/.github/Dockerfile.base @@ -28,7 +28,11 @@ RUN apt-get update && apt-get install -y \ graphviz \ patchelf \ libyaml-cpp-dev \ - libboost-all-dev + libboost-all-dev \ + curl \ + jq \ + sudo \ + gh # Install clang 17 RUN wget https://apt.llvm.org/llvm.sh && \ diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 32e4eee3d6..8ec0c93dc2 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -67,11 +67,22 @@ jobs: - name: Set reusable strings id: strings shell: bash + env: + job-name: "Build tt-mlir (${{ matrix.build.runs-on }}, ${{ matrix.build.enable_perf }}, ${{ matrix.build.enable_op_model }}, ${{ matrix.build.name }})" run: | echo "work-dir=$(pwd)" >> "$GITHUB_OUTPUT" echo "build-output-dir=$(pwd)/build" >> "$GITHUB_OUTPUT" echo "install-output-dir=$(pwd)/install" >> "$GITHUB_OUTPUT" + # Github job context unfortunately doesn't contain job_id, this is the workaround how to fetch it using GH API + echo "Expected job name: ${{ env.job-name }}" + JOB_ID=$(curl -s -H "Authorization: token ${{ secrets.GH_TOKEN }}" \ + "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/attempts/${{ github.run_attempt }}/jobs" | \ + jq -r '.jobs[] | select(.name | contains("${{ env.job-name }}")) | .id ') + echo "Current job id: $JOB_ID" + echo "job-id=$JOB_ID" >> "$GITHUB_OUTPUT" + echo "test_report_path=report_$JOB_ID.xml" >> "$GITHUB_OUTPUT" + - name: Git safe dir run: git config --global --add safe.directory ${{ steps.strings.outputs.work-dir }} @@ -145,18 +156,19 @@ jobs: run: | source env/activate cmake --build ${{ steps.strings.outputs.build-output-dir }} -- check-ttmlir + cp build/test/report.xml ${{ steps.strings.outputs.test_report_path }} - name: Upload Test Report uses: actions/upload-artifact@v4 with: name: test-reports-${{ matrix.build.runs-on }}-perf-${{ matrix.build.enable_perf }}-op_model-${{ matrix.build.enable_op_model }} - path: build/test/report.xml + path: ${{ steps.strings.outputs.test_report_path }} - name: Show Test Report uses: mikepenz/action-junit-report@v4 if: success() || failure() with: - report_paths: build/test/report.xml + report_paths: ${{ steps.strings.outputs.test_report_path }} check_name: MLIR Tests # Build and upload ttrt @@ -216,6 +228,7 @@ jobs: {runs-on: n300, enable_perf: OFF, name: "run", ttrt_flags: "--non-zero"}, {runs-on: n300, enable_perf: ON, name: "perf"}, ] + name: "run-tests (${{ matrix.build.runs-on }}, ${{ matrix.build.enable_perf }}, ${{ matrix.build.name }})" runs-on: - in-service @@ -239,11 +252,23 @@ jobs: - name: Set reusable strings id: strings shell: bash + env: + job-name: "run-tests (${{ matrix.build.runs-on }}, ${{ matrix.build.enable_perf }}, ${{ matrix.build.name }})" run: | echo "work-dir=$(pwd)" >> "$GITHUB_OUTPUT" echo "build-output-dir=$(pwd)/build" >> "$GITHUB_OUTPUT" echo "install-output-dir=$(pwd)/install" >> "$GITHUB_OUTPUT" + # Github job context unfortunately doesn't contain job_id, this is the workaround how to fetch it using GH API + echo "Expected job name: ${{ env.job-name }}" + JOB_ID=$(curl -s -H "Authorization: token ${{ secrets.GH_TOKEN }}" \ + "https://api.github.com/repos/${{ github.repository 
}}/actions/runs/${{ github.run_id }}/attempts/${{ github.run_attempt }}/jobs" | \ + jq -r '.jobs[] | select(.name | contains("${{ env.job-name }}")) | .id ') + echo "Current job id: $JOB_ID" + + echo "job-id=$JOB_ID" >> "$GITHUB_OUTPUT" + echo "test_report_path=report_$JOB_ID.xml" >> "$GITHUB_OUTPUT" + - name: Git safe dir run: git config --global --add safe.directory ${{ steps.strings.outputs.work-dir }} @@ -305,19 +330,27 @@ jobs: run: | source env/activate ttrt ${{ matrix.build.name }} ${{ matrix.build.ttrt_flags }} ${{ steps.strings.outputs.build-output-dir }}/test/ttmlir/Silicon/TTNN/perf_unit + cp ttrt_report.xml ${{ steps.strings.outputs.test_report_path }} - - name: Upload ttrt test report + - name: Upload ttrt test report json if: always() uses: actions/upload-artifact@v4 with: name: ${{ matrix.build.runs-on }}_${{ matrix.build.name }}_results.json path: ${{ matrix.build.name }}_results.json + - name: Upload Test Report xml + uses: actions/upload-artifact@v4 + if: success() || failure() + with: + name: test-reports-${{ matrix.build.runs-on }}-${{ matrix.test_group_id }} + path: ${{ steps.strings.outputs.test_report_path }} + - name: Show Test Report uses: mikepenz/action-junit-report@v4 if: success() || failure() with: - report_paths: ttrt_report.xml + report_paths: ${{ steps.strings.outputs.test_report_path }} check_name: TTRT ${{ matrix.build.runs-on }} ${{ matrix.build.name }} Tests run-ttrt-tests: @@ -348,6 +381,7 @@ jobs: - /opt/tt_metal_infra/provisioning/provisioning_env:/opt/tt_metal_infra/provisioning/provisioning_env steps: + - uses: actions/checkout@v4 with: fetch-depth: 0 @@ -355,11 +389,22 @@ jobs: - name: Set reusable strings id: strings shell: bash + env: + job-name: "${{ github.job }} (${{ matrix.build.runs-on }}, ${{ matrix.build.enable_perf }}, ${{ matrix.build.name }})" run: | echo "work-dir=$(pwd)" >> "$GITHUB_OUTPUT" echo "build-output-dir=$(pwd)/build" >> "$GITHUB_OUTPUT" echo "install-output-dir=$(pwd)/install" >> "$GITHUB_OUTPUT" + # Github job context unfortunately doesn't contain job_id, this is the workaround how to fetch it using GH API + echo "Expected job name: ${{ env.job-name }}" + JOB_ID=$(curl -s -H "Authorization: token ${{ secrets.GH_TOKEN }}" \ + "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/attempts/${{ github.run_attempt }}/jobs" | \ + jq -r '.jobs[] | select(.name | contains("${{ env.job-name }}")) | .id ') + echo "Current job id: $JOB_ID" + echo "job-id=$JOB_ID" >> "$GITHUB_OUTPUT" + echo "test_report_path=report_$JOB_ID.xml" >> "$GITHUB_OUTPUT" + - name: Git safe dir run: git config --global --add safe.directory ${{ steps.strings.outputs.work-dir }} @@ -413,31 +458,22 @@ jobs: shell: bash run: | source env/activate - pytest -ssv runtime/tools/python/test/test_read.py - - - name: ttrt query tests - shell: bash - run: | - source env/activate - pytest -ssv runtime/tools/python/test/test_query.py - - - name: ttrt check tests - shell: bash - run: | - source env/activate - pytest -ssv runtime/tools/python/test/test_check.py + pytest -ssv runtime/tools/python/test \ + --junit-xml=${{ steps.strings.outputs.test_report_path }} - - name: ttrt run tests - shell: bash - run: | - source env/activate - pytest -ssv runtime/tools/python/test/test_run.py + - name: Upload Test Report + uses: actions/upload-artifact@v4 + if: success() || failure() + with: + name: test-reports-${{ matrix.build.runs-on }}-${{ matrix.build.name }} + path: ${{ steps.strings.outputs.test_report_path }} - - name: ttrt perf tests - shell: 
bash - run: | - source env/activate - pytest -ssv runtime/tools/python/test/test_perf.py + - name: Show Test Report + uses: mikepenz/action-junit-report@v4 + if: success() || failure() + with: + report_paths: ${{ steps.strings.outputs.test_report_path }} + check_name: Run ttrt tests build-and-test-explorer: needs: build-image @@ -474,6 +510,7 @@ jobs: run: | echo "work-dir=$(pwd)" >> "$GITHUB_OUTPUT" echo "build-output-dir=$(pwd)/build" >> "$GITHUB_OUTPUT" + echo "install-output-dir=$(pwd)/install" >> "$GITHUB_OUTPUT" - name: Git safe dir run: git config --global --add safe.directory ${{ steps.strings.outputs.work-dir }} @@ -512,3 +549,4 @@ jobs: run: | source env/activate pytest tools/explorer/test/run_tests.py + # collect results From e2c982cf7b5bdb7474725c94edda7421fcceb6e4 Mon Sep 17 00:00:00 2001 From: Usman Aziz Date: Mon, 2 Dec 2024 20:11:27 +0500 Subject: [PATCH 38/84] Move BroadcastOp folding to a seperate TTIR pass. (#1353) * Move BroadcastOp folding to a seperate TTIR pass. Add more tests. --- .../ttmlir/Dialect/TTIR/Transforms/Passes.td | 13 ++++ lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp | 41 ----------- lib/Dialect/TTIR/Transforms/Broadcast.cpp | 68 +++++++++++++++++++ lib/Dialect/TTIR/Transforms/CMakeLists.txt | 1 + lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp | 13 ++++ .../StableHLOToTTIR/broadcast_op.mlir | 51 ++++++++++++++ .../TTNN/arange/arange_tests_positive.mlir | 2 + .../arange/simple_device_arange_dim2.mlir | 2 + .../ttmlir/Silicon/TTNN/simple_broadcast.mlir | 14 ---- 9 files changed, 150 insertions(+), 55 deletions(-) create mode 100644 lib/Dialect/TTIR/Transforms/Broadcast.cpp delete mode 100644 test/ttmlir/Silicon/TTNN/simple_broadcast.mlir diff --git a/include/ttmlir/Dialect/TTIR/Transforms/Passes.td b/include/ttmlir/Dialect/TTIR/Transforms/Passes.td index 63ccb0d28a..b6269f7153 100644 --- a/include/ttmlir/Dialect/TTIR/Transforms/Passes.td +++ b/include/ttmlir/Dialect/TTIR/Transforms/Passes.td @@ -112,4 +112,17 @@ def TTIRLoadSystemDesc: Pass<"ttir-load-system-desc", "::mlir::ModuleOp"> { ]; } +def TTIRBroadcastFold: Pass<"ttir-broadcast-fold", "::mlir::ModuleOp"> { + let summary = "Broadcast operation is folded to all the consumers."; + let description = [{ + This pass walks through the graph and folds all broadcast instructions since broadcast is supported implicitly by backend ops. + Example: + %1 = "ttir.broadcast"(%arg0) (tensor<1xf32>) -> tensor<512xf32> + %2 = "ttir.maximum"(%1, %arg1) (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + + This above broadcast is folded as: + %1 = "ttir.maximum"(%arg0, %arg1) (tensor<1xf32>, tensor<512xf32>) -> tensor<512xf32> + }]; +} + #endif diff --git a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp index 18efb982e5..789485eac3 100644 --- a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp +++ b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp @@ -812,46 +812,6 @@ class TypecastOpConversionPattern } }; -class BroadcastOpConversionPattern - : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - -public: - LogicalResult - matchAndRewrite(ttir::BroadcastOp srcOp, ttir::BroadcastOp::Adaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - - // Fold this operation into all consumer ops. It will only work with TTNN - // ops that support implicit broadcasting. We expect each Op's verify - // function to assert their arguments to verify that they can broadcast. - - if (srcOp->getUsers().empty()) { - // This broadcast chain has already been replaced. 
- rewriter.eraseOp(srcOp); - return success(); - } - - mlir::Value input = srcOp.getOperand(0); - - mlir::Operation *nextOp = srcOp; - while (isa(*nextOp->getUsers().begin())) { - assert(nextOp->hasOneUse() && - "Broadcast with multiple uses are not supported"); - nextOp = *nextOp->getUsers().begin(); - if (nextOp->getUsers().empty()) { - // This broadcast chain has already been replaced. - rewriter.eraseOp(srcOp); - return success(); - } - } - - rewriter.replaceAllOpUsesWith(nextOp, input); - rewriter.eraseOp(srcOp); - - return success(); - } -}; - class SubtractOpConversionPattern : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; @@ -1019,7 +979,6 @@ void populateTTIRToTTNNPatterns(MLIRContext *ctx, RewritePatternSet &patterns, ReductionOpConversionPattern, ReductionOpConversionPattern, ReductionOpConversionPattern, - BroadcastOpConversionPattern, EmbeddingOpConversionPattern, SoftmaxOpConversionPattern, TransposeOpConversionPattern, diff --git a/lib/Dialect/TTIR/Transforms/Broadcast.cpp b/lib/Dialect/TTIR/Transforms/Broadcast.cpp new file mode 100644 index 0000000000..7823b021ed --- /dev/null +++ b/lib/Dialect/TTIR/Transforms/Broadcast.cpp @@ -0,0 +1,68 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "ttmlir/Dialect/TT/IR/TT.h" +#include "ttmlir/Dialect/TTIR/Transforms/Passes.h" + +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include + +namespace mlir::tt::ttir { +#define GEN_PASS_DEF_TTIRBROADCASTFOLD +#include "ttmlir/Dialect/TTIR/Transforms/Passes.h.inc" + +//===----------------------------------------------------------------------===// +// Broadcast Folding pass +// Our backend supports implicit broadcast of operands, so explicit broadcast +// instructions are folded. 
+// +// For Example: +// +// %0 = tensor.empty() : tensor<512xf32> +// %1 = "ttir.broadcast"(%arg0, %0) (tensor<1xf32>, tensor<512xf32>) -> +// tensor<512xf32> %2 = tensor.empty() : tensor<512xf32> %3 = "ttir.maximum"(%1, +// %arg1, %2) (tensor<512xf32>, tensor<512xf32>, tensor<512xf32>) -> +// tensor<512xf32> +// +// After folding: +// +// %0 = tensor.empty() : tensor<512xf32> +// %1 = "ttir.maximum"(%arg0, %arg1, %0) (tensor<1xf32>, tensor<512xf32>, +// tensor<512xf32>) -> tensor<512xf32> +//===----------------------------------------------------------------------===// + +class TTIRBroadcastFoldRewriter : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(BroadcastOp op, + PatternRewriter &rewriter) const final { + + rewriter.replaceOp(op, op->getOperand(0)); + return success(); + } +}; + +class TTIRBroadcastFold + : public impl::TTIRBroadcastFoldBase { +public: + using impl::TTIRBroadcastFoldBase::TTIRBroadcastFoldBase; + + void runOnOperation() final { + RewritePatternSet patterns(&getContext()); + patterns.add(&getContext()); + FrozenRewritePatternSet patternSet(std::move(patterns)); + if (failed(applyPatternsAndFoldGreedily(getOperation(), patternSet))) { + signalPassFailure(); + return; + } + } + + void getDependentDialects(mlir::DialectRegistry ®istry) const override { + registry.insert(); + registry.insert(); + } +}; + +} // namespace mlir::tt::ttir diff --git a/lib/Dialect/TTIR/Transforms/CMakeLists.txt b/lib/Dialect/TTIR/Transforms/CMakeLists.txt index f5fec45a8b..597c55e3ca 100644 --- a/lib/Dialect/TTIR/Transforms/CMakeLists.txt +++ b/lib/Dialect/TTIR/Transforms/CMakeLists.txt @@ -1,5 +1,6 @@ add_mlir_dialect_library(MLIRTTIRTransforms Allocate.cpp + Broadcast.cpp Constant.cpp Generic.cpp Layout.cpp diff --git a/lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp b/lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp index 24980fb7c0..3ade96bf82 100644 --- a/lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp +++ b/lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp @@ -107,9 +107,22 @@ void createTTNNPipelineDeallocPassFromString(OpPassManager &pm, createTTNNPipelineDeallocPass(pm, *optionsStruct); } +void createTTNNPipelineTTIRBroadcastFoldPass( + OpPassManager &pm, const TTIRToTTNNBackendPipelineOptions &options) { + pm.addPass(mlir::tt::ttir::createTTIRBroadcastFold()); +} + +void createTTNNPipelineTTIRBroadcastFoldPassFromString(OpPassManager &pm, + std::string options) { + auto optionsStruct = + TTIRToTTNNBackendPipelineOptions::createFromString(options); + createTTNNPipelineTTIRBroadcastFoldPass(pm, *optionsStruct); +} + void createTTIRToTTNNBackendPipeline( OpPassManager &pm, const TTIRToTTNNBackendPipelineOptions &options) { createTTNNPipelineTTIRPasses(pm, options); + createTTNNPipelineTTIRBroadcastFoldPass(pm, options); createTTNNPipelineLoweringPasses(pm, options); createTTNNPipelineAnalysisPasses(pm, options); createTTNNPipelineLayoutDecompositionPass(pm, options); diff --git a/test/ttmlir/Conversion/StableHLOToTTIR/broadcast_op.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/broadcast_op.mlir index fa6cbb4236..42a26ad15f 100644 --- a/test/ttmlir/Conversion/StableHLOToTTIR/broadcast_op.mlir +++ b/test/ttmlir/Conversion/StableHLOToTTIR/broadcast_op.mlir @@ -8,3 +8,54 @@ module @jit_broadcast attributes {mhlo.num_partitions = 1 : i32, mhlo.num_replic return %1 : tensor<512x512xf32> } } + +module { + func.func @main(%arg0: tensor<1x23x40x1xf32>, %arg1: tensor<128xf32>) -> tensor<1x23x40x128xf32> { + %0 = stablehlo.broadcast_in_dim %arg0, 
dims = [0, 1, 2, 3] : (tensor<1x23x40x1xf32>) -> tensor<1x23x40x128xf32> + %1 = stablehlo.broadcast_in_dim %arg1, dims = [3] : (tensor<128xf32>) -> tensor<1x23x40x128xf32> + // CHECK: %[[C:.*]] = "ttir.broadcast"[[C:.*]] + %2 = stablehlo.divide %0, %1 : tensor<1x23x40x128xf32> + return %2 : tensor<1x23x40x128xf32> + } +} + +module { + func.func @main(%arg0: tensor<32xi64>, %arg1: tensor<32x1xi64>) -> tensor<32x32xi1> { + %0 = stablehlo.broadcast_in_dim %arg0, dims = [1] : (tensor<32xi64>) -> tensor<32x32xi64> + %1 = stablehlo.broadcast_in_dim %arg1, dims = [0, 1] : (tensor<32x1xi64>) -> tensor<32x32xi64> + %2 = stablehlo.compare GT, %0, %1, SIGNED : (tensor<32x32xi64>, tensor<32x32xi64>) -> tensor<32x32xi1> + // CHECK: %[[C:.*]] = "ttir.broadcast"[[C:.*]] + return %2 : tensor<32x32xi1> + } +} + +module { + func.func @main(%arg0: tensor<16x1xf32>, %arg1: tensor<1x1x32xi64>) -> tensor<1x16x32xf32> { + %0 = stablehlo.convert %arg1 : (tensor<1x1x32xi64>) -> tensor<1x1x32xf32> + %1 = stablehlo.broadcast_in_dim %arg0, dims = [1, 2] : (tensor<16x1xf32>) -> tensor<1x16x32xf32> + %2 = stablehlo.broadcast_in_dim %0, dims = [0, 1, 2] : (tensor<1x1x32xf32>) -> tensor<1x16x32xf32> + %3 = stablehlo.multiply %1, %2 : tensor<1x16x32xf32> + // CHECK: %[[C:.*]] = "ttir.broadcast"[[C:.*]] + return %3 : tensor<1x16x32xf32> + } +} + +module { + func.func @main(%arg0: tensor<1x10xi64>, %arg1: tensor<10x1xi64>) -> tensor<10x10xi64> { + %0 = stablehlo.broadcast_in_dim %arg0, dims = [0, 1] : (tensor<1x10xi64>) -> tensor<10x10xi64> + %1 = stablehlo.broadcast_in_dim %arg1, dims = [0, 1] : (tensor<10x1xi64>) -> tensor<10x10xi64> + %2 = stablehlo.subtract %0, %1 : tensor<10x10xi64> + // CHECK: %[[C:.*]] = "ttir.broadcast"[[C:.*]] + return %2 : tensor<10x10xi64> + } +} + +module { + func.func @main(%arg0: tensor<8xf32>, %arg1: tensor<1xf32>) -> tensor<8xf32> { + %0 = stablehlo.broadcast_in_dim %arg0, dims = [0] : (tensor<8xf32>) -> tensor<8xf32> + %1 = stablehlo.broadcast_in_dim %arg1, dims = [0] : (tensor<1xf32>) -> tensor<8xf32> + %2 = stablehlo.add %0, %1 : tensor<8xf32> + // CHECK: %[[C:.*]] = "ttir.broadcast"[[C:.*]] + return %2 : tensor<8xf32> + } +} diff --git a/test/ttmlir/Dialect/TTNN/arange/arange_tests_positive.mlir b/test/ttmlir/Dialect/TTNN/arange/arange_tests_positive.mlir index 4c04e138bb..16c396c00e 100644 --- a/test/ttmlir/Dialect/TTNN/arange/arange_tests_positive.mlir +++ b/test/ttmlir/Dialect/TTNN/arange/arange_tests_positive.mlir @@ -1,4 +1,6 @@ // RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline %s | FileCheck %s +// XFAIL: * +// https://github.com/tenstorrent/tt-mlir/issues/1448 #any_device = #tt.operand_constraint module attributes {} { func.func @forward(%arg0: tensor<1x32x128x128xf32>) -> tensor<1x32x128x128xf32> { diff --git a/test/ttmlir/Silicon/TTNN/arange/simple_device_arange_dim2.mlir b/test/ttmlir/Silicon/TTNN/arange/simple_device_arange_dim2.mlir index ec509a1b6f..f3affc69d4 100644 --- a/test/ttmlir/Silicon/TTNN/arange/simple_device_arange_dim2.mlir +++ b/test/ttmlir/Silicon/TTNN/arange/simple_device_arange_dim2.mlir @@ -1,6 +1,8 @@ // RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir // RUN: FileCheck %s --input-file=%t.mlir // RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +// UNSUPPORTED: true +// https://github.com/tenstorrent/tt-mlir/issues/1448 #any_device = #tt.operand_constraint module attributes {} { func.func @forward(%arg0: tensor<1x1x32x128xbf16>) -> tensor<1x1x32x128xbf16> { diff --git 
a/test/ttmlir/Silicon/TTNN/simple_broadcast.mlir b/test/ttmlir/Silicon/TTNN/simple_broadcast.mlir deleted file mode 100644 index 1d88725d1d..0000000000 --- a/test/ttmlir/Silicon/TTNN/simple_broadcast.mlir +++ /dev/null @@ -1,14 +0,0 @@ -// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir -// RUN: FileCheck %s --input-file=%t.mlir -// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn -#any_device = #tt.operand_constraint - -func.func public @broadcast() -> (tensor<32xf32>) { - %0 = "ttir.constant"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32> - %1 = tensor.empty() : tensor<32xf32> - %2 = "ttir.broadcast"(%0, %1) <{dimension = [0], operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<1xf32>, tensor<32xf32>) -> tensor<32xf32> - %3 = tensor.empty() : tensor<32xf32> - %4 = "ttir.broadcast"(%2, %3) <{dimension = [0], operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<32xf32>, tensor<32xf32>) -> tensor<32xf32> - // CHECK-NOT: %[[C:.*]] = "ttir.broadcast"[[C:.*]] - return %4 : tensor<32xf32> -} From 12fc71c1b4b86dab7eb0b06181ff45bdf375f2c6 Mon Sep 17 00:00:00 2001 From: Vlad Roubtsov Date: Mon, 2 Dec 2024 11:47:37 -0600 Subject: [PATCH 39/84] implement eltwise_max direct to metal (#1335) * made the necessary tablegen changes to switch max_* ops to be generic binary ops * added TTIRToTTMetal conversion logic to support SFPU ops taking DST operands * added #include for 'compute_kernel_api.h' for metal max_* ops in emitc converter --- include/ttmlir/Dialect/TTIR/IR/TTIROps.td | 27 ++-- .../ttmlir/Dialect/TTKernel/IR/TTKernelOps.td | 34 ++++++ .../TTIRToTTMetal/TTIRToTTMetal.cpp | 115 +++++++++++++++--- .../TTKernelToEmitC/TTKernelToEmitC.cpp | 4 + lib/Dialect/TTIR/IR/TTIROps.cpp | 7 ++ test/ttmlir/Silicon/TTMetal/simple_max.mlir | 13 ++ 6 files changed, 167 insertions(+), 33 deletions(-) create mode 100644 test/ttmlir/Silicon/TTMetal/simple_max.mlir diff --git a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td index 8908e470e0..f55b3acbde 100644 --- a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td +++ b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td @@ -506,18 +506,6 @@ def TTIR_LogicalXorOp : TTIR_ElementwiseBinaryOp<"logical_xor"> { }]; } -def TTIR_MaximumOp : TTIR_ElementwiseBinaryOp<"maximum"> { - let summary = "Eltwise maximum OP."; - let description = [{ - Calculates maximum of input tensors' values element-wise and stores result in output tensor. - - Example: - %lhs: [[3, 2, 7], [1, 4, 4]] - %rhs: [[1, 4, 2], [1, 2, 3]] - "ttir.maximum"(%lhs, %rhs, %out) -> %out: [[3, 4, 7], [1, 4, 4]] - }]; -} - def TTIR_MinimumOp : TTIR_ElementwiseBinaryOp<"minimum"> { let summary = "Eltwise minimum OP."; let description = [{ @@ -1244,6 +1232,20 @@ def TTIR_DivOp : TTIR_GenericElementwiseBinaryOp<"div"> { }]; } +def TTIR_MaximumOp : TTIR_GenericElementwiseBinaryOp<"maximum"> { + let summary = "Eltwise maximum."; + let description = [{ + Calculates maximum of input tensors' values element-wise and stores result in output tensor. 
+ + Example: + %lhs: [[3, 2, 7], [1, 4, 4]] + %rhs: [[1, 4, 2], [1, 2, 3]] + "ttir.maximum"(%lhs, %rhs, %out) -> %out: [[3, 4, 7], [1, 4, 4]] + }]; +} + +//===----------------------------------------------------------------------===// + def TTIR_ScatterOp: TTIR_DPSOp<"scatter"> { let summary = "Scatter operation"; let description = [{ @@ -1275,7 +1277,6 @@ def TTIR_ScatterOp: TTIR_DPSOp<"scatter"> { let extraClassDeclaration = [{ MutableOperandRange getDpsInitsMutable() { return getOutputMutable(); } }]; - } //===----------------------------------------------------------------------===// diff --git a/include/ttmlir/Dialect/TTKernel/IR/TTKernelOps.td b/include/ttmlir/Dialect/TTKernel/IR/TTKernelOps.td index c0f6d43619..ed70d7da68 100644 --- a/include/ttmlir/Dialect/TTKernel/IR/TTKernelOps.td +++ b/include/ttmlir/Dialect/TTKernel/IR/TTKernelOps.td @@ -180,6 +180,15 @@ def TTKernel_MulOp : TTKernel_Op<"mul"> { let arguments = (ins I32:$dst_index); } +def TTKernel_MaxOp : TTKernel_Op<"max"> { + let summary = "Max operation"; + let description = [{ + Max operation + }]; + + let arguments = (ins I32:$dst_index); +} + def TTKernel_MatmulOp : TTKernel_Op<"matmul"> { let summary = "Matmul operation"; let description = [{ @@ -333,6 +342,31 @@ def TTKernel_ReduceTileOp : TTKernel_Op<"reduce_tile"> { TTKernel_ReduceDimAttr:$reduce_dim); } +//===----------------------------------------------------------------------===// +// TTKernel SFPU operations +//===----------------------------------------------------------------------===// + +def TTKernel_MaxTilesInitOp : TTKernel_Op<"max_tile_init"> { + let summary = "Short init function"; + let description = [{ + Must be run before max_tile. + }]; + + let arguments = (ins); +} + +def TTKernel_MaxTilesOp : TTKernel_Op<"max_tile"> { + let summary = "Max operation"; + let description = [{ + Performs element-wise computation of maximum operation + DST[dst0_index] <- max(DST[dst0_index], DST[dst1_index]) + on DST register operands. The DST register buffer must be in + acquired state via *tile_regs_acquire* call. + }]; + + let arguments = (ins I32:$dst0_index, I32:$dst1_index); +} + //===----------------------------------------------------------------------===// // TTKernel CB operations //===----------------------------------------------------------------------===// diff --git a/lib/Conversion/TTIRToTTMetal/TTIRToTTMetal.cpp b/lib/Conversion/TTIRToTTMetal/TTIRToTTMetal.cpp index 60c0328197..09727e2034 100644 --- a/lib/Conversion/TTIRToTTMetal/TTIRToTTMetal.cpp +++ b/lib/Conversion/TTIRToTTMetal/TTIRToTTMetal.cpp @@ -799,6 +799,8 @@ class TTIRToTTMetalDispatchRewriter : public OpRewritePattern { inCB1); } else if (mlir::isa(arithOrMathOp)) { builder.create(arithOrMathOp.getLoc()); + } else if (mlir::isa(arithOrMathOp)) { + builder.create(arithOrMathOp.getLoc()); } else { llvm_unreachable("Unhandled binary op init conversion."); } @@ -905,27 +907,13 @@ class TTIRToTTMetalDispatchRewriter : public OpRewritePattern { assert(cbOperands.size() == 3 && "Expected two input and one output CB for binary op."); - auto inCB0TileIndex = iterators[blockArgIteratorMapping[0]]; - auto inCB0 = cbOperands[0]; - auto inCB1TileIndex = iterators[blockArgIteratorMapping[1]]; - auto inCB1 = cbOperands[1]; - auto outCB = cbOperands[2]; - auto outCBTileIndex = iterators[blockArgIteratorMapping[2]]; - - auto location = arithOrMathOp.getLoc(); - - // Perform computation C = A (*) B on tile A from inCB0 and tile B from - // inCB1 and store the result C in DST register on dstTileIndex. 
+ // Perform computation C = A (*) B on tile A from cbOperands[0] and tile B + // from cbOperands[1] and store the result C in DST register on + // dstTileIndex. if (mlir::isa(arithOrMathOp)) { - Value dstIndex = i32(0, builder); - builder.create(location); - builder.create( - location, inCB0, inCB1, inCB0TileIndex, inCB1TileIndex, dstIndex); - builder.create(location); - builder.create(location); - builder.create(location, dstIndex, outCB, - outCBTileIndex); - builder.create(location); + convertComputeBinaryFPUOp( + arithOrMathOp, cbOperands, iterators, blockArgIteratorMapping, + builder); } else if (mlir::isa(arithOrMathOp)) { commonComputeMulOp(arithOrMathOp, cbOperands, iterators, blockArgIteratorMapping, builder); @@ -938,6 +926,10 @@ class TTIRToTTMetalDispatchRewriter : public OpRewritePattern { blockArgIteratorMapping, builder, operandIndicesRecip); + auto inCB0 = cbOperands[0]; + auto inCB1 = cbOperands[1]; + auto location = arithOrMathOp.getLoc(); + Value one = i32(1, builder); builder.create(location, inCB1, one); @@ -947,12 +939,95 @@ class TTIRToTTMetalDispatchRewriter : public OpRewritePattern { blockArgIteratorMapping, builder); builder.create(location, inCB1, one); + } else if (mlir::isa(arithOrMathOp)) { + convertComputeBinarySFPUOp( + arithOrMathOp, cbOperands, iterators, blockArgIteratorMapping, + builder); } else { llvm_unreachable("Unhandled conversion for operation which is neither " "unary nor binary."); } } + template + void convertComputeBinaryFPUOp( + Operation &arithOrMathOp, ArrayRef cbOperands, + ArrayRef iterators, + const SmallVector &blockArgIteratorMapping, + OpBuilder &builder) const { + auto inCB0TileIndex = iterators[blockArgIteratorMapping[0]]; + auto inCB0 = cbOperands[0]; + auto inCB1TileIndex = iterators[blockArgIteratorMapping[1]]; + auto inCB1 = cbOperands[1]; + auto outCB = cbOperands[2]; + auto outCBTileIndex = iterators[blockArgIteratorMapping[2]]; + + auto location = arithOrMathOp.getLoc(); + + Value dstIndex = i32(0, builder); + + // acquire DST register lock (MATH) + builder.create(location); + { + builder.create(location, inCB0, inCB1, inCB0TileIndex, + inCB1TileIndex, dstIndex); + } + builder.create(location); + // release DST register lock (MATH) + + // acquire DST register lock (PACK) + builder.create(location); + { + builder.create(location, dstIndex, outCB, + outCBTileIndex); + } + builder.create(location); + // release DST register lock (PACK) + } + + template + void convertComputeBinarySFPUOp( + Operation &arithOrMathOp, ArrayRef cbOperands, + ArrayRef iterators, + const SmallVector &blockArgIteratorMapping, + OpBuilder &builder) const { + auto inCB0TileIndex = iterators[blockArgIteratorMapping[0]]; + auto inCB0 = cbOperands[0]; + auto inCB1TileIndex = iterators[blockArgIteratorMapping[1]]; + auto inCB1 = cbOperands[1]; + auto outCB = cbOperands[2]; + auto outCBTileIndex = iterators[blockArgIteratorMapping[2]]; + + auto location = arithOrMathOp.getLoc(); + + Value dstLhsTileIndex = i32(0, builder); + Value dstRhsTileIndex = i32(1, builder); // note: rhs is always lhs+1 + + // acquire DST register lock (MATH) + builder.create(location); + { + // copy inCB0[inCB0TileIndex] and inCB1[inCB1TileIndex] to DST: + builder.create(location, inCB0, inCB0TileIndex, + dstLhsTileIndex); + builder.create(location, inCB1, inCB1TileIndex, + dstRhsTileIndex); + // SFPU ooperates on DST tiles: + builder.create(location, dstLhsTileIndex, + dstRhsTileIndex); + } + builder.create(location); + // release DST register lock (MATH) + + // acquire DST register 
lock (PACK) + builder.create(location); + { + builder.create(location, dstLhsTileIndex, outCB, + outCBTileIndex); + } + builder.create(location); + // release DST register lock (PACK) + } + void commonComputeMulOp(Operation &op, ArrayRef cbOperands, ArrayRef iterators, SmallVector blockArgIteratorMapping, diff --git a/lib/Conversion/TTKernelToEmitC/TTKernelToEmitC.cpp b/lib/Conversion/TTKernelToEmitC/TTKernelToEmitC.cpp index b907ad7f36..c265e89283 100644 --- a/lib/Conversion/TTKernelToEmitC/TTKernelToEmitC.cpp +++ b/lib/Conversion/TTKernelToEmitC/TTKernelToEmitC.cpp @@ -406,8 +406,10 @@ class ConvertTTKernelToEmitCPass TTMetalToEmitCOpaqueRewriter, TTMetalToEmitCOpaqueRewriter, TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, TTMetalToEmitCOpaqueRewriter, TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, TTMetalToEmitCOpaqueRewriter, TTMetalToEmitCOpaqueRewriter, TTMetalToEmitCOpaqueRewriter, @@ -479,6 +481,8 @@ class ThreadConfigHelper { builder->create(loc, "compute_kernel_api/eltwise_binary.h", /*isStandard=*/false); + builder->create(loc, "compute_kernel_api.h", // max ops + /*isStandard=*/false); builder->create(loc, "compute_kernel_api/tile_move_copy.h", /*isStandard=*/false); diff --git a/lib/Dialect/TTIR/IR/TTIROps.cpp b/lib/Dialect/TTIR/IR/TTIROps.cpp index aacb2a43de..a3a6dd586c 100644 --- a/lib/Dialect/TTIR/IR/TTIROps.cpp +++ b/lib/Dialect/TTIR/IR/TTIROps.cpp @@ -1466,6 +1466,13 @@ void mlir::tt::ttir::DivOp::buildGenericRegion(::mlir::OpBuilder &opBuilder, block); } +// MaximumOp generic region builder +void mlir::tt::ttir::MaximumOp::buildGenericRegion(::mlir::OpBuilder &opBuilder, + ::mlir::Block *block) { + buildGenericEltwiseBinaryRegion(getLoc(), opBuilder, + block); +} + //===----------------------------------------------------------------------===// // KernelOp //===----------------------------------------------------------------------===// diff --git a/test/ttmlir/Silicon/TTMetal/simple_max.mlir b/test/ttmlir/Silicon/TTMetal/simple_max.mlir new file mode 100644 index 0000000000..92bdbe72c7 --- /dev/null +++ b/test/ttmlir/Silicon/TTMetal/simple_max.mlir @@ -0,0 +1,13 @@ +// RUN: ttmlir-opt --ttir-to-ttmetal-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir +// RUN: FileCheck %s --input-file=%t.mlir +// RUN: ttmlir-translate --ttmetal-to-flatbuffer %t.mlir > %t.ttm + +#any_device = #tt.operand_constraint + +func.func @maximum(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> { + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] + %0 = tensor.empty() : tensor<64x128xf32> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] + %1 = "ttir.maximum"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> + return %1 : tensor<64x128xf32> +} From 139c7cceab2fe312a7cc199e0aafac43574ce492 Mon Sep 17 00:00:00 2001 From: Vraj Prajapati Date: Mon, 2 Dec 2024 12:04:28 -0600 Subject: [PATCH 40/84] Added LOC info to TTRT Perf output (#1401) * Added LOC info to TTRT Perf output * Rebase fixes * Non-Runtime build fix * Readability fixes * Fixed row removal in perf results * Added LocInfo --- include/ttmlir/Target/TTNN/program.fbs | 1 + include/ttmlir/Target/Utils/FuncOpToProgram.h | 10 +- .../ttmlir/Target/Utils/MLIRToFlatbuffer.h | 6 +- lib/Target/TTNN/TTNNToFlatbuffer.cpp | 181 +++++++++++------- runtime/include/tt/runtime/detail/ttmetal.h | 2 + runtime/include/tt/runtime/detail/ttnn.h 
| 2 + runtime/include/tt/runtime/runtime.h | 2 + runtime/lib/runtime.cpp | 15 ++ runtime/lib/ttmetal/runtime.cpp | 6 + runtime/lib/ttnn/program.cpp | 11 ++ runtime/lib/ttnn/runtime.cpp | 6 + runtime/tools/python/ttrt/common/perf.py | 37 ++++ runtime/tools/python/ttrt/runtime/module.cpp | 2 + third_party/CMakeLists.txt | 1 + 14 files changed, 212 insertions(+), 70 deletions(-) diff --git a/include/ttmlir/Target/TTNN/program.fbs b/include/ttmlir/Target/TTNN/program.fbs index f145aaf657..19b1dbc92a 100644 --- a/include/ttmlir/Target/TTNN/program.fbs +++ b/include/ttmlir/Target/TTNN/program.fbs @@ -286,6 +286,7 @@ union OpType { table Operation { type: OpType; debug_info: string; + loc_info: string; } table Program { diff --git a/include/ttmlir/Target/Utils/FuncOpToProgram.h b/include/ttmlir/Target/Utils/FuncOpToProgram.h index d9e8d98207..a28f2f5e9a 100644 --- a/include/ttmlir/Target/Utils/FuncOpToProgram.h +++ b/include/ttmlir/Target/Utils/FuncOpToProgram.h @@ -31,6 +31,13 @@ inline std::string getOpDebugString(mlir::Operation *op, return str; }; +inline std::string getOpLocInfo(mlir::Operation *op) { + std::string str; + llvm::raw_string_ostream os(str); + op->getLoc().print(os); + return str; +} + inline Value getOperandThroughDPSOps(Value value) { auto *op = value.getDefiningOp(); if (!op) { @@ -76,7 +83,8 @@ Program funcOpToProgram(FlatbufferObjectCache &cache, func::FuncOp entry, } } else { std::string debugStr = getOpDebugString(op, printFlags); - program.ops.push_back(fn(cache, op, debugStr)); + std::string locInfo = getOpLocInfo(op); + program.ops.push_back(fn(cache, op, debugStr, locInfo)); } }); diff --git a/include/ttmlir/Target/Utils/MLIRToFlatbuffer.h b/include/ttmlir/Target/Utils/MLIRToFlatbuffer.h index d5be2bb97c..cb9439d978 100644 --- a/include/ttmlir/Target/Utils/MLIRToFlatbuffer.h +++ b/include/ttmlir/Target/Utils/MLIRToFlatbuffer.h @@ -479,7 +479,11 @@ toDebugInfo(::flatbuffers::FlatBufferBuilder &fbb, std::string const &name, ModuleOp module) { std::string source; llvm::raw_string_ostream os(source); - module->print(os); + + mlir::OpPrintingFlags flags; + flags.enableDebugInfo(); // Enable the loc dumping + module->print(os, flags); + return ::tt::target::CreateMLIRDirect(fbb, name.c_str(), source.c_str()); } } // namespace mlir::tt diff --git a/lib/Target/TTNN/TTNNToFlatbuffer.cpp b/lib/Target/TTNN/TTNNToFlatbuffer.cpp index 34a0c4725d..9706880e38 100644 --- a/lib/Target/TTNN/TTNNToFlatbuffer.cpp +++ b/lib/Target/TTNN/TTNNToFlatbuffer.cpp @@ -163,10 +163,10 @@ createDeviceRef(FlatbufferObjectCache &cache, Value device) { template ::flatbuffers::Offset<::tt::target::ttnn::Operation> createOperation(FlatbufferObjectCache &cache, ::flatbuffers::Offset op, - std::string const &debugString) { + std::string const &debugString, std::string const &locInfo) { return CreateOperationDirect( *cache.fbb, ::tt::target::ttnn::OpTypeTraits::enum_value, op.Union(), - debugString.c_str()); + debugString.c_str(), locInfo.c_str()); } ::flatbuffers::Offset<::tt::target::ttnn::GetDeviceOp> @@ -701,218 +701,263 @@ createDeallocateOp(FlatbufferObjectCache &cache, DeallocateOp op) { ::flatbuffers::Offset<::tt::target::ttnn::Operation> emitTTNNOperation(FlatbufferObjectCache &cache, Operation *op, - std::string const &debugString) { + std::string const &debugString, std::string const &locInfo) { if (auto getDeviceOp = dyn_cast(op); getDeviceOp) { - return createOperation(cache, createOp(cache, getDeviceOp), debugString); + return createOperation(cache, createOp(cache, getDeviceOp), debugString, 
+ locInfo); } if (auto toMemoryConfigOp = dyn_cast(op); toMemoryConfigOp) { return createOperation(cache, createOp(cache, toMemoryConfigOp), - debugString); + debugString, locInfo); } if (auto toLayoutOp = dyn_cast(op); toLayoutOp) { - return createOperation(cache, createOp(cache, toLayoutOp), debugString); + return createOperation(cache, createOp(cache, toLayoutOp), debugString, + locInfo); } if (auto typecastOp = dyn_cast(op); typecastOp) { - return createOperation(cache, createOp(cache, typecastOp), debugString); + return createOperation(cache, createOp(cache, typecastOp), debugString, + locInfo); } if (auto toDeviceOp = dyn_cast(op); toDeviceOp) { - return createOperation(cache, createOp(cache, toDeviceOp), debugString); + return createOperation(cache, createOp(cache, toDeviceOp), debugString, + locInfo); } if (auto fromDeviceOp = dyn_cast(op); fromDeviceOp) { - return createOperation(cache, createOp(cache, fromDeviceOp), debugString); + return createOperation(cache, createOp(cache, fromDeviceOp), debugString, + locInfo); } if (auto emptyOp = dyn_cast(op); emptyOp) { - return createOperation(cache, createOp(cache, emptyOp), debugString); + return createOperation(cache, createOp(cache, emptyOp), debugString, + locInfo); } if (auto fullOp = dyn_cast(op); fullOp) { - return createOperation(cache, createOp(cache, fullOp), debugString); + return createOperation(cache, createOp(cache, fullOp), debugString, + locInfo); } if (auto absOp = dyn_cast(op); absOp) { - return createOperation(cache, createEltwiseOp(cache, absOp), debugString); + return createOperation(cache, createEltwiseOp(cache, absOp), debugString, + locInfo); } if (auto addOp = dyn_cast(op); addOp) { - return createOperation(cache, createEltwiseOp(cache, addOp), debugString); + return createOperation(cache, createEltwiseOp(cache, addOp), debugString, + locInfo); } if (auto floorOp = dyn_cast(op); floorOp) { - return createOperation(cache, createEltwiseOp(cache, floorOp), debugString); + return createOperation(cache, createEltwiseOp(cache, floorOp), debugString, + locInfo); } if (auto isFiniteOp = dyn_cast(op); isFiniteOp) { return createOperation(cache, createEltwiseOp(cache, isFiniteOp), - debugString); + debugString, locInfo); } if (auto andOp = dyn_cast(op); andOp) { - return createOperation(cache, createEltwiseOp(cache, andOp), debugString); + return createOperation(cache, createEltwiseOp(cache, andOp), debugString, + locInfo); } if (auto cbrtOp = dyn_cast(op); cbrtOp) { - return createOperation(cache, createEltwiseOp(cache, cbrtOp), debugString); + return createOperation(cache, createEltwiseOp(cache, cbrtOp), debugString, + locInfo); } if (auto notOp = dyn_cast(op); notOp) { - return createOperation(cache, createEltwiseOp(cache, notOp), debugString); + return createOperation(cache, createEltwiseOp(cache, notOp), debugString, + locInfo); } if (auto orOp = dyn_cast(op); orOp) { - return createOperation(cache, createEltwiseOp(cache, orOp), debugString); + return createOperation(cache, createEltwiseOp(cache, orOp), debugString, + locInfo); } if (auto xorOp = dyn_cast(op); xorOp) { - return createOperation(cache, createEltwiseOp(cache, xorOp), debugString); + return createOperation(cache, createEltwiseOp(cache, xorOp), debugString, + locInfo); } if (auto multiplyOp = dyn_cast(op); multiplyOp) { return createOperation(cache, createEltwiseOp(cache, multiplyOp), - debugString); + debugString, locInfo); } if (auto negOp = dyn_cast(op); negOp) { - return createOperation(cache, createEltwiseOp(cache, negOp), debugString); + return 
createOperation(cache, createEltwiseOp(cache, negOp), debugString, + locInfo); } if (auto subtractOp = dyn_cast(op); subtractOp) { return createOperation(cache, createEltwiseOp(cache, subtractOp), - debugString); + debugString, locInfo); } if (auto eqOp = dyn_cast(op); eqOp) { - return createOperation(cache, createEltwiseOp(cache, eqOp), debugString); + return createOperation(cache, createEltwiseOp(cache, eqOp), debugString, + locInfo); } if (auto neOp = dyn_cast(op); neOp) { - return createOperation(cache, createEltwiseOp(cache, neOp), debugString); + return createOperation(cache, createEltwiseOp(cache, neOp), debugString, + locInfo); } if (auto geOp = dyn_cast(op); geOp) { - return createOperation(cache, createEltwiseOp(cache, geOp), debugString); + return createOperation(cache, createEltwiseOp(cache, geOp), debugString, + locInfo); } if (auto gtOp = dyn_cast(op); gtOp) { - return createOperation(cache, createEltwiseOp(cache, gtOp), debugString); + return createOperation(cache, createEltwiseOp(cache, gtOp), debugString, + locInfo); } if (auto leOp = dyn_cast(op); leOp) { - return createOperation(cache, createEltwiseOp(cache, leOp), debugString); + return createOperation(cache, createEltwiseOp(cache, leOp), debugString, + locInfo); } if (auto ltOp = dyn_cast(op); ltOp) { - return createOperation(cache, createEltwiseOp(cache, ltOp), debugString); + return createOperation(cache, createEltwiseOp(cache, ltOp), debugString, + locInfo); } if (auto maximumOp = dyn_cast(op); maximumOp) { return createOperation(cache, createEltwiseOp(cache, maximumOp), - debugString); + debugString, locInfo); } if (auto minimumOp = dyn_cast(op); minimumOp) { return createOperation(cache, createEltwiseOp(cache, minimumOp), - debugString); + debugString, locInfo); } if (auto reluOp = dyn_cast(op); reluOp) { - return createOperation(cache, createEltwiseOp(cache, reluOp), debugString); + return createOperation(cache, createEltwiseOp(cache, reluOp), debugString, + locInfo); } if (auto sqrtOp = dyn_cast(op); sqrtOp) { - return createOperation(cache, createEltwiseOp(cache, sqrtOp), debugString); + return createOperation(cache, createEltwiseOp(cache, sqrtOp), debugString, + locInfo); } if (auto rsqrtOp = dyn_cast(op); rsqrtOp) { - return createOperation(cache, createEltwiseOp(cache, rsqrtOp), debugString); + return createOperation(cache, createEltwiseOp(cache, rsqrtOp), debugString, + locInfo); } if (auto signOp = dyn_cast(op); signOp) { - return createOperation(cache, createEltwiseOp(cache, signOp), debugString); + return createOperation(cache, createEltwiseOp(cache, signOp), debugString, + locInfo); } if (auto expOp = dyn_cast(op); expOp) { - return createOperation(cache, createEltwiseOp(cache, expOp), debugString); + return createOperation(cache, createEltwiseOp(cache, expOp), debugString, + locInfo); } if (auto logOp = dyn_cast(op); logOp) { - return createOperation(cache, createEltwiseOp(cache, logOp), debugString); + return createOperation(cache, createEltwiseOp(cache, logOp), debugString, + locInfo); } if (auto expm1Op = dyn_cast(op); expm1Op) { - return createOperation(cache, createEltwiseOp(cache, expm1Op), debugString); + return createOperation(cache, createEltwiseOp(cache, expm1Op), debugString, + locInfo); } if (auto sigmoidOp = dyn_cast(op); sigmoidOp) { return createOperation(cache, createEltwiseOp(cache, sigmoidOp), - debugString); + debugString, locInfo); } if (auto log1pOp = dyn_cast(op); log1pOp) { - return createOperation(cache, createEltwiseOp(cache, log1pOp), debugString); + return 
createOperation(cache, createEltwiseOp(cache, log1pOp), debugString, + locInfo); } if (auto scatterOp = dyn_cast(op); scatterOp) { return createOperation(cache, createEltwiseOp(cache, scatterOp), - debugString); + debugString, locInfo); } if (auto reciprocalOp = dyn_cast(op); reciprocalOp) { return createOperation(cache, createEltwiseOp(cache, reciprocalOp), - debugString); + debugString, locInfo); } if (auto divOp = dyn_cast(op); divOp) { - return createOperation(cache, createEltwiseOp(cache, divOp), debugString); + return createOperation(cache, createEltwiseOp(cache, divOp), debugString, + locInfo); } if (auto remainderOp = dyn_cast(op); remainderOp) { return createOperation(cache, createEltwiseOp(cache, remainderOp), - debugString); + debugString, locInfo); } if (auto leakyReluOp = dyn_cast(op); leakyReluOp) { return createOperation(cache, createEltwiseOp(cache, leakyReluOp), - debugString); + debugString, locInfo); } if (auto linearOp = dyn_cast(op); linearOp) { - return createOperation(cache, createOp(cache, linearOp), debugString); + return createOperation(cache, createOp(cache, linearOp), debugString, + locInfo); } if (auto matmulOp = dyn_cast(op); matmulOp) { - return createOperation(cache, createOp(cache, matmulOp), debugString); + return createOperation(cache, createOp(cache, matmulOp), debugString, + locInfo); } if (auto sumOp = dyn_cast(op); sumOp) { - return createOperation(cache, createReductionOp(cache, sumOp), debugString); + return createOperation(cache, createReductionOp(cache, sumOp), debugString, + locInfo); } if (auto meanOp = dyn_cast(op); meanOp) { - return createOperation(cache, createReductionOp(cache, meanOp), - debugString); + return createOperation(cache, createReductionOp(cache, meanOp), debugString, + locInfo); } if (auto maxOp = dyn_cast(op); maxOp) { - return createOperation(cache, createReductionOp(cache, maxOp), debugString); + return createOperation(cache, createReductionOp(cache, maxOp), debugString, + locInfo); } if (auto embeddingOp = dyn_cast(op); embeddingOp) { return createOperation(cache, createEmbeddingOp(cache, embeddingOp), - debugString); + debugString, locInfo); } if (auto softmaxOp = dyn_cast(op); softmaxOp) { return createOperation(cache, createSoftmaxOp(cache, softmaxOp), - debugString); + debugString, locInfo); } if (auto transposeOp = dyn_cast(op); transposeOp) { return createOperation(cache, createTransposeOp(cache, transposeOp), - debugString); + debugString, locInfo); } if (auto clampOp = dyn_cast(op); clampOp) { return createOperation(cache, createNonDPSEltwiseOp(cache, clampOp), - debugString); + debugString, locInfo); } if (auto conv2dOp = dyn_cast(op); conv2dOp) { - return createOperation(cache, createOp(cache, conv2dOp), debugString); + return createOperation(cache, createOp(cache, conv2dOp), debugString, + locInfo); } if (auto allGatherOp = dyn_cast(op); allGatherOp) { - return createOperation(cache, createOp(cache, allGatherOp), debugString); + return createOperation(cache, createOp(cache, allGatherOp), debugString, + locInfo); } if (auto concatOp = dyn_cast(op); concatOp) { - return createOperation(cache, createConcatOp(cache, concatOp), debugString); + return createOperation(cache, createConcatOp(cache, concatOp), debugString, + locInfo); } if (auto reshapeOp = dyn_cast(op); reshapeOp) { return createOperation(cache, createReshapeOp(cache, reshapeOp), - debugString); + debugString, locInfo); } if (auto sliceOp = dyn_cast(op); sliceOp) { - return createOperation(cache, createSliceOp(cache, sliceOp), debugString); + return 
createOperation(cache, createSliceOp(cache, sliceOp), debugString, + locInfo); } if (auto max_pool2dOp = dyn_cast(op); max_pool2dOp) { return createOperation(cache, createMaxPool2dOp(cache, max_pool2dOp), - debugString); + debugString, locInfo); } if (auto deallocateOp = dyn_cast(op); deallocateOp) { return createOperation(cache, createDeallocateOp(cache, deallocateOp), - debugString); + debugString, locInfo); } if (auto ceilOp = dyn_cast(op); ceilOp) { - return createOperation(cache, createEltwiseOp(cache, ceilOp), debugString); + return createOperation(cache, createEltwiseOp(cache, ceilOp), debugString, + locInfo); } if (auto cosOp = dyn_cast(op); cosOp) { - return createOperation(cache, createEltwiseOp(cache, cosOp), debugString); + return createOperation(cache, createEltwiseOp(cache, cosOp), debugString, + locInfo); } if (auto sinOp = dyn_cast(op); sinOp) { - return createOperation(cache, createEltwiseOp(cache, sinOp), debugString); + return createOperation(cache, createEltwiseOp(cache, sinOp), debugString, + locInfo); } if (auto whereOp = dyn_cast(op); whereOp) { - return createOperation(cache, createEltwiseOp(cache, whereOp), debugString); + return createOperation(cache, createEltwiseOp(cache, whereOp), debugString, + locInfo); } if (auto geluOp = dyn_cast(op); geluOp) { - return createOperation(cache, createEltwiseOp(cache, geluOp), debugString); + return createOperation(cache, createEltwiseOp(cache, geluOp), debugString, + locInfo); } if (auto arangeOp = dyn_cast(op); arangeOp) { - return createOperation(cache, createOp(cache, arangeOp), debugString); + return createOperation(cache, createOp(cache, arangeOp), debugString, + locInfo); } llvm_unreachable("unhandled op in emitTTNNOperation"); diff --git a/runtime/include/tt/runtime/detail/ttmetal.h b/runtime/include/tt/runtime/detail/ttmetal.h index 7a68a7e944..5544e1d70f 100644 --- a/runtime/include/tt/runtime/detail/ttmetal.h +++ b/runtime/include/tt/runtime/detail/ttmetal.h @@ -47,6 +47,8 @@ void wait(Event event); std::string getOpDebugString(OpContext opContextHandle); +std::string getOpLocInfo(OpContext opContextHandle); + Tensor getOpOutputTensor(OpContext opContextHandle, CallbackContext programContextHandle); diff --git a/runtime/include/tt/runtime/detail/ttnn.h b/runtime/include/tt/runtime/detail/ttnn.h index 6c55ac1de7..67aa91a71e 100644 --- a/runtime/include/tt/runtime/detail/ttnn.h +++ b/runtime/include/tt/runtime/detail/ttnn.h @@ -83,6 +83,8 @@ void wait(Event event); std::string getOpDebugString(OpContext opContextHandle); +std::string getOpLocInfo(OpContext opContextHandle); + Tensor getOpOutputTensor(OpContext opContextHandle, CallbackContext programContextHandle); diff --git a/runtime/include/tt/runtime/runtime.h b/runtime/include/tt/runtime/runtime.h index 1dc721f662..e4348da608 100644 --- a/runtime/include/tt/runtime/runtime.h +++ b/runtime/include/tt/runtime/runtime.h @@ -71,6 +71,8 @@ void wait(Event event); std::string getOpDebugString(OpContext opContextHandle); +std::string getOpLocInfo(OpContext opContextHandle); + Tensor getOpOutputTensor(OpContext opContextHandle, CallbackContext programContextHandle); diff --git a/runtime/lib/runtime.cpp b/runtime/lib/runtime.cpp index 586b8394ea..a57ac3fcd3 100644 --- a/runtime/lib/runtime.cpp +++ b/runtime/lib/runtime.cpp @@ -261,6 +261,21 @@ std::string getOpDebugString(OpContext opContextHandle) { throw std::runtime_error("runtime is not enabled"); } +std::string getOpLocInfo(OpContext opContextHandle) { +#ifdef TT_RUNTIME_ENABLE_TTNN + if (getCurrentRuntime() == 
DeviceRuntime::TTNN) { + return ::tt::runtime::ttnn::getOpLocInfo(opContextHandle); + } +#endif + +#ifdef TT_RUNTIME_ENABLE_TTMETAL + if (getCurrentRuntime() == DeviceRuntime::TTMetal) { + return ::tt::runtime::ttmetal::getOpLocInfo(opContextHandle); + } +#endif + throw std::runtime_error("runtime is not enabled"); +} + Tensor getOpOutputTensor(OpContext opContextHandle, CallbackContext programContextHandle) { #if defined(TT_RUNTIME_ENABLE_TTNN) diff --git a/runtime/lib/ttmetal/runtime.cpp b/runtime/lib/ttmetal/runtime.cpp index ab343554ed..22d43ba366 100644 --- a/runtime/lib/ttmetal/runtime.cpp +++ b/runtime/lib/ttmetal/runtime.cpp @@ -262,6 +262,12 @@ std::string getOpDebugString(OpContext opContextHandle) { return ""; } +std::string getOpLocInfo(OpContext opContextHandle) { + // Not implemented + LOG_WARNING("obtaining op location info for metal runtime not implemented"); + return ""; +} + Tensor getOpOutputTensor(OpContext opContextHandle, CallbackContext programContextHandle) { // Not implemented diff --git a/runtime/lib/ttnn/program.cpp b/runtime/lib/ttnn/program.cpp index 48b0be7ff4..3aab3a94cd 100644 --- a/runtime/lib/ttnn/program.cpp +++ b/runtime/lib/ttnn/program.cpp @@ -33,9 +33,19 @@ #include "tt/runtime/utils.h" #include "ttmlir/Target/TTNN/program_generated.h" +#ifdef TT_RUNTIME_ENABLE_PERF_TRACE +#include "tracy/Tracy.hpp" +#endif + namespace tt::runtime::ttnn { using LogType = ::tt::runtime::logger::LogType; +void tracyLogOpLocation(const ::tt::target::ttnn::Operation *op) { +#ifdef TT_RUNTIME_ENABLE_PERF_TRACE + TracyMessage(op->loc_info()->c_str(), op->loc_info()->size()); +#endif +} + static ::tt::target::ttnn::TTNNBinary const *getBinary(Flatbuffer binary) { bool isTTNN = ::tt::target::ttnn::SizePrefixedTTNNBinaryBufferHasIdentifier( binary.handle.get()); @@ -74,6 +84,7 @@ class ProgramExecutor { for (const ::tt::target::ttnn::Operation *op : *program->operations()) { LOG_DEBUG(LogType::LogRuntimeTTNN, "Executing operation: ", op->debug_info()->c_str()); + tracyLogOpLocation(op); runOperation(op); runCallback(executableHandle, op, &context); } diff --git a/runtime/lib/ttnn/runtime.cpp b/runtime/lib/ttnn/runtime.cpp index 86fd2d25c6..2dfc077884 100644 --- a/runtime/lib/ttnn/runtime.cpp +++ b/runtime/lib/ttnn/runtime.cpp @@ -202,6 +202,12 @@ std::string getOpDebugString(OpContext opContextHandle) { return std::string(opContext.debug_info()->c_str()); } +std::string getOpLocInfo(OpContext opContextHandle) { + auto const &opContext = + opContextHandle.as<::tt::target::ttnn::Operation>(DeviceRuntime::TTNN); + return std::string(opContext.loc_info()->c_str()); +} + Tensor getOpOutputTensor(OpContext opContextHandle, CallbackContext programContextHandle) { auto const &programContext = diff --git a/runtime/tools/python/ttrt/common/perf.py b/runtime/tools/python/ttrt/common/perf.py index a341c2b4f4..f70defa313 100644 --- a/runtime/tools/python/ttrt/common/perf.py +++ b/runtime/tools/python/ttrt/common/perf.py @@ -17,11 +17,16 @@ import atexit import traceback from pathlib import Path +import csv from ttrt.common.util import * from ttrt.common.query import Query +def get_loc_data_hook(binary, programContext, opContext): + op_debug_str = ttrt.runtime.get_op_debug_str(opContext) + + class Perf: registered_args = {} @@ -456,6 +461,38 @@ def signal_handler(sig, frame): ) process_ops(None, None, False) + + # Add post-processing steps to insert location data into the ops_perf data file + with open(profiler_csv_file_path, "r") as perf_file: + perf_reader = csv.DictReader(perf_file) + 
headers = list(perf_reader.fieldnames) + ["LOC"] + perf_data = list(perf_reader) + + with open(profiler_csv_file_path, "w+") as perf_file, open( + tracy_ops_data_file_path, "r" + ) as message_file: + message_reader = csv.reader(message_file, delimiter=";") + ops_index = 0 + prev = None + for message in message_reader: + message = message[0] # Don't need timestamp information + if message.startswith("`"): + # This is a TTNN Message + # The location data is now in the previous message + # The order of data is maintained in perf_data so as the messages are received, they update the id last encountered. + # Now that we have a new message, we can update the location data from the previous message + if prev: + # Get the location data from the previous message and add it as new data for the perf_data (as a new col) + if len(perf_data) > ops_index: + perf_data[ops_index]["LOC"] = prev + ops_index += 1 + else: + prev = message + perf_writer = csv.DictWriter(perf_file, fieldnames=headers) + perf_writer.writeheader() + for row in perf_data: + perf_writer.writerow(row) + self.file_manager.copy_file( perf_folder_path, profiler_csv_file_path, diff --git a/runtime/tools/python/ttrt/runtime/module.cpp b/runtime/tools/python/ttrt/runtime/module.cpp index dfc4a68201..c0378727c0 100644 --- a/runtime/tools/python/ttrt/runtime/module.cpp +++ b/runtime/tools/python/ttrt/runtime/module.cpp @@ -100,6 +100,8 @@ PYBIND11_MODULE(_C, m) { "Get the input tensor of the op"); m.def("get_op_debug_str", &tt::runtime::getOpDebugString, "Get the debug string of the op"); + m.def("get_op_loc_info", &tt::runtime::getOpLocInfo, + "Get the location info of the op"); py::class_(m, "DebugEnv") .def_static("get", &tt::runtime::debug::Env::get) diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt index e033913e24..90173880d1 100644 --- a/third_party/CMakeLists.txt +++ b/third_party/CMakeLists.txt @@ -28,6 +28,7 @@ set(TTMETAL_INCLUDE_DIRS ${PROJECT_SOURCE_DIR}/third_party/tt-metal/src/tt-metal/tt_metal/hw/inc/${ARCH_EXTRA_DIR} ${PROJECT_SOURCE_DIR}/third_party/tt-metal/src/tt-metal/tt_metal/third_party/umd/src/firmware/riscv/${ARCH_NAME} ${PROJECT_SOURCE_DIR}/third_party/tt-metal/src/tt-metal/tt_eager + ${PROJECT_SOURCE_DIR}/third_party/tt-metal/src/tt-metal-build/include ${PROJECT_SOURCE_DIR}/third_party/tt-metal/src/tt-metal/.cpmcache/reflect/e75434c4c5f669e4a74e4d84e0a30d7249c1e66f ${PROJECT_SOURCE_DIR}/third_party/tt-metal/src/tt-metal/.cpmcache/nanomsg/28cc32d5bdb6a858fe53b3ccf7e923957e53eada/include ${PROJECT_SOURCE_DIR}/third_party/tt-metal/src/tt-metal/.cpmcache/fmt/73b5ec45edbd92babfd91c3777a9e1ab9cac8238/include From 0640c7c8366b9b44fd6e210eb7ca76d1c2ec4121 Mon Sep 17 00:00:00 2001 From: Vladimir Milosevic <157983820+vmilosevic@users.noreply.github.com> Date: Tue, 3 Dec 2024 03:01:53 +0100 Subject: [PATCH 41/84] Uplift third_party/tt-metal to ab3dc0c4f5c3ce9722261c878970bfa92a212fc9 2024-12-02 (#1439) * Uplift third_party/tt-metal to ab3dc0c4f5c3ce9722261c878970bfa92a212fc9 2024-12-02 * Linker and Include compile fixes (required with tt-metal uplift) - Create DEVICE_LIBRARY as libdevice.so and link against it everywhere we link against TTMETAL_LIBRARY (libtt_metal.so) - Update UMD include path for headers to include "api" folder in path --------- Co-authored-by: kmitrovicTT <169657397+kmitrovicTT@users.noreply.github.com> Co-authored-by: Kyle Mabee --- runtime/lib/ttmetal/CMakeLists.txt | 4 ++-- runtime/test/CMakeLists.txt | 1 + third_party/CMakeLists.txt | 12 +++++++----- 3 files changed, 10 
insertions(+), 7 deletions(-) diff --git a/runtime/lib/ttmetal/CMakeLists.txt b/runtime/lib/ttmetal/CMakeLists.txt index 3706d74333..f31fad2530 100644 --- a/runtime/lib/ttmetal/CMakeLists.txt +++ b/runtime/lib/ttmetal/CMakeLists.txt @@ -10,7 +10,7 @@ target_include_directories(TTRuntimeTTMetal PUBLIC ${PROJECT_BINARY_DIR}/include/ttmlir/Target/Common ) target_include_directories(TTRuntimeTTMetal SYSTEM PUBLIC "$") -target_link_libraries(TTRuntimeTTMetal PUBLIC TTMETAL_LIBRARY) -add_dependencies(TTRuntimeTTMetal TTMETAL_LIBRARY tt-metal FBS_GENERATION) +target_link_libraries(TTRuntimeTTMetal PUBLIC TTMETAL_LIBRARY DEVICE_LIBRARY) +add_dependencies(TTRuntimeTTMetal TTMETAL_LIBRARY DEVICE_LIBRARY tt-metal FBS_GENERATION) # Optionally compile profiling code and link tracy client for perf profiling. diff --git a/runtime/test/CMakeLists.txt b/runtime/test/CMakeLists.txt index 8a0d12ee33..e4a7adc406 100644 --- a/runtime/test/CMakeLists.txt +++ b/runtime/test/CMakeLists.txt @@ -37,6 +37,7 @@ target_include_directories(TTRuntimeTEST INTERFACE target_link_libraries(TTRuntimeTEST INTERFACE TTMETAL_LIBRARY + DEVICE_LIBRARY TTBinary TTRuntime TTRuntimeTTNN diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt index 90173880d1..bf28aebc9f 100644 --- a/third_party/CMakeLists.txt +++ b/third_party/CMakeLists.txt @@ -1,6 +1,6 @@ include(ExternalProject) -set(TT_METAL_VERSION "82ba2cbad64d1e36cad446d1f2f9bd266883ae74") +set(TT_METAL_VERSION "ab3dc0c4f5c3ce9722261c878970bfa92a212fc9") if ("$ENV{ARCH_NAME}" STREQUAL "grayskull") set(ARCH_NAME "grayskull") @@ -22,7 +22,7 @@ set(TTMETAL_INCLUDE_DIRS ${PROJECT_SOURCE_DIR}/third_party/tt-metal/src/tt-metal ${PROJECT_SOURCE_DIR}/third_party/tt-metal/src/tt-metal/tt_metal ${PROJECT_SOURCE_DIR}/third_party/tt-metal/src/tt-metal/tt_metal/third_party/umd - ${PROJECT_SOURCE_DIR}/third_party/tt-metal/src/tt-metal/tt_metal/third_party/umd/device + ${PROJECT_SOURCE_DIR}/third_party/tt-metal/src/tt-metal/tt_metal/third_party/umd/device/api ${PROJECT_SOURCE_DIR}/third_party/tt-metal/src/tt-metal/tt_metal/hw/inc ${PROJECT_SOURCE_DIR}/third_party/tt-metal/src/tt-metal/tt_metal/hw/inc/${ARCH_NAME} ${PROJECT_SOURCE_DIR}/third_party/tt-metal/src/tt-metal/tt_metal/hw/inc/${ARCH_EXTRA_DIR} @@ -40,6 +40,7 @@ set(TTMETAL_INCLUDE_DIRS set(TTMETAL_LIBRARY_DIR ${PROJECT_SOURCE_DIR}/third_party/tt-metal/src/tt-metal-build/lib) set(TTNN_LIBRARY_PATH ${TTMETAL_LIBRARY_DIR}/_ttnn.so) set(TTMETAL_LIBRARY_PATH ${TTMETAL_LIBRARY_DIR}/libtt_metal.so) +set(DEVICE_LIBRARY_PATH ${TTMETAL_LIBRARY_DIR}/libdevice.so) if (TT_RUNTIME_ENABLE_PERF_TRACE) set(TRACY_LIBRARY_PATH ${TTMETAL_LIBRARY_DIR}/libtracy.so) else() @@ -49,6 +50,7 @@ endif() set(TTMETAL_LIBRARY_DIR ${TTMETAL_LIBRARY_DIR} PARENT_SCOPE) set(TTNN_LIBRARY_PATH ${TTNN_LIBRARY_PATH} PARENT_SCOPE) set(TTMETAL_LIBRARY_PATH ${TTMETAL_LIBRARY_PATH} PARENT_SCOPE) +set(DEVICE_LIBRARY_PATH ${DEVICE_LIBRARY_PATH} PARENT_SCOPE) set(TRACY_LIBRARY_PATH ${TRACY_LIBRARY_PATH} PARENT_SCOPE) ExternalProject_Add( @@ -66,13 +68,13 @@ ExternalProject_Add( GIT_REPOSITORY https://github.com/tenstorrent/tt-metal.git GIT_TAG ${TT_METAL_VERSION} GIT_PROGRESS ON - BUILD_BYPRODUCTS ${TTNN_LIBRARY_PATH} ${TTMETAL_LIBRARY_PATH} ${TRACY_LIBRARY_PATH} + BUILD_BYPRODUCTS ${TTNN_LIBRARY_PATH} ${TTMETAL_LIBRARY_PATH} ${TRACY_LIBRARY_PATH} ${DEVICE_LIBRARY_PATH} ) set_target_properties(tt-metal PROPERTIES EXCLUDE_FROM_ALL TRUE) -list(APPEND library_names TTNN_LIBRARY TTMETAL_LIBRARY) -list(APPEND library_paths ${TTNN_LIBRARY_PATH} ${TTMETAL_LIBRARY_PATH}) 
+list(APPEND library_names TTNN_LIBRARY TTMETAL_LIBRARY DEVICE_LIBRARY) +list(APPEND library_paths ${TTNN_LIBRARY_PATH} ${TTMETAL_LIBRARY_PATH} ${DEVICE_LIBRARY_PATH}) if (TT_RUNTIME_ENABLE_PERF_TRACE) list(APPEND library_names TRACY_LIBRARY) From 37e10f39eae5ca7f7017d071be05fe116f3dec08 Mon Sep 17 00:00:00 2001 From: Stefan Gligorijevic <189116645+sgligorijevicTT@users.noreply.github.com> Date: Tue, 3 Dec 2024 14:37:20 +0100 Subject: [PATCH 42/84] Add TOSA to TTIR conversions for some simple ops (#1445) * Add Tosa conversion for sin * Add Tosa conversion for sigmoid * Add Tosa conversion for reciprocal * Add Tosa conversion for rsqrt * Add Tosa conversion for where * Add Tosa conversion for maximum * Add Tosa conversion for minimum * Add neg and sub tests * Refactor tests * Improve tests; fix whitespace * Add return checks to tests --- .../TosaToTTIR/TosaToTTIRPatterns.cpp | 20 +++++++++++++++++++ .../elementwise_binary/maximum_op.mlir | 10 ++++++++++ .../elementwise_binary/minimum_op.mlir | 10 ++++++++++ .../elementwise_binary/subtract_op.mlir | 10 ++++++++++ .../elementwise_ternary/select_op.mlir | 11 ++++++++++ .../elementwise_unary/negate_op.mlir | 10 ++++++++++ .../elementwise_unary/reciprocal_op.mlir | 10 ++++++++++ .../elementwise_unary/rsqrt_op.mlir | 10 ++++++++++ .../elementwise_unary/sigmoid_op.mlir | 10 ++++++++++ .../TosaToTTIR/elementwise_unary/sin_op.mlir | 10 ++++++++++ 10 files changed, 111 insertions(+) create mode 100644 test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/maximum_op.mlir create mode 100644 test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/minimum_op.mlir create mode 100644 test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/subtract_op.mlir create mode 100644 test/ttmlir/Conversion/TosaToTTIR/elementwise_ternary/select_op.mlir create mode 100644 test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/negate_op.mlir create mode 100644 test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/reciprocal_op.mlir create mode 100644 test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/rsqrt_op.mlir create mode 100644 test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/sigmoid_op.mlir create mode 100644 test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/sin_op.mlir diff --git a/lib/Conversion/TosaToTTIR/TosaToTTIRPatterns.cpp b/lib/Conversion/TosaToTTIR/TosaToTTIRPatterns.cpp index 46eadb7899..b864a111a6 100644 --- a/lib/Conversion/TosaToTTIR/TosaToTTIRPatterns.cpp +++ b/lib/Conversion/TosaToTTIR/TosaToTTIRPatterns.cpp @@ -91,6 +91,15 @@ void addElementwiseUnaryOpsConversionPatterns(MLIRContext *ctx, patterns.add>( typeConverter, ctx); + patterns.add>( + typeConverter, ctx); + patterns.add>(typeConverter, ctx); + patterns.add>(typeConverter, ctx); + patterns.add>(typeConverter, ctx); } void addElementwiseBinaryOpsConversionPatterns(MLIRContext *ctx, @@ -102,6 +111,10 @@ void addElementwiseBinaryOpsConversionPatterns(MLIRContext *ctx, patterns.add(typeConverter, ctx); patterns.add>(typeConverter, ctx); + patterns.add>(typeConverter, ctx); + patterns.add>(typeConverter, ctx); } void addCompareOpsConversionPatterns(MLIRContext *ctx, @@ -112,6 +125,12 @@ void addCompareOpsConversionPatterns(MLIRContext *ctx, ctx); } +void addElementwiseTernaryOpsConversionPatterns(MLIRContext *ctx, + RewritePatternSet &patterns, + TypeConverter &typeConverter) { + patterns.add>(typeConverter, ctx); +} } // namespace namespace mlir::tt { @@ -120,6 +139,7 @@ void populateTosaToTTIRPatterns(MLIRContext *ctx, RewritePatternSet &patterns, TypeConverter &typeConverter) { 
addElementwiseUnaryOpsConversionPatterns(ctx, patterns, typeConverter); addElementwiseBinaryOpsConversionPatterns(ctx, patterns, typeConverter); + addElementwiseTernaryOpsConversionPatterns(ctx, patterns, typeConverter); addCompareOpsConversionPatterns(ctx, patterns, typeConverter); } diff --git a/test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/maximum_op.mlir b/test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/maximum_op.mlir new file mode 100644 index 0000000000..66691e2f07 --- /dev/null +++ b/test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/maximum_op.mlir @@ -0,0 +1,10 @@ +// RUN: ttmlir-opt --convert-tosa-to-ttir %s | FileCheck %s +module attributes {} { + func.func @test_maximum(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + %0 = tosa.maximum %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + // CHECK: %[[OP_OUT:[0-9]+]] = tensor.empty() : [[TENSOR_SIZE:tensor<[0-9]+x[0-9]+x[0-9]+xf[0-9]+>]] + // CHECK: %[[VAL:[0-9]+]] = "ttir.maximum"(%arg{{[0-9]+}}, %arg{{[0-9]+}}, %[[OP_OUT]]){{.+}} : ([[TENSOR_SIZE]], [[TENSOR_SIZE]], [[TENSOR_SIZE]]) -> [[TENSOR_SIZE]] + // CHECK: return %[[VAL]] : [[TENSOR_SIZE]] + return %0 : tensor<13x21x3xf32> + } +} diff --git a/test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/minimum_op.mlir b/test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/minimum_op.mlir new file mode 100644 index 0000000000..7bfb100927 --- /dev/null +++ b/test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/minimum_op.mlir @@ -0,0 +1,10 @@ +// RUN: ttmlir-opt --convert-tosa-to-ttir %s | FileCheck %s +module attributes {} { + func.func @test_minimum(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + %0 = tosa.minimum %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + // CHECK: %[[OP_OUT:[0-9]+]] = tensor.empty() : [[TENSOR_SIZE:tensor<[0-9]+x[0-9]+x[0-9]+xf[0-9]+>]] + // CHECK: %[[VAL:[0-9]+]] = "ttir.minimum"(%arg{{[0-9]+}}, %arg{{[0-9]+}}, %[[OP_OUT]]){{.+}} : ([[TENSOR_SIZE]], [[TENSOR_SIZE]], [[TENSOR_SIZE]]) -> [[TENSOR_SIZE]] + // CHECK: return %[[VAL]] : [[TENSOR_SIZE]] + return %0 : tensor<13x21x3xf32> + } +} diff --git a/test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/subtract_op.mlir b/test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/subtract_op.mlir new file mode 100644 index 0000000000..5f8f5bf849 --- /dev/null +++ b/test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/subtract_op.mlir @@ -0,0 +1,10 @@ +// RUN: ttmlir-opt --convert-tosa-to-ttir %s | FileCheck %s +module attributes {} { + func.func @test_sub(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + %0 = tosa.sub %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + // CHECK: %[[OP_OUT:[0-9]+]] = tensor.empty() : [[TENSOR_SIZE:tensor<[0-9]+x[0-9]+x[0-9]+xf[0-9]+>]] + // CHECK: %[[VAL:[0-9]+]] = "ttir.subtract"(%arg{{[0-9]+}}, %arg{{[0-9]+}}, %[[OP_OUT]]){{.+}} : ([[TENSOR_SIZE]], [[TENSOR_SIZE]], [[TENSOR_SIZE]]) -> [[TENSOR_SIZE]] + // CHECK: return %[[VAL]] : [[TENSOR_SIZE]] + return %0 : tensor<13x21x3xf32> + } +} diff --git a/test/ttmlir/Conversion/TosaToTTIR/elementwise_ternary/select_op.mlir b/test/ttmlir/Conversion/TosaToTTIR/elementwise_ternary/select_op.mlir new file mode 100644 index 0000000000..2e02be5ebf --- /dev/null +++ b/test/ttmlir/Conversion/TosaToTTIR/elementwise_ternary/select_op.mlir @@ -0,0 +1,11 @@ +// RUN: ttmlir-opt --convert-tosa-to-ttir %s | FileCheck %s +module attributes 
{} { + func.func @test_select(%arg0: tensor<32x128xi1>, %arg1: tensor<32x128xf32>, %arg2: tensor<32x128xf32>) -> tensor<32x128xf32> { + // CHECK: func.func {{.+}} [[SELECTOR:tensor<[0-9]+x[0-9]+xi1>]] + %0 = tosa.select %arg0, %arg1, %arg2 : (tensor<32x128xi1>, tensor<32x128xf32>, tensor<32x128xf32>) -> tensor<32x128xf32> + // CHECK: %[[OP_OUT:[0-9]+]] = tensor.empty() : [[TENSOR_SIZE:tensor<[0-9]+x[0-9]+xf[0-9]+>]] + // CHECK: %[[VAL:[0-9]+]] = "ttir.where"(%arg{{[0-9]+}}, %arg{{[0-9]+}}, %arg{{[0-9]+}}, %[[OP_OUT]]){{.+}} : ([[SELECTOR]], [[TENSOR_SIZE]], [[TENSOR_SIZE]], [[TENSOR_SIZE]]) -> [[TENSOR_SIZE]] + // CHECK: return %[[VAL]] : [[TENSOR_SIZE]] + return %0 : tensor<32x128xf32> + } +} diff --git a/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/negate_op.mlir b/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/negate_op.mlir new file mode 100644 index 0000000000..d1c294a848 --- /dev/null +++ b/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/negate_op.mlir @@ -0,0 +1,10 @@ +// RUN: ttmlir-opt --convert-tosa-to-ttir %s | FileCheck %s +module attributes {} { + func.func @test_negate(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + %0 = tosa.negate %arg0 : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + // CHECK: %[[OP_OUT:[0-9]+]] = tensor.empty() : [[TENSOR_SIZE:tensor<[0-9]+x[0-9]+x[0-9]+xf[0-9]+>]] + // CHECK: %[[VAL:[0-9]+]] = "ttir.neg"(%arg{{[0-9]+}}, %[[OP_OUT]]){{.+}} : ([[TENSOR_SIZE]], [[TENSOR_SIZE]]) -> [[TENSOR_SIZE]] + // CHECK: return %[[VAL]] : [[TENSOR_SIZE]] + return %0 : tensor<13x21x3xf32> + } +} diff --git a/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/reciprocal_op.mlir b/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/reciprocal_op.mlir new file mode 100644 index 0000000000..ee3251eb63 --- /dev/null +++ b/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/reciprocal_op.mlir @@ -0,0 +1,10 @@ +// RUN: ttmlir-opt --convert-tosa-to-ttir %s | FileCheck %s +module attributes {} { + func.func @test_reciprocal(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + %0 = tosa.reciprocal %arg0 : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + // CHECK: %[[OP_OUT:[0-9]+]] = tensor.empty() : [[TENSOR_SIZE:tensor<[0-9]+x[0-9]+x[0-9]+xf[0-9]+>]] + // CHECK: %[[VAL:[0-9]+]] = "ttir.reciprocal"(%arg{{[0-9]+}}, %[[OP_OUT]]){{.+}} : ([[TENSOR_SIZE]], [[TENSOR_SIZE]]) -> [[TENSOR_SIZE]] + // CHECK: return %[[VAL]] : [[TENSOR_SIZE]] + return %0 : tensor<13x21x3xf32> + } +} diff --git a/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/rsqrt_op.mlir b/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/rsqrt_op.mlir new file mode 100644 index 0000000000..2475ffacd5 --- /dev/null +++ b/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/rsqrt_op.mlir @@ -0,0 +1,10 @@ +// RUN: ttmlir-opt --convert-tosa-to-ttir %s | FileCheck %s +module attributes {} { + func.func @test_rsqrt(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + %0 = tosa.rsqrt %arg0 : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + // CHECK: %[[OP_OUT:[0-9]+]] = tensor.empty() : [[TENSOR_SIZE:tensor<[0-9]+x[0-9]+x[0-9]+xf[0-9]+>]] + // CHECK: %[[VAL:[0-9]+]] = "ttir.rsqrt"(%arg{{[0-9]+}}, %[[OP_OUT]]){{.+}} : ([[TENSOR_SIZE]], [[TENSOR_SIZE]]) -> [[TENSOR_SIZE]] + // CHECK: return %[[VAL]] : [[TENSOR_SIZE]] + return %0 : tensor<13x21x3xf32> + } +} diff --git a/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/sigmoid_op.mlir b/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/sigmoid_op.mlir new file mode 100644 index 0000000000..18453f71aa --- /dev/null +++ 
b/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/sigmoid_op.mlir @@ -0,0 +1,10 @@ +// RUN: ttmlir-opt --convert-tosa-to-ttir %s | FileCheck %s +module attributes {} { + func.func @test_sigmoid(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + %0 = tosa.sigmoid %arg0 : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + // CHECK: %[[OP_OUT:[0-9]+]] = tensor.empty() : [[TENSOR_SIZE:tensor<[0-9]+x[0-9]+x[0-9]+xf[0-9]+>]] + // CHECK: %[[VAL:[0-9]+]] = "ttir.sigmoid"(%arg{{[0-9]+}}, %[[OP_OUT]]){{.+}} : ([[TENSOR_SIZE]], [[TENSOR_SIZE]]) -> [[TENSOR_SIZE]] + // CHECK: return %[[VAL]] : [[TENSOR_SIZE]] + return %0 : tensor<13x21x3xf32> + } +} diff --git a/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/sin_op.mlir b/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/sin_op.mlir new file mode 100644 index 0000000000..017e9f366c --- /dev/null +++ b/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/sin_op.mlir @@ -0,0 +1,10 @@ +// RUN: ttmlir-opt --convert-tosa-to-ttir %s | FileCheck %s +module attributes {} { + func.func @test_sin(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + %0 = tosa.sin %arg0 : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + // CHECK: %[[OP_OUT:[0-9]+]] = tensor.empty() : [[TENSOR_SIZE:tensor<[0-9]+x[0-9]+x[0-9]+xf[0-9]+>]] + // CHECK: %[[VAL:[0-9]+]] = "ttir.sin"(%arg{{[0-9]+}}, %[[OP_OUT]]){{.+}} : ([[TENSOR_SIZE]], [[TENSOR_SIZE]]) -> [[TENSOR_SIZE]] + // CHECK: return %[[VAL]] : [[TENSOR_SIZE]] + return %0 : tensor<13x21x3xf32> + } +} From d95caff8faaf09cda17dda372eb4f9362176e48b Mon Sep 17 00:00:00 2001 From: Sanja Djukic Date: Tue, 3 Dec 2024 17:53:13 +0100 Subject: [PATCH 43/84] TOSA to TTIR: adding simple op conversion patterns (#1443) * adding simple op patterns and corresponding tests * made test checks more consistent, added return check to tests, removed duplicate line in TOSAToTTIRPatterns.cpp * made test names consistent --- .../TosaToTTIR/TosaToTTIRPatterns.cpp | 58 ++++++++++++++----- .../Conversion/TosaToTTIR/compare/equal.mlir | 10 ++++ .../TosaToTTIR/compare/greater.mlir | 10 ++++ .../TosaToTTIR/compare/greater_equal.mlir | 10 ++++ .../TosaToTTIR/elementwise_binary/add.mlir | 10 ++++ .../{maximum_op.mlir => maximum.mlir} | 0 .../{minimum_op.mlir => minimum.mlir} | 0 .../TosaToTTIR/elementwise_binary/mul.mlir} | 6 +- .../{subtract_op.mlir => sub.mlir} | 0 .../{select_op.mlir => select.mlir} | 0 .../TosaToTTIR/elementwise_unary/abs.mlir | 10 ++++ .../TosaToTTIR/elementwise_unary/cast.mlir | 10 ++++ .../TosaToTTIR/elementwise_unary/ceil.mlir | 10 ++++ .../TosaToTTIR/elementwise_unary/cos.mlir | 10 ++++ .../TosaToTTIR/elementwise_unary/exp.mlir | 10 ++++ .../TosaToTTIR/elementwise_unary/floor.mlir | 10 ++++ .../{negate_op.mlir => negate.mlir} | 0 .../{reciprocal_op.mlir => reciprocal.mlir} | 0 .../{rsqrt_op.mlir => rsqrt.mlir} | 0 .../{sigmoid_op.mlir => sigmoid.mlir} | 0 .../{sin_op.mlir => sin.mlir} | 0 .../TosaToTTIR/logical/logical_and.mlir | 10 ++++ .../TosaToTTIR/logical/logical_not.mlir | 10 ++++ .../TosaToTTIR/logical/logical_or.mlir | 10 ++++ .../TosaToTTIR/logical/logical_xor.mlir | 10 ++++ 25 files changed, 188 insertions(+), 16 deletions(-) create mode 100644 test/ttmlir/Conversion/TosaToTTIR/compare/equal.mlir create mode 100644 test/ttmlir/Conversion/TosaToTTIR/compare/greater.mlir create mode 100644 test/ttmlir/Conversion/TosaToTTIR/compare/greater_equal.mlir create mode 100644 test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/add.mlir rename test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/{maximum_op.mlir => 
maximum.mlir} (100%) rename test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/{minimum_op.mlir => minimum.mlir} (100%) rename test/ttmlir/{Dialect/TTIR/tosa_to_ttir_multiply.mlir => Conversion/TosaToTTIR/elementwise_binary/mul.mlir} (53%) rename test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/{subtract_op.mlir => sub.mlir} (100%) rename test/ttmlir/Conversion/TosaToTTIR/elementwise_ternary/{select_op.mlir => select.mlir} (100%) create mode 100644 test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/abs.mlir create mode 100644 test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/cast.mlir create mode 100644 test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/ceil.mlir create mode 100644 test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/cos.mlir create mode 100644 test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/exp.mlir create mode 100644 test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/floor.mlir rename test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/{negate_op.mlir => negate.mlir} (100%) rename test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/{reciprocal_op.mlir => reciprocal.mlir} (100%) rename test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/{rsqrt_op.mlir => rsqrt.mlir} (100%) rename test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/{sigmoid_op.mlir => sigmoid.mlir} (100%) rename test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/{sin_op.mlir => sin.mlir} (100%) create mode 100644 test/ttmlir/Conversion/TosaToTTIR/logical/logical_and.mlir create mode 100644 test/ttmlir/Conversion/TosaToTTIR/logical/logical_not.mlir create mode 100644 test/ttmlir/Conversion/TosaToTTIR/logical/logical_or.mlir create mode 100644 test/ttmlir/Conversion/TosaToTTIR/logical/logical_xor.mlir diff --git a/lib/Conversion/TosaToTTIR/TosaToTTIRPatterns.cpp b/lib/Conversion/TosaToTTIR/TosaToTTIRPatterns.cpp index b864a111a6..7ab3ed5d27 100644 --- a/lib/Conversion/TosaToTTIR/TosaToTTIRPatterns.cpp +++ b/lib/Conversion/TosaToTTIR/TosaToTTIRPatterns.cpp @@ -88,18 +88,31 @@ void addElementwiseUnaryOpsConversionPatterns(MLIRContext *ctx, patterns.add>( typeConverter, ctx); - patterns.add>( + patterns.add>(typeConverter, ctx); + patterns.add>( typeConverter, ctx); - patterns.add>( + patterns.add>( + typeConverter, ctx); + patterns.add>( typeConverter, ctx); patterns.add>(typeConverter, ctx); + tosa::FloorOp, mlir::tt::ttir::FloorOp>>(typeConverter, ctx); + patterns.add>( + typeConverter, ctx); patterns.add>(typeConverter, ctx); patterns.add>(typeConverter, ctx); + patterns.add>(typeConverter, ctx); + patterns.add>( + typeConverter, ctx); } void addElementwiseBinaryOpsConversionPatterns(MLIRContext *ctx, @@ -108,29 +121,47 @@ void addElementwiseBinaryOpsConversionPatterns(MLIRContext *ctx, patterns.add>( typeConverter, ctx); + patterns.add>(typeConverter, ctx); + patterns.add>(typeConverter, ctx); patterns.add(typeConverter, ctx); patterns.add>(typeConverter, ctx); +} + +void addElementwiseTernaryOpsConversionPatterns(MLIRContext *ctx, + RewritePatternSet &patterns, + TypeConverter &typeConverter) { patterns.add>(typeConverter, ctx); + tosa::SelectOp, mlir::tt::ttir::WhereOp>>(typeConverter, ctx); +} + +void addLogicalOpsConversionPatterns(MLIRContext *ctx, + RewritePatternSet &patterns, + TypeConverter &typeConverter) { patterns.add>(typeConverter, ctx); + tosa::LogicalAndOp, mlir::tt::ttir::LogicalAndOp>>(typeConverter, ctx); + patterns.add>(typeConverter, ctx); + patterns.add>(typeConverter, ctx); + patterns.add>(typeConverter, ctx); } void addCompareOpsConversionPatterns(MLIRContext *ctx, 
RewritePatternSet &patterns, TypeConverter &typeConverter) { + patterns.add>(typeConverter, ctx); patterns.add>(typeConverter, ctx); -} - -void addElementwiseTernaryOpsConversionPatterns(MLIRContext *ctx, - RewritePatternSet &patterns, - TypeConverter &typeConverter) { patterns.add>(typeConverter, ctx); + tosa::GreaterOp, mlir::tt::ttir::GreaterThanOp>>(typeConverter, ctx); } + } // namespace namespace mlir::tt { @@ -140,6 +171,7 @@ void populateTosaToTTIRPatterns(MLIRContext *ctx, RewritePatternSet &patterns, addElementwiseUnaryOpsConversionPatterns(ctx, patterns, typeConverter); addElementwiseBinaryOpsConversionPatterns(ctx, patterns, typeConverter); addElementwiseTernaryOpsConversionPatterns(ctx, patterns, typeConverter); + addLogicalOpsConversionPatterns(ctx, patterns, typeConverter); addCompareOpsConversionPatterns(ctx, patterns, typeConverter); } diff --git a/test/ttmlir/Conversion/TosaToTTIR/compare/equal.mlir b/test/ttmlir/Conversion/TosaToTTIR/compare/equal.mlir new file mode 100644 index 0000000000..20387a6f1a --- /dev/null +++ b/test/ttmlir/Conversion/TosaToTTIR/compare/equal.mlir @@ -0,0 +1,10 @@ +// RUN: ttmlir-opt --convert-tosa-to-ttir %s | FileCheck %s +module attributes {} { + func.func @test_equal(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>) -> tensor<13x21x3xi1> { + %0 = tosa.equal %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<13x21x3xf32>) -> tensor<13x21x3xi1> + // CHECK: [[VAL0:%[0-9]+]] = tensor.empty() : [[TENSOR_SIZE:tensor<13x21x3xi1>]] + // CHECK: [[VAL1:%[0-9]+]] = "ttir.eq"(%arg{{[0-9]+}}, %arg{{[0-9]+}}, [[VAL0]]){{.+}}: (tensor<13x21x3xf32>, tensor<13x21x3xf32>, [[TENSOR_SIZE]]) -> [[TENSOR_SIZE]] + return %0 : tensor<13x21x3xi1> + // CHECK: return [[VAL1]] : [[TENSOR_SIZE]] + } +} diff --git a/test/ttmlir/Conversion/TosaToTTIR/compare/greater.mlir b/test/ttmlir/Conversion/TosaToTTIR/compare/greater.mlir new file mode 100644 index 0000000000..7487492997 --- /dev/null +++ b/test/ttmlir/Conversion/TosaToTTIR/compare/greater.mlir @@ -0,0 +1,10 @@ +// RUN: ttmlir-opt --convert-tosa-to-ttir %s | FileCheck %s +module attributes {} { + func.func @test_greater(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>) -> tensor<13x21x3xi1> { + %0 = tosa.greater %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<13x21x3xf32>) -> tensor<13x21x3xi1> + // CHECK: [[VAL0:%[0-9]+]] = tensor.empty() : [[TENSOR_SIZE:tensor<13x21x3xi1>]] + // CHECK: [[VAL1:%[0-9]+]] = "ttir.gt"(%arg{{[0-9]+}}, %arg{{[0-9]+}}, [[VAL0]]){{.+}}: (tensor<13x21x3xf32>, tensor<13x21x3xf32>, [[TENSOR_SIZE]]) -> [[TENSOR_SIZE]] + return %0 : tensor<13x21x3xi1> + // CHECK: return [[VAL1]] : [[TENSOR_SIZE]] + } +} diff --git a/test/ttmlir/Conversion/TosaToTTIR/compare/greater_equal.mlir b/test/ttmlir/Conversion/TosaToTTIR/compare/greater_equal.mlir new file mode 100644 index 0000000000..479af38156 --- /dev/null +++ b/test/ttmlir/Conversion/TosaToTTIR/compare/greater_equal.mlir @@ -0,0 +1,10 @@ +// RUN: ttmlir-opt --convert-tosa-to-ttir %s | FileCheck %s +module attributes {} { + func.func @test_greater_equal(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>) -> tensor<13x21x3xi1> { + %0 = tosa.greater_equal %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<13x21x3xf32>) -> tensor<13x21x3xi1> + // CHECK: [[VAL0:%[0-9]+]] = tensor.empty() : [[TENSOR_SIZE:tensor<13x21x3xi1>]] + // CHECK: [[VAL1:%[0-9]+]] = "ttir.ge"(%arg{{[0-9]+}}, %arg{{[0-9]+}}, [[VAL0]]){{.+}}: (tensor<13x21x3xf32>, tensor<13x21x3xf32>, [[TENSOR_SIZE]]) -> [[TENSOR_SIZE]] + return %0 : tensor<13x21x3xi1> + // CHECK: return [[VAL1]] : 
[[TENSOR_SIZE]] + } +} diff --git a/test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/add.mlir b/test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/add.mlir new file mode 100644 index 0000000000..b16e8e40ce --- /dev/null +++ b/test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/add.mlir @@ -0,0 +1,10 @@ +// RUN: ttmlir-opt --convert-tosa-to-ttir %s | FileCheck %s +module attributes {} { + func.func @test_add(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + %0 = tosa.add %arg0, %arg1 {shift = 0 : i8} : (tensor<13x21x3xf32>, tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + // CHECK: [[VAL0:%[0-9]+]] = tensor.empty() : [[TENSOR_SIZE:tensor<13x21x3xf32>]] + // CHECK: [[VAL1:%[0-9]+]] = "ttir.add"(%arg{{[0-9]+}}, %arg{{[0-9]+}}, [[VAL0]]){{.+}}: ([[TENSOR_SIZE]], [[TENSOR_SIZE]], [[TENSOR_SIZE]]) -> [[TENSOR_SIZE]] + return %0 : tensor<13x21x3xf32> + // CHECK: return [[VAL1]] : [[TENSOR_SIZE]] + } +} diff --git a/test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/maximum_op.mlir b/test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/maximum.mlir similarity index 100% rename from test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/maximum_op.mlir rename to test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/maximum.mlir diff --git a/test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/minimum_op.mlir b/test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/minimum.mlir similarity index 100% rename from test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/minimum_op.mlir rename to test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/minimum.mlir diff --git a/test/ttmlir/Dialect/TTIR/tosa_to_ttir_multiply.mlir b/test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/mul.mlir similarity index 53% rename from test/ttmlir/Dialect/TTIR/tosa_to_ttir_multiply.mlir rename to test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/mul.mlir index fd35f0cd10..137939fcf8 100644 --- a/test/ttmlir/Dialect/TTIR/tosa_to_ttir_multiply.mlir +++ b/test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/mul.mlir @@ -1,10 +1,10 @@ // RUN: ttmlir-opt --convert-tosa-to-ttir %s | FileCheck %s -#any_device = #tt.operand_constraint module attributes {} { func.func @test_mul(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { %0 = tosa.mul %arg0, %arg1 {shift = 0 : i8} : (tensor<13x21x3xf32>, tensor<13x21x3xf32>) -> tensor<13x21x3xf32> - // CHECK: %[[C:.*]] = tensor.empty[[C:.*]] - // CHECK: %[[C:.*]] = "ttir.multiply"[[C:.*]] + // CHECK: [[VAL0:%[0-9]+]] = tensor.empty() : [[TENSOR_SIZE:tensor<13x21x3xf32>]] + // CHECK: [[VAL1:%[0-9]+]] = "ttir.multiply"(%arg{{[0-9]+}}, %arg{{[0-9]+}}, [[VAL0]]){{.+}}: ([[TENSOR_SIZE]], [[TENSOR_SIZE]], [[TENSOR_SIZE]]) -> [[TENSOR_SIZE]] return %0 : tensor<13x21x3xf32> + // CHECK: return [[VAL1]] : [[TENSOR_SIZE]] } } diff --git a/test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/subtract_op.mlir b/test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/sub.mlir similarity index 100% rename from test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/subtract_op.mlir rename to test/ttmlir/Conversion/TosaToTTIR/elementwise_binary/sub.mlir diff --git a/test/ttmlir/Conversion/TosaToTTIR/elementwise_ternary/select_op.mlir b/test/ttmlir/Conversion/TosaToTTIR/elementwise_ternary/select.mlir similarity index 100% rename from test/ttmlir/Conversion/TosaToTTIR/elementwise_ternary/select_op.mlir rename to test/ttmlir/Conversion/TosaToTTIR/elementwise_ternary/select.mlir diff --git 
a/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/abs.mlir b/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/abs.mlir new file mode 100644 index 0000000000..9df5a2828b --- /dev/null +++ b/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/abs.mlir @@ -0,0 +1,10 @@ +// RUN: ttmlir-opt --convert-tosa-to-ttir %s | FileCheck %s +module attributes {} { + func.func @test_abs(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + %0 = tosa.abs %arg0 : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + // CHECK: [[VAL0:%[0-9]+]] = tensor.empty() : [[TENSOR_SIZE:tensor<13x21x3xf32>]] + // CHECK: [[VAL1:%[0-9]+]] = "ttir.abs"(%arg{{[0-9]+}}, [[VAL0]]){{.+}}: ([[TENSOR_SIZE]], [[TENSOR_SIZE]]) -> [[TENSOR_SIZE]] + return %0 : tensor<13x21x3xf32> + // CHECK: return [[VAL1]] : [[TENSOR_SIZE]] + } +} diff --git a/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/cast.mlir b/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/cast.mlir new file mode 100644 index 0000000000..4ee3a742b6 --- /dev/null +++ b/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/cast.mlir @@ -0,0 +1,10 @@ +// RUN: ttmlir-opt --convert-tosa-to-ttir %s | FileCheck %s +module attributes {} { + func.func @test_cast(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xbf16> { + %0 = tosa.cast %arg0 : (tensor<13x21x3xf32>) -> tensor<13x21x3xbf16> + // CHECK: [[VAL0:%[0-9]+]] = tensor.empty() : [[TENSOR_SIZE:tensor<13x21x3xbf16>]] + // CHECK: [[VAL1:%[0-9]+]] = "ttir.typecast"(%arg{{[0-9]+}}, [[VAL0]]){{.+}}: (tensor<13x21x3xf32>, [[TENSOR_SIZE]]) -> [[TENSOR_SIZE]] + return %0 : tensor<13x21x3xbf16> + // CHECK: return [[VAL1]] : [[TENSOR_SIZE]] + } +} diff --git a/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/ceil.mlir b/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/ceil.mlir new file mode 100644 index 0000000000..77dc60dc30 --- /dev/null +++ b/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/ceil.mlir @@ -0,0 +1,10 @@ +// RUN: ttmlir-opt --convert-tosa-to-ttir %s | FileCheck %s +module attributes {} { + func.func @test_ceil(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + %0 = tosa.ceil %arg0 : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + // CHECK: [[VAL0:%[0-9]+]] = tensor.empty() : [[TENSOR_SIZE:tensor<13x21x3xf32>]] + // CHECK: [[VAL1:%[0-9]+]] = "ttir.ceil"(%arg{{[0-9]+}}, [[VAL0]]){{.+}}: ([[TENSOR_SIZE]], [[TENSOR_SIZE]]) -> [[TENSOR_SIZE]] + return %0 : tensor<13x21x3xf32> + // CHECK: return [[VAL1]] : [[TENSOR_SIZE]] + } +} diff --git a/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/cos.mlir b/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/cos.mlir new file mode 100644 index 0000000000..1a8aafd6b0 --- /dev/null +++ b/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/cos.mlir @@ -0,0 +1,10 @@ +// RUN: ttmlir-opt --convert-tosa-to-ttir %s | FileCheck %s +module attributes {} { + func.func @test_cos(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + %0 = tosa.cos %arg0 : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + // CHECK: [[VAL0:%[0-9]+]] = tensor.empty() : [[TENSOR_SIZE:tensor<13x21x3xf32>]] + // CHECK: [[VAL1:%[0-9]+]] = "ttir.cos"(%arg{{[0-9]+}}, [[VAL0]]){{.+}}: ([[TENSOR_SIZE]], [[TENSOR_SIZE]]) -> [[TENSOR_SIZE]] + return %0 : tensor<13x21x3xf32> + // CHECK: return [[VAL1]] : [[TENSOR_SIZE]] + } +} diff --git a/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/exp.mlir b/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/exp.mlir new file mode 100644 index 0000000000..9575640211 --- /dev/null +++ b/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/exp.mlir @@ -0,0 +1,10 @@ 
+// RUN: ttmlir-opt --convert-tosa-to-ttir %s | FileCheck %s +module attributes {} { + func.func @test_exp(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + %0 = tosa.exp %arg0 : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + // CHECK: [[VAL0:%[0-9]+]] = tensor.empty() : [[TENSOR_SIZE:tensor<13x21x3xf32>]] + // CHECK: [[VAL1:%[0-9]+]] = "ttir.exp"(%arg{{[0-9]+}}, [[VAL0]]){{.+}}: ([[TENSOR_SIZE]], [[TENSOR_SIZE]]) -> [[TENSOR_SIZE]] + return %0 : tensor<13x21x3xf32> + // CHECK: return [[VAL1]] : [[TENSOR_SIZE]] + } +} diff --git a/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/floor.mlir b/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/floor.mlir new file mode 100644 index 0000000000..4653bfd3ee --- /dev/null +++ b/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/floor.mlir @@ -0,0 +1,10 @@ +// RUN: ttmlir-opt --convert-tosa-to-ttir %s | FileCheck %s +module attributes {} { + func.func @test_floor(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + %0 = tosa.floor %arg0 : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + // CHECK: [[VAL0:%[0-9]+]] = tensor.empty() : [[TENSOR_SIZE:tensor<13x21x3xf32>]] + // CHECK: [[VAL1:%[0-9]+]] = "ttir.floor"(%arg{{[0-9]+}}, [[VAL0]]){{.+}}: ([[TENSOR_SIZE]], [[TENSOR_SIZE]]) -> [[TENSOR_SIZE]] + return %0 : tensor<13x21x3xf32> + // CHECK: return [[VAL1]] : [[TENSOR_SIZE]] + } +} diff --git a/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/negate_op.mlir b/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/negate.mlir similarity index 100% rename from test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/negate_op.mlir rename to test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/negate.mlir diff --git a/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/reciprocal_op.mlir b/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/reciprocal.mlir similarity index 100% rename from test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/reciprocal_op.mlir rename to test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/reciprocal.mlir diff --git a/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/rsqrt_op.mlir b/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/rsqrt.mlir similarity index 100% rename from test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/rsqrt_op.mlir rename to test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/rsqrt.mlir diff --git a/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/sigmoid_op.mlir b/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/sigmoid.mlir similarity index 100% rename from test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/sigmoid_op.mlir rename to test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/sigmoid.mlir diff --git a/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/sin_op.mlir b/test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/sin.mlir similarity index 100% rename from test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/sin_op.mlir rename to test/ttmlir/Conversion/TosaToTTIR/elementwise_unary/sin.mlir diff --git a/test/ttmlir/Conversion/TosaToTTIR/logical/logical_and.mlir b/test/ttmlir/Conversion/TosaToTTIR/logical/logical_and.mlir new file mode 100644 index 0000000000..adab66f3a2 --- /dev/null +++ b/test/ttmlir/Conversion/TosaToTTIR/logical/logical_and.mlir @@ -0,0 +1,10 @@ +// RUN: ttmlir-opt --convert-tosa-to-ttir %s | FileCheck %s +module attributes {} { + func.func @test_logical_and(%arg0: tensor<13x21x3xi1>, %arg1: tensor<13x21x3xi1>) -> tensor<13x21x3xi1> { + %0 = tosa.logical_and %arg0, %arg1 : (tensor<13x21x3xi1>, tensor<13x21x3xi1>) -> tensor<13x21x3xi1> + // CHECK: 
[[VAL0:%[0-9]+]] = tensor.empty() : [[TENSOR_SIZE:tensor<13x21x3xi1>]] + // CHECK: [[VAL1:%[0-9]+]] = "ttir.logical_and"(%arg{{[0-9]+}}, %arg{{[0-9]+}}, [[VAL0]]){{.+}}: ([[TENSOR_SIZE]], [[TENSOR_SIZE]], [[TENSOR_SIZE]]) -> [[TENSOR_SIZE]] + return %0 : tensor<13x21x3xi1> + // CHECK: return [[VAL1]] : [[TENSOR_SIZE]] + } +} diff --git a/test/ttmlir/Conversion/TosaToTTIR/logical/logical_not.mlir b/test/ttmlir/Conversion/TosaToTTIR/logical/logical_not.mlir new file mode 100644 index 0000000000..ca74f1ab91 --- /dev/null +++ b/test/ttmlir/Conversion/TosaToTTIR/logical/logical_not.mlir @@ -0,0 +1,10 @@ +// RUN: ttmlir-opt --convert-tosa-to-ttir %s | FileCheck %s +module attributes {} { + func.func @test_logical_not(%arg0: tensor<13x21x3xi1>) -> tensor<13x21x3xi1> { + %0 = tosa.logical_not %arg0 : (tensor<13x21x3xi1>) -> tensor<13x21x3xi1> + // CHECK: [[VAL0:%[0-9]+]] = tensor.empty() : [[TENSOR_SIZE:tensor<13x21x3xi1>]] + // CHECK: [[VAL1:%[0-9]+]] = "ttir.logical_not"(%arg{{[0-9]+}}, [[VAL0]]){{.+}}: ([[TENSOR_SIZE]], [[TENSOR_SIZE]]) -> [[TENSOR_SIZE]] + return %0 : tensor<13x21x3xi1> + // CHECK: return [[VAL1]] : [[TENSOR_SIZE]] + } +} diff --git a/test/ttmlir/Conversion/TosaToTTIR/logical/logical_or.mlir b/test/ttmlir/Conversion/TosaToTTIR/logical/logical_or.mlir new file mode 100644 index 0000000000..4a4ab6eaef --- /dev/null +++ b/test/ttmlir/Conversion/TosaToTTIR/logical/logical_or.mlir @@ -0,0 +1,10 @@ +// RUN: ttmlir-opt --convert-tosa-to-ttir %s | FileCheck %s +module attributes {} { + func.func @test_logical_or(%arg0: tensor<13x21x3xi1>, %arg1: tensor<13x21x3xi1>) -> tensor<13x21x3xi1> { + %0 = tosa.logical_or %arg0, %arg1 : (tensor<13x21x3xi1>, tensor<13x21x3xi1>) -> tensor<13x21x3xi1> + // CHECK: [[VAL0:%[0-9]+]] = tensor.empty() : [[TENSOR_SIZE:tensor<13x21x3xi1>]] + // CHECK: [[VAL1:%[0-9]+]] = "ttir.logical_or"(%arg{{[0-9]+}}, %arg{{[0-9]+}}, [[VAL0]]){{.+}}: ([[TENSOR_SIZE]], [[TENSOR_SIZE]], [[TENSOR_SIZE]]) -> [[TENSOR_SIZE]] + return %0 : tensor<13x21x3xi1> + // CHECK: return [[VAL1]] : [[TENSOR_SIZE]] + } +} diff --git a/test/ttmlir/Conversion/TosaToTTIR/logical/logical_xor.mlir b/test/ttmlir/Conversion/TosaToTTIR/logical/logical_xor.mlir new file mode 100644 index 0000000000..6492691566 --- /dev/null +++ b/test/ttmlir/Conversion/TosaToTTIR/logical/logical_xor.mlir @@ -0,0 +1,10 @@ +// RUN: ttmlir-opt --convert-tosa-to-ttir %s | FileCheck %s +module attributes {} { + func.func @test_logical_xor(%arg0: tensor<13x21x3xi1>, %arg1: tensor<13x21x3xi1>) -> tensor<13x21x3xi1> { + %0 = tosa.logical_xor %arg0, %arg1 : (tensor<13x21x3xi1>, tensor<13x21x3xi1>) -> tensor<13x21x3xi1> + // CHECK: [[VAL0:%[0-9]+]] = tensor.empty() : [[TENSOR_SIZE:tensor<13x21x3xi1>]] + // CHECK: [[VAL1:%[0-9]+]] = "ttir.logical_xor"(%arg{{[0-9]+}}, %arg{{[0-9]+}}, [[VAL0]]){{.+}}([[TENSOR_SIZE]], [[TENSOR_SIZE]], [[TENSOR_SIZE]]) -> [[TENSOR_SIZE]] + return %0 : tensor<13x21x3xi1> + // CHECK: return [[VAL1]] : [[TENSOR_SIZE]] + } +} From 7a6f33588bf6d2527afb27fc2fe8fe23835942a0 Mon Sep 17 00:00:00 2001 From: Aleksandar Zecevic Date: Tue, 3 Dec 2024 22:20:26 +0100 Subject: [PATCH 44/84] Mark arange_tests_positive.mlir as UNSUPPORTED (#1478) `arange_tests_positive.mlir` XFAIL: * is causing `check-ttmlir` to fail with message. ``` Unexpectedly Passed Tests (1): TTMLIR :: ttmlir/Dialect/TTNN/arange/arange_tests_positive.mlir ``` I've marked this test as UNSUPPORTED for now, until we resolve it. 
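For reference, the two lit directives involved behave quite differently; the lines below are an illustrative sketch, not copied from the test itself:

```
// XFAIL: *
//   -> lit expects this test to fail; an unexpected pass (XPASS) makes the suite fail
// UNSUPPORTED: true
//   -> lit skips the test entirely; it is reported as Unsupported instead of pass/fail
```

Switching to UNSUPPORTED therefore keeps `check-ttmlir` green while the underlying issue linked in the test (#1448) is investigated.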
--- test/ttmlir/Dialect/TTNN/arange/arange_tests_positive.mlir | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/ttmlir/Dialect/TTNN/arange/arange_tests_positive.mlir b/test/ttmlir/Dialect/TTNN/arange/arange_tests_positive.mlir index 16c396c00e..945b6da5b3 100644 --- a/test/ttmlir/Dialect/TTNN/arange/arange_tests_positive.mlir +++ b/test/ttmlir/Dialect/TTNN/arange/arange_tests_positive.mlir @@ -1,5 +1,5 @@ // RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline %s | FileCheck %s -// XFAIL: * +// UNSUPPORTED: true // https://github.com/tenstorrent/tt-mlir/issues/1448 #any_device = #tt.operand_constraint module attributes {} { From ca09c6945b074797f09781483e91419813ebae94 Mon Sep 17 00:00:00 2001 From: Collin Tod Date: Tue, 3 Dec 2024 15:38:56 -0600 Subject: [PATCH 45/84] Create `ttmlir-lsp-server` (#1462) This change introduces the tool `ttmlir-lsp-server`. It will fall under `build/bin`, and is built alongside the rest of the compiler via `cmake --build build`. This is a language server that should be used alongside your IDE/Text editor to give you IDE like features while editing .mlir files. For more info, please see https://mlir.llvm.org/docs/Tools/MLIRLSP/ Closes #1383 --- lib/Conversion/TTKernelToEmitC/CMakeLists.txt | 2 +- tools/CMakeLists.txt | 1 + tools/ttmlir-lsp-server/CMakeLists.txt | 18 ++++++++++++++++++ tools/ttmlir-lsp-server/ttmlir-lsp-server.cpp | 15 +++++++++++++++ 4 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 tools/ttmlir-lsp-server/CMakeLists.txt create mode 100644 tools/ttmlir-lsp-server/ttmlir-lsp-server.cpp diff --git a/lib/Conversion/TTKernelToEmitC/CMakeLists.txt b/lib/Conversion/TTKernelToEmitC/CMakeLists.txt index 4ed57a5d41..429a694f31 100644 --- a/lib/Conversion/TTKernelToEmitC/CMakeLists.txt +++ b/lib/Conversion/TTKernelToEmitC/CMakeLists.txt @@ -1,4 +1,4 @@ -add_mlir_library(TTMLIRTTKernelToEmitC +add_mlir_conversion_library(TTMLIRTTKernelToEmitC TTKernelToEmitC.cpp ADDITIONAL_HEADER_DIRS diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index e558d0567e..e5a62f9c5a 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -1,3 +1,4 @@ add_subdirectory(ttmlir-opt) +add_subdirectory(ttmlir-lsp-server) add_subdirectory(ttmlir-translate) add_subdirectory(explorer) diff --git a/tools/ttmlir-lsp-server/CMakeLists.txt b/tools/ttmlir-lsp-server/CMakeLists.txt new file mode 100644 index 0000000000..1dd058a715 --- /dev/null +++ b/tools/ttmlir-lsp-server/CMakeLists.txt @@ -0,0 +1,18 @@ +get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) +get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS) +get_property(extension_libs GLOBAL PROPERTY MLIR_EXTENSION_LIBS) + +set(LIBS ${dialect_libs} ${conversion_libs} ${extension_libs} + MLIROptLib + MLIRTargetCpp + TTMLIRStatic + MLIRLspServerLib +) + +add_llvm_executable(ttmlir-lsp-server ttmlir-lsp-server.cpp DISABLE_LLVM_LINK_LLVM_DYLIB) +llvm_update_compile_flags(ttmlir-lsp-server) +target_link_libraries(ttmlir-lsp-server PRIVATE ${LIBS}) + +mlir_check_all_link_libraries(ttmlir-lsp-server) + +install(TARGETS ttmlir-lsp-server DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT Test EXCLUDE_FROM_ALL) diff --git a/tools/ttmlir-lsp-server/ttmlir-lsp-server.cpp b/tools/ttmlir-lsp-server/ttmlir-lsp-server.cpp new file mode 100644 index 0000000000..d23425e968 --- /dev/null +++ b/tools/ttmlir-lsp-server/ttmlir-lsp-server.cpp @@ -0,0 +1,15 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include 
"mlir/InitAllDialects.h" +#include "ttmlir/RegisterAll.h" + +#include "mlir/Tools/mlir-lsp-server/MlirLspServerMain.h" + +int main(int argc, char **argv) { + mlir::DialectRegistry registry; + mlir::tt::registerAllDialects(registry); + + return mlir::failed(mlir::MlirLspServerMain(argc, argv, registry)); +} From 29281af7749d726e0722a1773b540d96923a492a Mon Sep 17 00:00:00 2001 From: Jackson Nie Date: Tue, 3 Dec 2024 20:16:34 -0500 Subject: [PATCH 46/84] Runtime stitching APIs and sanity tests, ttnn runtime submit refactor (#1301) --- .github/workflows/build-and-test.yml | 96 ++++ runtime/CMakeLists.txt | 5 +- runtime/include/tt/runtime/detail/ttmetal.h | 12 +- runtime/include/tt/runtime/detail/ttnn.h | 55 ++- runtime/include/tt/runtime/runtime.h | 46 +- runtime/include/tt/runtime/test/utils.h | 17 + runtime/include/tt/runtime/types.h | 14 +- runtime/lib/binary.cpp | 37 +- runtime/lib/common/system_desc.cpp | 4 +- runtime/lib/runtime.cpp | 206 +++++++-- runtime/lib/ttmetal/command_queue.cpp | 14 +- runtime/lib/ttmetal/runtime.cpp | 28 +- runtime/lib/ttnn/CMakeLists.txt | 20 +- .../ttnn/include/tt/runtime/ttnn/types.cpp | 437 ++++++++++++++++++ .../lib/ttnn/include/tt/runtime/ttnn/types.h | 193 ++++---- .../ttnn/include/tt/runtime/ttnn/utils.cpp | 222 +++++++++ .../lib/ttnn/include/tt/runtime/ttnn/utils.h | 143 ++---- runtime/lib/ttnn/operations/CMakeLists.txt | 5 +- .../lib/ttnn/operations/ccl/all_gather.cpp | 3 +- runtime/lib/ttnn/operations/conv/conv2d.cpp | 4 +- .../lib/ttnn/operations/creation/arange.cpp | 2 +- .../lib/ttnn/operations/creation/empty.cpp | 13 +- runtime/lib/ttnn/operations/creation/full.cpp | 11 +- .../operations/data_movement/transpose.cpp | 3 +- .../ttnn/operations/deletion/deallocate.cpp | 7 - .../ttnn/operations/eltwise/binary/binary.cpp | 3 +- .../eltwise/binary/binary_composite.cpp | 3 +- .../operations/eltwise/ternary/ternary.cpp | 2 +- .../ttnn/operations/eltwise/unary/unary.cpp | 7 +- .../eltwise/unary/unary_composite.cpp | 16 +- .../ttnn/operations/embedding/embedding.cpp | 3 +- .../tt/runtime/ttnn/operations/utils.cpp | 107 +---- .../tt/runtime/ttnn/operations/utils.h | 17 - .../ttnn/operations/layout/from_device.cpp | 7 +- .../lib/ttnn/operations/layout/to_device.cpp | 2 +- .../lib/ttnn/operations/layout/to_layout.cpp | 2 +- .../lib/ttnn/operations/layout/typecast.cpp | 2 +- runtime/lib/ttnn/operations/matmul/matmul.cpp | 5 +- .../ttnn/operations/normalization/softmax.cpp | 3 +- .../lib/ttnn/operations/pool/maxpool2d.cpp | 3 +- .../ttnn/operations/reduction/reduction.cpp | 3 +- runtime/lib/ttnn/program.cpp | 124 ++++- runtime/lib/ttnn/runtime.cpp | 256 ++++++++-- runtime/test/CMakeLists.txt | 24 + .../include/tt/runtime/ttnn/test/utils.cpp | 50 ++ runtime/test/python/ttnn/conftest.py | 25 + runtime/test/python/ttnn/test_runtime_api.py | 160 +++++++ runtime/test/python/ttnn/utils.py | 66 +++ runtime/test/ttnn/test_subtract.cpp | 36 +- runtime/tools/python/CMakeLists.txt | 1 + runtime/tools/python/setup.py | 13 +- runtime/tools/python/ttrt/common/run.py | 40 +- runtime/tools/python/ttrt/common/util.py | 6 + runtime/tools/python/ttrt/runtime/__init__.py | 14 + runtime/tools/python/ttrt/runtime/module.cpp | 90 +++- .../unary/isfinite/simple_isfinite.mlir | 6 +- .../eltwise_binary_op_chain.mlir | 49 ++ .../Silicon/StableHLO/Unary/isfinite_op.mlir | 6 +- test/ttmlir/Silicon/StableHLO/select_op.mlir | 20 +- .../TTNN/perf_unit/test_perf_isfinite.mlir | 6 +- .../Silicon/TTNN/perf_unit/test_perf_le.mlir | 21 - .../TTNN/perf_unit/test_perf_where.mlir | 10 +- 
test/ttmlir/Silicon/TTNN/simple_eltwise.mlir | 16 +- 63 files changed, 2201 insertions(+), 620 deletions(-) create mode 100644 runtime/include/tt/runtime/test/utils.h create mode 100644 runtime/lib/ttnn/include/tt/runtime/ttnn/types.cpp create mode 100644 runtime/lib/ttnn/include/tt/runtime/ttnn/utils.cpp create mode 100644 runtime/test/include/tt/runtime/ttnn/test/utils.cpp create mode 100644 runtime/test/python/ttnn/conftest.py create mode 100644 runtime/test/python/ttnn/test_runtime_api.py create mode 100644 runtime/test/python/ttnn/utils.py create mode 100644 test/ttmlir/Runtime/TTNN/runtime_stitching/eltwise_binary_op_chain.mlir delete mode 100644 test/ttmlir/Silicon/TTNN/perf_unit/test_perf_le.mlir diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 8ec0c93dc2..c54d734b23 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -475,6 +475,102 @@ jobs: report_paths: ${{ steps.strings.outputs.test_report_path }} check_name: Run ttrt tests + run-runtime-api-tests: + + timeout-minutes: 30 + needs: + - build-image + - build-ttmlir + strategy: + fail-fast: false + matrix: + build: [ + {runs-on: n150, enable_perf: OFF, name: "run"}, + ] + + runs-on: + - in-service + - ${{ matrix.build.runs-on }} + + container: + image: ${{ needs.build-image.outputs.docker-image }} + options: --device /dev/tenstorrent/0 + volumes: + - /dev/hugepages:/dev/hugepages + - /dev/hugepages-1G:/dev/hugepages-1G + - /etc/udev/rules.d:/etc/udev/rules.d + - /lib/modules:/lib/modules + - /opt/tt_metal_infra/provisioning/provisioning_env:/opt/tt_metal_infra/provisioning/provisioning_env + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set reusable strings + id: strings + shell: bash + run: | + echo "work-dir=$(pwd)" >> "$GITHUB_OUTPUT" + echo "build-output-dir=$(pwd)/build" >> "$GITHUB_OUTPUT" + echo "install-output-dir=$(pwd)/install" >> "$GITHUB_OUTPUT" + + - name: Git safe dir + run: git config --global --add safe.directory ${{ steps.strings.outputs.work-dir }} + + - name: Use build artifacts + uses: actions/download-artifact@v4 + with: + name: install-artifacts-${{ matrix.build.name }} + path: ${{ steps.strings.outputs.install-output-dir }} + + # This is needed to preserve file permissions + # https://github.com/actions/upload-artifact?tab=readme-ov-file#permission-loss + - name: 'Untar install directory' + shell: bash + working-directory: ${{ steps.strings.outputs.install-output-dir }} + run: tar xvf artifact.tar + + - name: Remove existing whls files + shell: bash + run: | + rm -f *.whl + + - name: Download ttrt run whls + uses: actions/download-artifact@v4 + with: + name: ttrt-whl-${{ matrix.build.name }} + + # Runtime tests currently require ttrt whls to be installed + - name: Install ttrt run whls + shell: bash + run: | + source env/activate + pip show ttrt && pip uninstall -y ttrt + pip install ttrt-${{ env.version }}*.whl --force-reinstall + pip install pytest + + - name: Generate system descriptor + shell: bash + run: | + source env/activate + ttrt query --save-artifacts + + - name: Generate tests + shell: bash + run: | + source env/activate + export LD_LIBRARY_PATH="${TTMLIR_TOOLCHAIN_DIR}/lib:${LD_LIBRARY_PATH}" + export SYSTEM_DESC_PATH="${GITHUB_WORKSPACE}/ttrt-artifacts/system_desc.ttsys" + ln -sf ${{ steps.strings.outputs.install-output-dir }} ${{ steps.strings.outputs.build-output-dir }} + llvm-lit -sv ${{ steps.strings.outputs.build-output-dir }}/test + + - name: ttnn api tests + shell: bash 
+ run: | + source env/activate + pytest -ssv runtime/test/python/ttnn/test_runtime_api.py + build-and-test-explorer: needs: build-image timeout-minutes: 60 diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index c9dce10946..0a23c6ddac 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -14,6 +14,7 @@ set(TT_RUNTIME_OPTIONS TT_RUNTIME_DEBUG TT_RUNTIME_ENABLE_PERF_TRACE TT_RUNTIME_WORKAROUNDS + TTMLIR_ENABLE_RUNTIME_TESTS ) foreach(OPTION ${TT_RUNTIME_OPTIONS}) @@ -24,6 +25,4 @@ endforeach() add_subdirectory(lib) add_subdirectory(tools) -if (TTMLIR_ENABLE_RUNTIME_TESTS) - add_subdirectory(test) -endif() +add_subdirectory(test) diff --git a/runtime/include/tt/runtime/detail/ttmetal.h b/runtime/include/tt/runtime/detail/ttmetal.h index 5544e1d70f..1b043f6e58 100644 --- a/runtime/include/tt/runtime/detail/ttmetal.h +++ b/runtime/include/tt/runtime/detail/ttmetal.h @@ -39,12 +39,16 @@ void closeDevice(Device device); void deallocateBuffers(Device device); -Event submit(Device device, Binary executable, std::uint32_t programIndex, - std::vector const &inputs, - std::vector const &outputs); - void wait(Event event); +void wait(Tensor tensor); + +void wait(std::vector const &tensors); + +Event submit(Device deviceHandle, Binary executableHandle, + std::uint32_t programIndex, std::vector const &inputs, + std::vector const &outputs); + std::string getOpDebugString(OpContext opContextHandle); std::string getOpLocInfo(OpContext opContextHandle); diff --git a/runtime/include/tt/runtime/detail/ttnn.h b/runtime/include/tt/runtime/detail/ttnn.h index 67aa91a71e..e7b8fbcf21 100644 --- a/runtime/include/tt/runtime/detail/ttnn.h +++ b/runtime/include/tt/runtime/detail/ttnn.h @@ -53,16 +53,27 @@ createTensor(std::vector> &data, ::tt::target::DataType dataType, std::unordered_map const &strategy); +Tensor createTensor(Device device, Layout layout, + std::vector const &shape, + std::vector const &stride, + std::uint32_t itemsize); + inline Tensor createTensor(std::shared_ptr data, TensorDesc const &desc) { - return createTensor(data, desc.shape, desc.stride, desc.itemsize, - desc.dataType); + return ::tt::runtime::ttnn::createTensor(data, desc.shape, desc.stride, + desc.itemsize, desc.dataType); } inline Tensor createTensor(std::vector> &data, TensorDesc const &desc, std::unordered_map const &strategy) { - return createTensor(data, desc.shape, desc.stride, desc.itemsize, - desc.dataType, strategy); + return ::tt::runtime::ttnn::createTensor( + data, desc.shape, desc.stride, desc.itemsize, desc.dataType, strategy); +} + +inline Tensor createTensor(Device device, Layout layout, + TensorDesc const &desc) { + return ::tt::runtime::ttnn::createTensor(device, layout, desc.shape, + desc.stride, desc.itemsize); } tt::target::DataType getTensorDataType(Tensor tensor); @@ -75,12 +86,23 @@ void closeDevice(Device device); void deallocateBuffers(Device device); -Event submit(Device device, Binary executable, std::uint32_t programIndex, - std::vector const &inputs, - std::vector const &outputs); - void wait(Event event); +void wait(Tensor tensor); + +void wait(std::vector const &tensors); + +Tensor toHost(Tensor tensor, bool untilize = false); + +Tensor toLayout(Tensor tensor, Device device, Layout layout); + +Layout getLayout(Binary executableHandle, std::uint32_t programIndex, + std::uint32_t inputIndex); + +void memcpy(Tensor dst, Tensor src); + +void deallocateTensor(Tensor &tensor, bool force = false); + std::string getOpDebugString(OpContext opContextHandle); std::string 
getOpLocInfo(OpContext opContextHandle); @@ -90,10 +112,27 @@ Tensor getOpOutputTensor(OpContext opContextHandle, std::vector getTensorData(Tensor tensor); +namespace legacy { +/* Will be deprecated soon once FEs migrate to new API */ + +Event submit(Device deviceHandle, Binary executableHandle, + std::uint32_t programIndex, std::vector const &inputs, + std::vector const &outputs); + void runProgram(::ttnn::MeshDevice &meshDevice, Binary &executableHandle, std::uint32_t programIndex, std::vector<::ttnn::Tensor *> const &inputs, std::vector<::ttnn::Tensor *> const &outputs); +} // namespace legacy + +std::vector submit(Device deviceHandle, Binary executableHandle, + std::uint32_t programIndex, + std::vector const &inputs); + +std::vector runProgram(::ttnn::MeshDevice &meshDevice, + Binary executableHandle, + std::uint32_t programIndex, + std::vector<::ttnn::Tensor *> const &inputs); } // namespace tt::runtime::ttnn diff --git a/runtime/include/tt/runtime/runtime.h b/runtime/include/tt/runtime/runtime.h index e4348da608..56666d564f 100644 --- a/runtime/include/tt/runtime/runtime.h +++ b/runtime/include/tt/runtime/runtime.h @@ -43,16 +43,27 @@ createTensor(std::vector> &data, ::tt::target::DataType dataType, std::unordered_map const &strategy); +Tensor createTensor(Device device, Layout layout, + std::vector const &shape, + std::vector const &stride, + std::uint32_t itemsize); + inline Tensor createTensor(std::shared_ptr data, TensorDesc const &desc) { - return createTensor(data, desc.shape, desc.stride, desc.itemsize, - desc.dataType); + return ::tt::runtime::createTensor(data, desc.shape, desc.stride, + desc.itemsize, desc.dataType); } inline Tensor createTensor(std::vector> &data, TensorDesc const &desc, std::unordered_map const &strategy) { - return createTensor(data, desc.shape, desc.stride, desc.itemsize, - desc.dataType, strategy); + return ::tt::runtime::createTensor(data, desc.shape, desc.stride, + desc.itemsize, desc.dataType, strategy); +} + +inline Tensor createTensor(Device device, Layout layout, + TensorDesc const &desc) { + return ::tt::runtime::createTensor(device, layout, desc.shape, desc.stride, + desc.itemsize); } tt::target::DataType getTensorDataType(Tensor tensor); @@ -63,12 +74,23 @@ Device openDevice(DeviceIds const &deviceIds, size_t numHWCQs = 1); void closeDevice(Device device); -Event submit(Device device, Binary executable, std::uint32_t programIndex, - std::vector const &inputs, - std::vector const &outputs); - void wait(Event event); +void wait(Tensor tensor); + +void wait(std::vector const &tensors); + +Tensor toHost(Tensor tensor, bool untilize = false); + +Tensor toLayout(Tensor tensor, Device device, Layout layout); + +Layout getLayout(Binary executableHandle, std::uint32_t programIndex, + std::uint32_t inputIndex); + +void memcpy(Tensor dst, Tensor src); + +void deallocateTensor(Tensor &tensor, bool force = false); + std::string getOpDebugString(OpContext opContextHandle); std::string getOpLocInfo(OpContext opContextHandle); @@ -78,6 +100,14 @@ Tensor getOpOutputTensor(OpContext opContextHandle, std::vector getTensorData(Tensor tensor); +std::vector submit(Device deviceHandle, Binary executableHandle, + std::uint32_t programIndex, + std::vector const &inputs); + +Event submit(Device deviceHandle, Binary executableHandle, + std::uint32_t programIndex, std::vector const &inputs, + std::vector const &outputs); + } // namespace tt::runtime #endif diff --git a/runtime/include/tt/runtime/test/utils.h b/runtime/include/tt/runtime/test/utils.h new file mode 100644 
index 0000000000..e4323cc165 --- /dev/null +++ b/runtime/include/tt/runtime/test/utils.h @@ -0,0 +1,17 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef TT_RUNTIME_TEST_UTILS_H +#define TT_RUNTIME_TEST_UTILS_H + +#include "tt/runtime/types.h" + +// Utility functions for testing TTNN runtime +namespace tt::runtime::ttnn::test { +Layout getDramInterleavedTileLayout(::tt::target::DataType dataType); +Layout getDramInterleavedRowMajorLayout(::tt::target::DataType dataType); +Layout getHostRowMajorLayout(::tt::target::DataType dataType); +} // namespace tt::runtime::ttnn::test + +#endif // TT_RUNTIME_TEST_UTILS_H diff --git a/runtime/include/tt/runtime/types.h b/runtime/include/tt/runtime/types.h index 8fd641195f..cc2791e237 100644 --- a/runtime/include/tt/runtime/types.h +++ b/runtime/include/tt/runtime/types.h @@ -122,10 +122,20 @@ struct Event : public detail::RuntimeCheckedObjectImpl { struct Tensor : public detail::RuntimeCheckedObjectImpl { std::shared_ptr data; - + Event event; Tensor(std::shared_ptr handle, std::shared_ptr data, DeviceRuntime runtime) - : detail::RuntimeCheckedObjectImpl(handle, runtime), data(data) {} + : detail::RuntimeCheckedObjectImpl(handle, runtime), data(data), + event(nullptr, runtime) {} + + Tensor(std::shared_ptr handle, std::shared_ptr data, + std::shared_ptr eventHandle, DeviceRuntime runtime) + : detail::RuntimeCheckedObjectImpl(handle, runtime), data(data), + event(eventHandle, runtime) {} +}; + +struct Layout : public detail::RuntimeCheckedObjectImpl { + using detail::RuntimeCheckedObjectImpl::RuntimeCheckedObjectImpl; }; struct CallbackContext : public detail::RuntimeCheckedObjectImpl { diff --git a/runtime/lib/binary.cpp b/runtime/lib/binary.cpp index 92be39d27f..1d8cbf38b2 100644 --- a/runtime/lib/binary.cpp +++ b/runtime/lib/binary.cpp @@ -27,15 +27,12 @@ static std::string asJson(void const *fbb, uint8_t const *binarySchema, flatbuffers::Parser parser(opts); if (not parser.Deserialize(binarySchema, schemaSize)) { - throw std::runtime_error("Failed to deserialize schema"); + LOG_FATAL("Failed to deserialize schema"); } std::string text; const char *err = ::flatbuffers::GenerateText(parser, fbb, &text); - if (err) { - throw std::runtime_error("Failed to generate JSON: " + std::string(err)); - } - + LOG_ASSERT(not err, "Failed to generate JSON: ", err); return text; } @@ -44,9 +41,7 @@ namespace ttnn { ::tt::target::ttnn::TTNNBinary const *getBinary(Flatbuffer binary) { bool isTTNN = ::tt::target::ttnn::SizePrefixedTTNNBinaryBufferHasIdentifier( binary.handle.get()); - if (not isTTNN) { - throw std::runtime_error("Unsupported binary format"); - } + LOG_ASSERT(isTTNN, "Unsupported binary format"); return ::tt::target::ttnn::GetSizePrefixedTTNNBinary(binary.handle.get()); } @@ -128,9 +123,7 @@ ::tt::target::metal::TTMetalBinary const *getBinary(Flatbuffer binary) { bool isTTMetal = ::tt::target::metal::SizePrefixedTTMetalBinaryBufferHasIdentifier( binary.handle.get()); - if (not isTTMetal) { - throw std::runtime_error("Unsupported binary format"); - } + LOG_ASSERT(isTTMetal, "Unsupported binary format"); return ::tt::target::metal::GetSizePrefixedTTMetalBinary(binary.handle.get()); } @@ -207,7 +200,7 @@ namespace system_desc { ::tt::target::SystemDescRoot const *getBinary(Flatbuffer binary) { if (!::tt::target::SizePrefixedSystemDescRootBufferHasIdentifier( binary.handle.get())) { - throw std::runtime_error("Unsupported binary format"); + LOG_FATAL("Unsupported binary format"); } return 
::tt::target::GetSizePrefixedSystemDescRoot(binary.handle.get()); } @@ -234,10 +227,7 @@ std::string asJson(Flatbuffer binary) { Flatbuffer Flatbuffer::loadFromPath(char const *path) { // load a flatbuffer from path std::ifstream fbb(path, std::ios::binary | std::ios::ate); - if (!fbb.is_open()) { - throw std::runtime_error("Failed to open file: " + std::string(path)); - } - + LOG_ASSERT(fbb.is_open(), "Failed to open file: ", path); std::streampos size = fbb.tellg(); fbb.seekg(0, std::ios::beg); auto buffer = ::tt::runtime::utils::malloc_shared(size); @@ -269,7 +259,7 @@ std::string_view Flatbuffer::getFileIdentifier() const { return ::tt::target::SystemDescRootIdentifier(); } - throw std::runtime_error("Unsupported binary format"); + LOG_FATAL("Unsupported binary format"); } std::string Flatbuffer::getVersion() const { @@ -288,7 +278,7 @@ std::string Flatbuffer::getVersion() const { return system_desc::getVersion(*this); } - throw std::runtime_error("Unsupported binary format"); + LOG_FATAL("Unsupported binary format"); } std::string_view Flatbuffer::getTTMLIRGitHash() const { @@ -307,7 +297,7 @@ std::string_view Flatbuffer::getTTMLIRGitHash() const { return system_desc::getTTMLIRGitHash(*this); } - throw std::runtime_error("Unsupported binary format"); + LOG_FATAL("Unsupported binary format"); } std::string Flatbuffer::asJson() const { @@ -326,7 +316,7 @@ std::string Flatbuffer::asJson() const { return system_desc::asJson(*this); } - throw std::runtime_error("Unsupported binary format"); + LOG_FATAL("Unsupported binary format"); } SystemDesc SystemDesc::loadFromPath(char const *path) { @@ -349,7 +339,7 @@ Binary::getProgramInputs(std::uint32_t programIndex) const { return metal::getProgramInputs(*this, programIndex); } - throw std::runtime_error("Unsupported binary format"); + LOG_FATAL("Unsupported binary format"); } std::vector @@ -364,7 +354,7 @@ Binary::getProgramOutputs(std::uint32_t programIndex) const { return metal::getProgramOutputs(*this, programIndex); } - throw std::runtime_error("Unsupported binary format"); + LOG_FATAL("Unsupported binary format"); } const ::tt::target::GoldenTensor * @@ -379,8 +369,7 @@ Binary::getDebugInfoGolden(std::string &loc) const { return metal::getDebugInfoGolden(*this, loc); } - throw std::runtime_error( - "Unsupported binary format for obtaining golden information"); + LOG_FATAL("Unsupported binary format for obtaining golden information"); } } // namespace tt::runtime diff --git a/runtime/lib/common/system_desc.cpp b/runtime/lib/common/system_desc.cpp index f1210d00aa..3b4685901a 100644 --- a/runtime/lib/common/system_desc.cpp +++ b/runtime/lib/common/system_desc.cpp @@ -32,7 +32,7 @@ static ::tt::target::Arch toFlatbuffer(::tt::ARCH arch) { break; } - throw std::runtime_error("Unsupported arch"); + LOG_FATAL("Unsupported arch"); } static std::vector<::tt::target::ChipChannel> @@ -246,7 +246,7 @@ static std::unique_ptr<::tt::runtime::SystemDesc> getCurrentSystemDescImpl( ::tt::target::FinishSizePrefixedSystemDescRootBuffer(fbb, root); ::flatbuffers::Verifier verifier(fbb.GetBufferPointer(), fbb.GetSize()); if (!::tt::target::VerifySizePrefixedSystemDescRootBuffer(verifier)) { - throw std::runtime_error("Failed to verify system desc root buffer"); + LOG_FATAL("Failed to verify system desc root buffer"); } uint8_t *buf = fbb.GetBufferPointer(); auto size = fbb.GetSize(); diff --git a/runtime/lib/runtime.cpp b/runtime/lib/runtime.cpp index a57ac3fcd3..bf61133089 100644 --- a/runtime/lib/runtime.cpp +++ b/runtime/lib/runtime.cpp @@ -42,7 +42,7 @@ 
void deallocateBuffers(Device device) { return ::tt::runtime::ttmetal::deallocateBuffers(device); } #endif - throw std::runtime_error("runtime is not enabled"); + LOG_FATAL("runtime is not enabled"); } } // namespace detail @@ -91,15 +91,14 @@ void setCompatibleRuntime(const Binary &binary) { return setCurrentRuntime(DeviceRuntime::TTMetal); } #endif - throw std::runtime_error( - "Unsupported binary file identifier or runtime not enabled"); + LOG_FATAL("Unsupported binary file identifier or runtime not enabled"); } std::pair getCurrentSystemDesc() { #if defined(TT_RUNTIME_ENABLE_TTNN) || defined(TT_RUNTIME_ENABLE_TTMETAL) return system_desc::getCurrentSystemDesc(); #endif - throw std::runtime_error("runtime is not enabled"); + LOG_FATAL("runtime is not enabled"); } Tensor createTensor(std::shared_ptr data, @@ -122,7 +121,7 @@ Tensor createTensor(std::shared_ptr data, dataType); } #endif - throw std::runtime_error("runtime is not enabled"); + LOG_FATAL("runtime is not enabled"); } Tensor @@ -143,10 +142,32 @@ createTensor(std::vector> &data, #if defined(TT_RUNTIME_ENABLE_TTMETAL) if (getCurrentRuntime() == DeviceRuntime::TTMetal) { - throw std::runtime_error("Not implemented"); + LOG_FATAL("Not implemented"); } #endif - throw std::runtime_error("runtime is not enabled"); + LOG_FATAL("runtime is not enabled"); +} + +Tensor createTensor(Device device, Layout layout, + std::vector const &shape, + std::vector const &stride, + std::uint32_t itemsize) { + LOG_ASSERT(not shape.empty()); + LOG_ASSERT(not stride.empty()); + LOG_ASSERT(itemsize > 0); +#if defined(TT_RUNTIME_ENABLE_TTNN) + if (getCurrentRuntime() == DeviceRuntime::TTNN) { + return ::tt::runtime::ttnn::createTensor(device, layout, shape, stride, + itemsize); + } +#endif + +#if defined(TT_RUNTIME_ENABLE_TTMETAL) + if (getCurrentRuntime() == DeviceRuntime::TTMetal) { + LOG_FATAL("Not implemented"); + } +#endif + LOG_FATAL("runtime is not enabled"); } tt::target::DataType getTensorDataType(Tensor tensor) { @@ -161,7 +182,7 @@ tt::target::DataType getTensorDataType(Tensor tensor) { return ::tt::runtime::ttmetal::getTensorDataType(tensor); } #endif - throw std::runtime_error("runtime is not enabled"); + LOG_FATAL("runtime is not enabled"); } size_t getNumAvailableDevices() { @@ -176,7 +197,7 @@ size_t getNumAvailableDevices() { return ::tt::runtime::ttmetal::getNumAvailableDevices(); } #endif - throw std::runtime_error("runtime is not enabled"); + LOG_FATAL("runtime is not enabled"); } Device openDevice(DeviceIds const &deviceIds, size_t numHWCQs) { @@ -191,7 +212,7 @@ Device openDevice(DeviceIds const &deviceIds, size_t numHWCQs) { return ::tt::runtime::ttmetal::openDevice(deviceIds, numHWCQs); } #endif - throw std::runtime_error("runtime is not enabled"); + LOG_FATAL("runtime is not enabled"); } void closeDevice(Device device) { @@ -206,44 +227,130 @@ void closeDevice(Device device) { return ::tt::runtime::ttmetal::closeDevice(device); } #endif - throw std::runtime_error("runtime is not enabled"); + LOG_FATAL("runtime is not enabled"); } -Event submit(Device deviceHandle, Binary executableHandle, - std::uint32_t programIndex, - std::vector const &inputHandles, - std::vector const &outputHandles) { +void wait(Event event) { #if defined(TT_RUNTIME_ENABLE_TTNN) if (getCurrentRuntime() == DeviceRuntime::TTNN) { - return ::tt::runtime::ttnn::submit(deviceHandle, executableHandle, - programIndex, inputHandles, - outputHandles); + LOG_WARNING("wait API will be deprecated for TTNN runtime."); + return ::tt::runtime::ttnn::wait(event); } #endif 
#if defined(TT_RUNTIME_ENABLE_TTMETAL) if (getCurrentRuntime() == DeviceRuntime::TTMetal) { - return ::tt::runtime::ttmetal::submit(deviceHandle, executableHandle, - programIndex, inputHandles, - outputHandles); + return ::tt::runtime::ttmetal::wait(event); } #endif - throw std::runtime_error("runtime is not enabled"); + LOG_FATAL("runtime is not enabled"); } -void wait(Event event) { +void wait(Tensor tensor) { #if defined(TT_RUNTIME_ENABLE_TTNN) if (getCurrentRuntime() == DeviceRuntime::TTNN) { - return ::tt::runtime::ttnn::wait(event); + return ::tt::runtime::ttnn::wait(tensor); } #endif #if defined(TT_RUNTIME_ENABLE_TTMETAL) if (getCurrentRuntime() == DeviceRuntime::TTMetal) { - return ::tt::runtime::ttmetal::wait(event); + return ::tt::runtime::ttmetal::wait(tensor); } #endif - throw std::runtime_error("runtime is not enabled"); + LOG_FATAL("runtime is not enabled"); +} + +void wait(std::vector const &tensors) { +#if defined(TT_RUNTIME_ENABLE_TTNN) + if (getCurrentRuntime() == DeviceRuntime::TTNN) { + return ::tt::runtime::ttnn::wait(tensors); + } +#endif + +#if defined(TT_RUNTIME_ENABLE_TTMETAL) + if (getCurrentRuntime() == DeviceRuntime::TTMetal) { + return ::tt::runtime::ttmetal::wait(tensors); + } +#endif + LOG_FATAL("runtime is not enabled"); +} + +Tensor toHost(Tensor tensor, bool untilize) { +#if defined(TT_RUNTIME_ENABLE_TTNN) + if (getCurrentRuntime() == DeviceRuntime::TTNN) { + return ::tt::runtime::ttnn::toHost(tensor, untilize); + } +#endif + +#if defined(TT_RUNTIME_ENABLE_TTMETAL) + if (getCurrentRuntime() == DeviceRuntime::TTMetal) { + LOG_FATAL("not implemented"); + } +#endif + LOG_FATAL("runtime is not enabled"); +} + +Tensor toLayout(Tensor tensor, Device device, Layout layout) { +#if defined(TT_RUNTIME_ENABLE_TTNN) + if (getCurrentRuntime() == DeviceRuntime::TTNN) { + return ::tt::runtime::ttnn::toLayout(tensor, device, layout); + } +#endif + +#if defined(TT_RUNTIME_ENABLE_TTMETAL) + if (getCurrentRuntime() == DeviceRuntime::TTMetal) { + LOG_FATAL("not implemented"); + } +#endif + LOG_FATAL("runtime is not enabled"); +} + +Layout getLayout(Binary executableHandle, std::uint32_t programIndex, + std::uint32_t inputIndex) { +#if defined(TT_RUNTIME_ENABLE_TTNN) + if (getCurrentRuntime() == DeviceRuntime::TTNN) { + return ::tt::runtime::ttnn::getLayout(executableHandle, programIndex, + inputIndex); + } +#endif + +#if defined(TT_RUNTIME_ENABLE_TTMETAL) + if (getCurrentRuntime() == DeviceRuntime::TTMetal) { + LOG_FATAL("not implemented"); + } +#endif + LOG_FATAL("runtime is not enabled"); +} + +void memcpy(Tensor dst, Tensor src) { +#if defined(TT_RUNTIME_ENABLE_TTNN) + if (getCurrentRuntime() == DeviceRuntime::TTNN) { + return ::tt::runtime::ttnn::memcpy(dst, src); + } +#endif + +#if defined(TT_RUNTIME_ENABLE_TTMETAL) + if (getCurrentRuntime() == DeviceRuntime::TTMetal) { + LOG_FATAL("not implemented"); + } +#endif + LOG_FATAL("runtime is not enabled"); +} + +void deallocateTensor(Tensor &tensor, bool force) { +#if defined(TT_RUNTIME_ENABLE_TTNN) + if (getCurrentRuntime() == DeviceRuntime::TTNN) { + return ::tt::runtime::ttnn::deallocateTensor(tensor, force); + } +#endif + +#if defined(TT_RUNTIME_ENABLE_TTMETAL) + if (getCurrentRuntime() == DeviceRuntime::TTMetal) { + LOG_FATAL("not implemented"); + } +#endif + LOG_FATAL("runtime is not enabled"); } std::string getOpDebugString(OpContext opContextHandle) { @@ -258,7 +365,7 @@ std::string getOpDebugString(OpContext opContextHandle) { return ::tt::runtime::ttmetal::getOpDebugString(opContextHandle); } #endif - throw 
std::runtime_error("runtime is not enabled"); + LOG_FATAL("runtime is not enabled"); } std::string getOpLocInfo(OpContext opContextHandle) { @@ -291,7 +398,7 @@ Tensor getOpOutputTensor(OpContext opContextHandle, programContextHandle); } #endif - throw std::runtime_error("runtime is not enabled"); + LOG_FATAL("runtime is not enabled"); } std::vector getTensorData(Tensor tensor) { @@ -307,7 +414,48 @@ std::vector getTensorData(Tensor tensor) { } #endif - throw std::runtime_error("runtime is not enabled"); + LOG_FATAL("runtime is not enabled"); +} + +std::vector submit(Device deviceHandle, Binary executableHandle, + std::uint32_t programIndex, + std::vector const &inputHandles) { +#if defined(TT_RUNTIME_ENABLE_TTNN) + if (getCurrentRuntime() == DeviceRuntime::TTNN) { + return ::tt::runtime::ttnn::submit(deviceHandle, executableHandle, + programIndex, inputHandles); + } +#endif + +#if defined(TT_RUNTIME_ENABLE_TTMETAL) + if (getCurrentRuntime() == DeviceRuntime::TTMetal) { + LOG_FATAL("not implemented"); + } +#endif + LOG_FATAL("runtime is not enabled"); } +Event submit(Device deviceHandle, Binary executableHandle, + std::uint32_t programIndex, + std::vector const &inputHandles, + std::vector const &outputHandles) { +#if defined(TT_RUNTIME_ENABLE_TTNN) + if (getCurrentRuntime() == DeviceRuntime::TTNN) { + LOG_WARNING("This submit API will soon be deprecated. Please switch to the " + "new API."); + return ::tt::runtime::ttnn::legacy::submit(deviceHandle, executableHandle, + programIndex, inputHandles, + outputHandles); + } +#endif + +#if defined(TT_RUNTIME_ENABLE_TTMETAL) + if (getCurrentRuntime() == DeviceRuntime::TTMetal) { + return ::tt::runtime::ttmetal::submit(deviceHandle, executableHandle, + programIndex, inputHandles, + outputHandles); + } +#endif + LOG_FATAL("runtime is not enabled"); +} } // namespace tt::runtime diff --git a/runtime/lib/ttmetal/command_queue.cpp b/runtime/lib/ttmetal/command_queue.cpp index 9a408a66b1..3480458e6a 100644 --- a/runtime/lib/ttmetal/command_queue.cpp +++ b/runtime/lib/ttmetal/command_queue.cpp @@ -137,7 +137,7 @@ void CQExecutor::execute(::tt::target::metal::Command const *command) { break; } default: - throw std::runtime_error("Unsupported command type"); + LOG_FATAL("Unsupported command type"); break; } } @@ -328,7 +328,7 @@ createKernelConfig(::tt::target::metal::KernelSource const *kernelSource) { break; } } - throw std::runtime_error("Unsupported kernel source type"); + LOG_FATAL("Unsupported kernel source type"); } static ::tt::DataFormat toDataFormat(::tt::target::DataType dataType) { @@ -346,7 +346,7 @@ static ::tt::DataFormat toDataFormat(::tt::target::DataType dataType) { case ::tt::target::DataType::UInt8: return ::tt::DataFormat::UInt8; default: - throw std::runtime_error("Unsupported data type"); + LOG_FATAL("Unsupported data type"); } } @@ -358,7 +358,7 @@ static CoreType toCoreType(::tt::target::metal::CoreType coreType) { case ::tt::target::metal::CoreType::ETH: return CoreType::ETH; } - throw std::runtime_error("Unsupported core type"); + LOG_FATAL("Unsupported core type"); } static ::tt::tt_metal::CircularBufferConfig createCircularBufferConfig( @@ -427,7 +427,7 @@ static void processRuntimeArgs( break; } case ::tt::target::metal::RuntimeArg::NONE: - throw std::runtime_error("Unsupported runtime arg type"); + LOG_FATAL("Unsupported runtime arg type"); } } @@ -516,7 +516,7 @@ void CQExecutor::execute( break; } default: - throw std::runtime_error("Unsupported HostBuffer type"); + LOG_FATAL("Unsupported HostBuffer type"); } } @@ -524,7 
+524,7 @@ void CQExecutor::execute( ::tt::target::metal::EnqueueReadBufferCommand const *command) { ZoneScopedN("EnqueueReadBufferCommand"); // Maybe we will need this in the future, like paging to system mem? - throw std::runtime_error("Unsupported EnqueueReadBufferCommand"); + LOG_FATAL("Unsupported EnqueueReadBufferCommand"); } void CQExecutor::execute( diff --git a/runtime/lib/ttmetal/runtime.cpp b/runtime/lib/ttmetal/runtime.cpp index 22d43ba366..2a66aa5e65 100644 --- a/runtime/lib/ttmetal/runtime.cpp +++ b/runtime/lib/ttmetal/runtime.cpp @@ -24,7 +24,7 @@ static ::tt::target::metal::TTMetalBinary const *getBinary(Flatbuffer binary) { ::tt::target::metal::SizePrefixedTTMetalBinaryBufferHasIdentifier( binary.handle.get()); if (not isTTMetal) { - throw std::runtime_error("Unsupported binary format"); + LOG_FATAL("Unsupported binary format"); } return ::tt::target::metal::GetSizePrefixedTTMetalBinary(binary.handle.get()); } @@ -56,7 +56,7 @@ tt::target::DataType getTensorDataType(Tensor tensor) { } if (std::holds_alternative>( metalTensor)) { - throw std::runtime_error("Datatype mapping from buffer not supported yet."); + LOG_FATAL("Datatype mapping from buffer not supported yet."); } LOG_ASSERT(false, "Unsupported tensor type"); return ::tt::target::DataType::Float32; @@ -96,6 +96,21 @@ void deallocateBuffers(Device deviceHandle) { } } +void wait(Event event) { + Events events = event.as(DeviceRuntime::TTMetal); + for (auto e : events) { + ::tt::tt_metal::EventSynchronize(e); + } +} + +void wait(Tensor tensor) { ::tt::runtime::ttmetal::wait(tensor.event); } + +void wait(std::vector const &tensors) { + for (Tensor tensor : tensors) { + ::tt::runtime::ttmetal::wait(tensor); + } +} + static std::pair, std::shared_ptr<::tt::tt_metal::Event>> prepareInput(::tt::tt_metal::Device *device, MetalTensor const &metalTensor, @@ -117,7 +132,7 @@ prepareInput(::tt::tt_metal::Device *device, MetalTensor const &metalTensor, metalTensor)) { std::shared_ptr<::tt::tt_metal::Buffer> buffer = std::get>(metalTensor); - throw std::runtime_error("Input from buffer not supported yet"); + LOG_FATAL("Input from buffer not supported yet"); } LOG_ASSERT(false, "Unsupported tensor type"); return std::make_pair(nullptr, nullptr); @@ -249,13 +264,6 @@ Event submit(Device deviceHandle, Binary executableHandle, return Event(static_pointer_cast(events), DeviceRuntime::TTMetal); } -void wait(Event event) { - Events events = event.as(DeviceRuntime::TTMetal); - for (auto e : events) { - ::tt::tt_metal::EventSynchronize(e); - } -} - std::string getOpDebugString(OpContext opContextHandle) { // Not implemented LOG_WARNING("obtaining op debug string for metal runtime not implemented"); diff --git a/runtime/lib/ttnn/CMakeLists.txt b/runtime/lib/ttnn/CMakeLists.txt index 92581cf46f..6a68c4c7b9 100644 --- a/runtime/lib/ttnn/CMakeLists.txt +++ b/runtime/lib/ttnn/CMakeLists.txt @@ -1,4 +1,22 @@ +add_library(TTRuntimeTTNNHelpers + STATIC + ${CMAKE_CURRENT_SOURCE_DIR}/include/tt/runtime/ttnn/utils.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/include/tt/runtime/ttnn/types.cpp +) +set_property(TARGET TTRuntimeTTNNHelpers PROPERTY CXX_STANDARD 20) +target_compile_options(TTRuntimeTTNNHelpers PUBLIC -mavx -mavx2 -fsized-deallocation) +target_include_directories(TTRuntimeTTNNHelpers PUBLIC + ${PROJECT_SOURCE_DIR}/runtime/include + ${PROJECT_SOURCE_DIR}/runtime/lib/ttnn/include + ${PROJECT_SOURCE_DIR}/runtime/lib/ttnn/operations/include + ${PROJECT_BINARY_DIR}/include/ttmlir/Target/Common +) +target_include_directories(TTRuntimeTTNNHelpers SYSTEM 
PUBLIC "$") +add_dependencies(TTRuntimeTTNNHelpers TTNN_LIBRARY tt-metal FBS_GENERATION) +target_link_libraries(TTRuntimeTTNNHelpers PUBLIC TTNN_LIBRARY) + add_subdirectory(operations) + add_library(TTRuntimeTTNN STATIC runtime.cpp @@ -11,5 +29,5 @@ target_include_directories(TTRuntimeTTNN PUBLIC ${PROJECT_BINARY_DIR}/include/ttmlir/Target/Common ) target_include_directories(TTRuntimeTTNN SYSTEM PUBLIC "$") -target_link_libraries(TTRuntimeTTNN PUBLIC TTRuntimeTTNNOps) +target_link_libraries(TTRuntimeTTNN PUBLIC TTRuntimeTTNNOps TTRuntimeTTNNHelpers) add_dependencies(TTRuntimeTTNN TTRuntimeTTNNOps) diff --git a/runtime/lib/ttnn/include/tt/runtime/ttnn/types.cpp b/runtime/lib/ttnn/include/tt/runtime/ttnn/types.cpp new file mode 100644 index 0000000000..87d0815992 --- /dev/null +++ b/runtime/lib/ttnn/include/tt/runtime/ttnn/types.cpp @@ -0,0 +1,437 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "tt/runtime/ttnn/types.h" +#include "tt/runtime/detail/logger.h" +#include "tt/runtime/ttnn/utils.h" + +namespace tt::runtime::ttnn { + +// +// LayoutConverter APIs +// +LayoutConverter::LayoutConverter(const LayoutDesc &inputDesc, + const LayoutDesc &outputDesc) + : inputDesc(inputDesc), outputDesc(outputDesc) { + shouldTilize = (inputDesc.layout == ::ttnn::Layout::ROW_MAJOR and + outputDesc.layout == ::ttnn::Layout::TILE); + shouldUntilize = (inputDesc.layout == ::ttnn::Layout::TILE and + outputDesc.layout == ::ttnn::Layout::ROW_MAJOR); + shouldTypecast = (inputDesc.dataType != outputDesc.dataType); + shouldToDevice = (inputDesc.isOnHost() and outputDesc.isOnDevice()); + shouldToMemoryConfig = (not shouldToDevice and outputDesc.isOnDevice() and + (inputDesc.memoryConfig != outputDesc.memoryConfig)); + shouldFromDevice = (inputDesc.isOnDevice() and outputDesc.isOnHost()); +} + +::ttnn::Tensor LayoutConverter::convertTensorLayout( + const ::ttnn::Tensor &input, std::optional targetDevice) { + if (inputDesc.isOnHost()) { + return convertHostTensorLayout(input, targetDevice); + } + return convertDeviceTensorLayout(input); +} + +::ttnn::Tensor LayoutConverter::toLayoutIfNeeded(const ::ttnn::Tensor &input) { + if (shouldTilize) { + return ::ttnn::to_layout(input, ::ttnn::Layout::TILE, std::nullopt, + std::nullopt, + static_cast<::ttnn::Device *>(nullptr)); + } + if (shouldUntilize) { + return ::ttnn::to_layout(input, ::ttnn::Layout::ROW_MAJOR, std::nullopt, + std::nullopt, + static_cast<::ttnn::Device *>(nullptr)); + } + return input; +} + +::ttnn::Tensor LayoutConverter::typecastIfNeeded(const ::ttnn::Tensor &input) { + if (shouldTypecast) { + return ::ttnn::typecast(input, outputDesc.dataType); + } + return input; +} + +::ttnn::Tensor +LayoutConverter::toDeviceIfNeeded(const ::ttnn::Tensor &input, + std::optional targetDevice, + bool force) { + if (shouldToDevice or force) { + LOG_ASSERT(targetDevice.has_value()); + return std::visit( + [&](auto &&targetDevice) -> ::ttnn::Tensor { + return ::ttnn::to_device(input, &(targetDevice.get()), + outputDesc.memoryConfig); + }, + targetDevice.value()); + } + return input; +} + +::ttnn::Tensor +LayoutConverter::toMemoryConfigIfNeeded(const ::ttnn::Tensor &input) { + if (shouldToMemoryConfig) { + LOG_ASSERT(outputDesc.memoryConfig.has_value()); + return ::ttnn::to_memory_config(input, outputDesc.memoryConfig.value()); + } + return input; +} + +::ttnn::Tensor +LayoutConverter::fromDeviceIfNeeded(const ::ttnn::Tensor &input) { + if (shouldFromDevice) { + return ::ttnn::from_device(input); + } + 
return input; +} + +::ttnn::Tensor LayoutConverter::handleHostInputNoLayoutNoTypecast( + const ::ttnn::Tensor &input, std::optional targetDevice) { + ::ttnn::Tensor out = toDeviceIfNeeded(input, targetDevice); + out = toMemoryConfigIfNeeded(out); + return out; +} + +::ttnn::Tensor LayoutConverter::handleHostInputLayoutNoTypecast( + const ::ttnn::Tensor &input, std::optional targetDevice) { + if (shouldUntilize) { + ::ttnn::Tensor out = toLayoutIfNeeded(input); + out = toDeviceIfNeeded(out, targetDevice); + out = toMemoryConfigIfNeeded(out); + return out; + } + + if (shouldTilize and outputDesc.dataType == ::ttnn::DataType::BFLOAT16) { + ::ttnn::Tensor out = toDeviceIfNeeded(input, targetDevice); + out = toLayoutIfNeeded(out); + out = toMemoryConfigIfNeeded(out); + return out; + } + + if (shouldTilize and outputDesc.dataType != ::ttnn::DataType::BFLOAT16) { + ::ttnn::Tensor out = toLayoutIfNeeded(input); + out = toDeviceIfNeeded(out, targetDevice); + out = toMemoryConfigIfNeeded(out); + return out; + } + LOG_FATAL("Unreachable code path"); +} + +::ttnn::Tensor LayoutConverter::handleHostInputNoLayoutTypecast( + const ::ttnn::Tensor &input, std::optional targetDevice) { + if (outputDesc.layout == ::ttnn::Layout::TILE) { + ::ttnn::Tensor out = toDeviceIfNeeded(input, targetDevice); + out = typecastIfNeeded(out); + out = toMemoryConfigIfNeeded(out); + return out; + } + + if (outputDesc.layout != ::ttnn::Layout::TILE) { + ::ttnn::Tensor out = typecastIfNeeded(input); + out = toDeviceIfNeeded(out, targetDevice); + out = toMemoryConfigIfNeeded(out); + return out; + } + LOG_FATAL("Unreachable code path"); +} + +::ttnn::Tensor LayoutConverter::handleHostInputLayoutTypecast( + const ::ttnn::Tensor &input, std::optional targetDevice) { + if (shouldUntilize) { + ::ttnn::Tensor out = typecastIfNeeded(input); + out = toLayoutIfNeeded(out); + out = toDeviceIfNeeded(out, targetDevice); + out = toMemoryConfigIfNeeded(out); + return out; + } + + if (shouldTilize and inputDesc.dataType == ::ttnn::DataType::BFLOAT16) { + ::ttnn::Tensor out = toDeviceIfNeeded(input, targetDevice); + out = toLayoutIfNeeded(out); + out = typecastIfNeeded(out); + out = toMemoryConfigIfNeeded(out); + return out; + } + + if (shouldTilize and outputDesc.dataType == ::ttnn::DataType::BFLOAT16) { + ::ttnn::Tensor out = typecastIfNeeded(input); + out = toDeviceIfNeeded(out, targetDevice); + out = toLayoutIfNeeded(input); + out = toMemoryConfigIfNeeded(out); + return out; + } + + if (shouldTilize and inputDesc.dataType != ::ttnn::DataType::BFLOAT16 and + outputDesc.dataType != ::ttnn::DataType::BFLOAT16) { + ::ttnn::Tensor out = typecastIfNeeded(input); + out = toLayoutIfNeeded(out); + out = toDeviceIfNeeded(out, targetDevice); + out = toMemoryConfigIfNeeded(out); + return out; + } + + LOG_FATAL("Unreachable code path"); +} + +::ttnn::Tensor LayoutConverter::convertHostTensorLayout( + const ::ttnn::Tensor &input, std::optional targetDevice) { + bool shouldToLayout = (shouldTilize or shouldUntilize); + LOG_ASSERT(not shouldToDevice or targetDevice.has_value(), + "Target device must be provided for ToDevice"); + if (not shouldToLayout and not shouldTypecast) { + return handleHostInputNoLayoutNoTypecast(input, targetDevice); + } + if (shouldToLayout and not shouldTypecast) { + return handleHostInputLayoutNoTypecast(input, targetDevice); + } + if (not shouldToLayout and shouldTypecast) { + return handleHostInputNoLayoutTypecast(input, targetDevice); + } + if (shouldToLayout and shouldTypecast) { + return handleHostInputLayoutTypecast(input, 
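To make the constructor-derived flags concrete, here is a hypothetical host-to-device conversion showing which flags fire and which handler above ends up running; dramMemoryConfig, hostTensor and targetDevice are placeholders:

// Host ROW_MAJOR float32 tensor -> device TILE bfloat16 tensor.
LayoutDesc inputDesc(::ttnn::BufferType::SYSTEM_MEMORY, ::ttnn::Layout::ROW_MAJOR,
                     ::ttnn::DataType::FLOAT32, std::nullopt);
LayoutDesc outputDesc(::ttnn::BufferType::DRAM, ::ttnn::Layout::TILE,
                      ::ttnn::DataType::BFLOAT16, dramMemoryConfig);
LayoutConverter converter(inputDesc, outputDesc);
// shouldTilize, shouldTypecast and shouldToDevice are all true here, so
// convertTensorLayout() routes through handleHostInputLayoutTypecast().
::ttnn::Tensor onDevice = converter.convertTensorLayout(hostTensor, targetDevice);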
targetDevice); + } + LOG_FATAL("Unreachable code path"); +} + +::ttnn::Tensor LayoutConverter::handleDeviceInputNoLayoutNoTypecast( + const ::ttnn::Tensor &input) { + ::ttnn::Tensor out = toMemoryConfigIfNeeded(input); + out = fromDeviceIfNeeded(out); + return out; +} + +::ttnn::Tensor LayoutConverter::handleDeviceInputLayoutNoTypecast( + const ::ttnn::Tensor &input) { + if (shouldUntilize and shouldFromDevice) { + ::ttnn::Tensor out = fromDeviceIfNeeded(input); + out = toLayoutIfNeeded(out); + return out; + } + + if (shouldUntilize and not shouldFromDevice) { + LOG_WARNING("Currently no constraint checking for on-device untilize."); + ::ttnn::Tensor out = toLayoutIfNeeded(input); + out = toMemoryConfigIfNeeded(out); + return out; + } + + /* If we should tilize and the input data type is bfloat16, tilize on device + */ + if (shouldTilize and inputDesc.dataType == ::ttnn::DataType::BFLOAT16) { + ::ttnn::Tensor out = toLayoutIfNeeded(input); + out = toMemoryConfigIfNeeded(out); + out = fromDeviceIfNeeded(out); + return out; + } + + /* If we should tilize and the input data type is not bfloat16, tilize on + * host */ + if (shouldTilize and inputDesc.dataType != ::ttnn::DataType::BFLOAT16 and + shouldFromDevice) { + ::ttnn::Tensor out = fromDeviceIfNeeded(input); + out = toLayoutIfNeeded(out); + return out; + } + + if (shouldTilize and inputDesc.dataType != ::ttnn::DataType::BFLOAT16 and + not shouldFromDevice) { + LOG_WARNING("Currently no constraint checking for on-device tilize."); + ::ttnn::Tensor out = toLayoutIfNeeded(input); + out = toMemoryConfigIfNeeded(out); + return out; + } + + LOG_FATAL("Unreachable code path"); +} + +::ttnn::Tensor LayoutConverter::handleDeviceInputNoLayoutTypecast( + const ::ttnn::Tensor &input) { + if (inputDesc.isTilized()) { + ::ttnn::Tensor out = typecastIfNeeded(input); + out = toMemoryConfigIfNeeded(out); + out = fromDeviceIfNeeded(input); + return out; + } + + if (not inputDesc.isTilized() and shouldFromDevice) { + ::ttnn::Tensor out = fromDeviceIfNeeded(input); + out = typecastIfNeeded(out); + return out; + } + + if (not inputDesc.isTilized() and not shouldFromDevice) { + LOG_WARNING("Currently no constraint checking for on-device typecast."); + ::ttnn::Tensor out = typecastIfNeeded(input); + out = toMemoryConfigIfNeeded(out); + return out; + } + LOG_FATAL("Unreachable code path"); +} + +::ttnn::Tensor +LayoutConverter::handleDeviceInputLayoutTypecast(const ::ttnn::Tensor &input) { + if (shouldUntilize and shouldFromDevice) { + ::ttnn::Tensor out = typecastIfNeeded(input); + out = fromDeviceIfNeeded(input); + out = toLayoutIfNeeded(out); + return out; + } + + if (shouldUntilize and not shouldFromDevice) { + LOG_WARNING("Currently no constraint checking for on-device untilize."); + ::ttnn::Tensor out = typecastIfNeeded(input); + out = toLayoutIfNeeded(input); + out = toMemoryConfigIfNeeded(out); + return out; + } + + if (shouldTilize and inputDesc.dataType == ::ttnn::DataType::BFLOAT16) { + ::ttnn::Tensor out = toLayoutIfNeeded(input); + out = typecastIfNeeded(out); + out = toMemoryConfigIfNeeded(out); + out = fromDeviceIfNeeded(out); + return out; + } + + if (shouldTilize and inputDesc.dataType != ::ttnn::DataType::BFLOAT16 and + shouldFromDevice) { + ::ttnn::Tensor out = fromDeviceIfNeeded(input); + out = toLayoutIfNeeded(out); + out = typecastIfNeeded(out); + return out; + } + + if (shouldTilize and inputDesc.dataType != ::ttnn::DataType::BFLOAT16 and + not shouldFromDevice) { + LOG_WARNING("Currently no constraint checking for on-device tilize."); + 
::ttnn::Tensor out = toLayoutIfNeeded(input); + out = typecastIfNeeded(out); + out = toMemoryConfigIfNeeded(out); + return out; + } + + LOG_FATAL("Unreachable code path"); +} + +::ttnn::Tensor +LayoutConverter::convertDeviceTensorLayout(const ::ttnn::Tensor &input) { + bool shouldToLayout = (shouldTilize or shouldUntilize); + if (not shouldToLayout and not shouldTypecast) { + return handleDeviceInputNoLayoutNoTypecast(input); + } + if (shouldToLayout and not shouldTypecast) { + return handleDeviceInputLayoutNoTypecast(input); + } + if (not shouldToLayout and shouldTypecast) { + return handleDeviceInputNoLayoutTypecast(input); + } + if (shouldToLayout and shouldTypecast) { + return handleDeviceInputLayoutTypecast(input); + } + LOG_FATAL("Unreachable code path"); +} + +// +// ProgramTensorPool APIs +// +std::pair::iterator, bool> +ProgramTensorPool::try_emplace(std::uint32_t globalId, + const ::ttnn::Tensor &tensor) { + auto it = liveTensors.find(globalId); + if (it != liveTensors.end()) { + return std::make_pair(it, false); + } + LOG_ASSERT(!intermedTensors.contains(globalId)); + intermedTensors.try_emplace(globalId, tensor); + return liveTensors.try_emplace(globalId, &intermedTensors.at(globalId)); +} + +std::pair::iterator, bool> +ProgramTensorPool::insert_or_assign(std::uint32_t globalId, + const ::ttnn::Tensor &tensor) { + intermedTensors.insert_or_assign(globalId, tensor); + return liveTensors.insert_or_assign(globalId, &intermedTensors.at(globalId)); +} + +::ttnn::Tensor &ProgramTensorPool::at(std::uint32_t globalId) { + LOG_ASSERT(liveTensors.contains(globalId)); + return *liveTensors.at(globalId); +} + +const ::ttnn::Tensor &ProgramTensorPool::at(std::uint32_t globalId) const { + LOG_ASSERT(liveTensors.contains(globalId)); + return *liveTensors.at(globalId); +} + +size_t ProgramTensorPool::erase(std::uint32_t globalId) { + LOG_ASSERT(liveTensors.contains(globalId) && + intermedTensors.contains(globalId)); + intermedTensors.erase(globalId); + return liveTensors.erase(globalId); +} + +std::vector ProgramTensorPool::gatherOutputTensors() { + std::vector outputTensors; + outputTensors.reserve(programOutputs.size()); + std::transform( + programOutputs.begin(), programOutputs.end(), + std::back_inserter(outputTensors), [this](uint32_t outputGlobalId) { + return utils::createRuntimeTensorFromTTNN(this->at(outputGlobalId)); + }); + return outputTensors; +} + +// +// ProgramContext APIs +// +ProgramContext::ProgramContext( + const std::unordered_map &liveTensors, + const std::vector &programInputs, + const std::vector &programOutputs, ::ttnn::MeshDevice *parentMesh) + : tensorPool(ProgramTensorPool(liveTensors, programInputs, programOutputs)), + parentMesh(parentMesh) { + LOG_ASSERT(parentMesh, "Parent mesh cannot be null"); +} + +void ProgramContext::addSubMesh(uint32_t meshId, + std::shared_ptr<::ttnn::MeshDevice> subMesh) { + auto [it, inserted] = subMeshes.try_emplace(meshId, subMesh); + LOG_ASSERT(inserted, "Submesh already exists"); +} + +::ttnn::MeshDevice &ProgramContext::getSubMesh(uint32_t meshId) { + LOG_ASSERT(subMeshes.contains(meshId)); + return *subMeshes.at(meshId); +} + +size_t ProgramContext::subMeshSize(uint32_t meshId) const { + LOG_ASSERT(subMeshes.contains(meshId)); + return subMeshes.at(meshId)->num_devices(); +} + +::ttnn::Device &ProgramContext::getDeviceFromSubMesh(uint32_t meshId, + int physicalDeviceId) { + LOG_ASSERT(subMeshes.contains(meshId)); + auto &subMesh = *subMeshes.at(meshId); + return *subMesh.get_device(physicalDeviceId); +} + +::ttnn::Device 
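The ProgramTensorPool methods above keep two containers: intermedTensors owns tensors created by the program, while liveTensors maps every global id (user inputs as well as intermediates) to a ::ttnn::Tensor pointer. A small usage sketch, assuming a pool the executor has already constructed; the ids and tensor names are made up, with 5 an intermediate and 7 a program output:

pool.try_emplace(5, intermediateTensor);     // stored in intermedTensors, aliased by liveTensors
const ::ttnn::Tensor &t = pool.at(5);        // LOG_ASSERTs that id 5 is live
pool.erase(5);                               // drop the intermediate once its last consumer ran
pool.insert_or_assign(7, resultTensor);      // program output, also owned by the pool
std::vector<Tensor> outputs = pool.gatherOutputTensors();  // wrapped as runtime Tensors, the same
                                                           // vector-of-Tensor shape the new submit() returns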
&ProgramContext::getDeviceIndexFromSubMesh(uint32_t meshId, + int deviceIndex) { + LOG_ASSERT(subMeshes.contains(meshId)); + auto &subMesh = *subMeshes.at(meshId); + return *subMesh.get_device_index(deviceIndex); +} + +DeviceVariant ProgramContext::getTargetDevice(uint32_t meshId) { + LOG_ASSERT(subMeshes.contains(meshId)); + auto &subMesh = *subMeshes.at(meshId); + if (subMesh.num_devices() == 1) { + return std::ref(*subMesh.get_device_index(0)); + } + return std::ref(subMesh); +} + +} // namespace tt::runtime::ttnn diff --git a/runtime/lib/ttnn/include/tt/runtime/ttnn/types.h b/runtime/lib/ttnn/include/tt/runtime/ttnn/types.h index 5cd08c7ed0..a5ca800c33 100644 --- a/runtime/lib/ttnn/include/tt/runtime/ttnn/types.h +++ b/runtime/lib/ttnn/include/tt/runtime/ttnn/types.h @@ -6,18 +6,88 @@ #define TT_RUNTIME_TTNN_TYPES_H #include "tt/runtime/detail/ttnn.h" +#include "tt/runtime/types.h" +#include +#include namespace tt::runtime::ttnn { - -using TensorMap = std::unordered_map; using DeviceVariant = std::variant, std::reference_wrapper<::ttnn::MeshDevice>>; +struct LayoutDesc { + ::ttnn::BufferType bufferType; + ::ttnn::Layout layout; + ::ttnn::DataType dataType; + std::optional<::ttnn::MemoryConfig> memoryConfig; + + LayoutDesc(const ::ttnn::BufferType &bufferType, const ::ttnn::Layout &layout, + const ::ttnn::DataType &dataType, + const std::optional<::ttnn::MemoryConfig> &memoryConfig) + : bufferType(bufferType), layout(layout), dataType(dataType), + memoryConfig(memoryConfig) {} + + bool isOnHost() const { + return bufferType == ::ttnn::BufferType::SYSTEM_MEMORY; + } + bool isOnDevice() const { return !isOnHost(); } + + bool isTilized() const { return layout == ::ttnn::Layout::TILE; } +}; + +class LayoutConverter { +public: + LayoutDesc inputDesc; + LayoutDesc outputDesc; + bool shouldTilize = false; + bool shouldUntilize = false; + bool shouldTypecast = false; + bool shouldToDevice = false; + bool shouldToMemoryConfig = false; + bool shouldFromDevice = false; + + LayoutConverter(const LayoutDesc &inputDesc, const LayoutDesc &outputDesc); + ::ttnn::Tensor convertTensorLayout(const ::ttnn::Tensor &input, + std::optional targetDevice); + +private: + ::ttnn::Tensor toLayoutIfNeeded(const ::ttnn::Tensor &input); + ::ttnn::Tensor typecastIfNeeded(const ::ttnn::Tensor &input); + ::ttnn::Tensor toDeviceIfNeeded(const ::ttnn::Tensor &input, + std::optional targetDevice, + bool force = false); + ::ttnn::Tensor toMemoryConfigIfNeeded(const ::ttnn::Tensor &input); + ::ttnn::Tensor fromDeviceIfNeeded(const ::ttnn::Tensor &input); + + ::ttnn::Tensor + handleHostInputNoLayoutNoTypecast(const ::ttnn::Tensor &input, + std::optional targetDevice); + ::ttnn::Tensor + handleHostInputLayoutNoTypecast(const ::ttnn::Tensor &input, + std::optional targetDevice); + ::ttnn::Tensor + handleHostInputNoLayoutTypecast(const ::ttnn::Tensor &input, + std::optional targetDevice); + ::ttnn::Tensor + handleHostInputLayoutTypecast(const ::ttnn::Tensor &input, + std::optional targetDevice); + ::ttnn::Tensor + convertHostTensorLayout(const ::ttnn::Tensor &input, + std::optional targetDevice); + + ::ttnn::Tensor + handleDeviceInputNoLayoutNoTypecast(const ::ttnn::Tensor &input); + ::ttnn::Tensor handleDeviceInputLayoutNoTypecast(const ::ttnn::Tensor &input); + ::ttnn::Tensor handleDeviceInputNoLayoutTypecast(const ::ttnn::Tensor &input); + ::ttnn::Tensor handleDeviceInputLayoutTypecast(const ::ttnn::Tensor &input); + ::ttnn::Tensor convertDeviceTensorLayout(const ::ttnn::Tensor &input); +}; + class ProgramTensorPool { public: 
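getTargetDevice() above hands back a DeviceVariant so callers can treat a one-device submesh and a full MeshDevice uniformly; operations then std::visit the variant, exactly as toDeviceIfNeeded() does earlier in this file. A condensed sketch, where programContext, meshId, hostTensor and memoryConfig are placeholders:

DeviceVariant target = programContext.getTargetDevice(meshId);
::ttnn::Tensor onDevice = std::visit(
    [&](auto &&device) -> ::ttnn::Tensor {
      // device is a std::reference_wrapper to either a ::ttnn::Device or a ::ttnn::MeshDevice;
      // device.get() yields the underlying reference, so &(device.get()) passes a pointer.
      return ::ttnn::to_device(hostTensor, &(device.get()), memoryConfig);
    },
    target);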
- ProgramTensorPool(const TensorMap &liveTensors, - const std::unordered_set &programInputs, - const std::unordered_set &programOutputs) + ProgramTensorPool( + const std::unordered_map &liveTensors, + const std::vector &programInputs, + const std::vector &programOutputs) : programInputs(programInputs), programOutputs(programOutputs), liveTensors(liveTensors) {} ProgramTensorPool(const ProgramTensorPool &) = delete; @@ -25,72 +95,38 @@ class ProgramTensorPool { ProgramTensorPool(ProgramTensorPool &&) = default; ProgramTensorPool &operator=(ProgramTensorPool &&) = default; - auto try_emplace(std::uint32_t globalId, const ::ttnn::Tensor &tensor) { - auto it = liveTensors.find(globalId); - if (it != liveTensors.end()) { - return std::make_pair(it, false); - } - assert(!intermedTensors.contains(globalId)); - intermedTensors.try_emplace(globalId, tensor); - return liveTensors.try_emplace(globalId, &intermedTensors.at(globalId)); - } + std::pair::iterator, bool> + try_emplace(std::uint32_t globalId, const ::ttnn::Tensor &tensor); - auto insert_or_assign(std::uint32_t globalId, const ::ttnn::Tensor &tensor) { - intermedTensors.insert_or_assign(globalId, tensor); - return liveTensors.insert_or_assign(globalId, - &intermedTensors.at(globalId)); - } + std::pair::iterator, bool> + insert_or_assign(std::uint32_t globalId, const ::ttnn::Tensor &tensor); - ::ttnn::Tensor &at(std::uint32_t globalId) { - assert(liveTensors.contains(globalId)); - return *liveTensors.at(globalId); - } + ::ttnn::Tensor &at(std::uint32_t globalId); - const ::ttnn::Tensor &at(std::uint32_t globalId) const { - assert(liveTensors.contains(globalId)); - return *liveTensors.at(globalId); - } + const ::ttnn::Tensor &at(std::uint32_t globalId) const; - size_t erase(std::uint32_t globalId) { - assert(liveTensors.contains(globalId) && - intermedTensors.contains(globalId)); - intermedTensors.erase(globalId); - return liveTensors.erase(globalId); - } + size_t erase(std::uint32_t globalId); - void copyTensorToUserOutput(std::uint32_t outputGlobalId, - const ::ttnn::Tensor &srcTensor) { - assert(liveTensors.contains(outputGlobalId)); - assert(isUserOutput(outputGlobalId)); - ::ttnn::Tensor &outputTensor = *liveTensors.at(outputGlobalId); - void *src = ::tt::tt_metal::get_raw_host_data_ptr(srcTensor); - void *dst = ::tt::tt_metal::get_raw_host_data_ptr(outputTensor); - size_t size = outputTensor.volume() * outputTensor.element_size(); - std::memcpy(dst, src, size); - } + std::vector gatherOutputTensors(); bool contains(std::uint32_t globalId) const { return liveTensors.contains(globalId); } - bool isUserOutput(std::uint32_t globalId) const { - return programOutputs.contains(globalId); - } - - const std::unordered_set &getProgramInputs() const { + const std::vector &getProgramInputs() const { return programInputs; } - const std::unordered_set &getProgramOutputs() const { + const std::vector &getProgramOutputs() const { return programOutputs; } private: - std::unordered_set programInputs; - std::unordered_set programOutputs; + std::vector programInputs; + std::vector programOutputs; // A superset of intermedTensors, containing pointers to all tensors created - // by the program and the input/output tensors passed in by the user - TensorMap liveTensors; + // by the program and the input tensors passed in by the user + std::unordered_map liveTensors; // A subset of liveTensors, containing values of any intermediate tensors // created by the program @@ -99,15 +135,11 @@ class ProgramTensorPool { class ProgramContext { public: - 
ProgramContext(const TensorMap &liveTensors, - const std::unordered_set &programInputs, - const std::unordered_set &programOutputs, - ::ttnn::MeshDevice *parentMesh) - : tensorPool( - ProgramTensorPool(liveTensors, programInputs, programOutputs)), - parentMesh(parentMesh) { - assert(parentMesh && "Parent mesh cannot be null"); - } + ProgramContext( + const std::unordered_map &liveTensors, + const std::vector &programInputs, + const std::vector &programOutputs, + ::ttnn::MeshDevice *parentMesh); ProgramContext(const ProgramContext &) = delete; ProgramContext &operator=(const ProgramContext &) = delete; ProgramContext(ProgramContext &&) = default; @@ -125,42 +157,17 @@ class ProgramContext { // // Sub Mesh Operations // - void addSubMesh(uint32_t meshId, - std::shared_ptr<::ttnn::MeshDevice> subMesh) { - auto [it, inserted] = subMeshes.try_emplace(meshId, subMesh); - assert(inserted && "Submesh already exists"); - } + void addSubMesh(uint32_t meshId, std::shared_ptr<::ttnn::MeshDevice> subMesh); - ::ttnn::MeshDevice &getSubMesh(uint32_t meshId) { - assert(subMeshes.contains(meshId)); - return *subMeshes.at(meshId); - } + ::ttnn::MeshDevice &getSubMesh(uint32_t meshId); - size_t subMeshSize(uint32_t meshId) const { - assert(subMeshes.contains(meshId)); - return subMeshes.at(meshId)->num_devices(); - } + size_t subMeshSize(uint32_t meshId) const; - ::ttnn::Device &getDeviceFromSubMesh(uint32_t meshId, int physicalDeviceId) { - assert(subMeshes.contains(meshId)); - auto &subMesh = *subMeshes.at(meshId); - return *subMesh.get_device(physicalDeviceId); - } + ::ttnn::Device &getDeviceFromSubMesh(uint32_t meshId, int physicalDeviceId); - ::ttnn::Device &getDeviceIndexFromSubMesh(uint32_t meshId, int deviceIndex) { - assert(subMeshes.contains(meshId)); - auto &subMesh = *subMeshes.at(meshId); - return *subMesh.get_device_index(deviceIndex); - } + ::ttnn::Device &getDeviceIndexFromSubMesh(uint32_t meshId, int deviceIndex); - DeviceVariant getTargetDevice(uint32_t meshId) { - assert(subMeshes.contains(meshId)); - auto &subMesh = *subMeshes.at(meshId); - if (subMesh.num_devices() == 1) { - return std::ref(*subMesh.get_device_index(0)); - } - return std::ref(subMesh); - } + DeviceVariant getTargetDevice(uint32_t meshId); // // Tensor Pool Operations diff --git a/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.cpp b/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.cpp new file mode 100644 index 0000000000..fa8aa82ed2 --- /dev/null +++ b/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.cpp @@ -0,0 +1,222 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "tt/runtime/ttnn/utils.h" +#include "tt/runtime/detail/logger.h" + +namespace tt::runtime::ttnn::utils { + +// TODO (bug #701) +// Currently the memory layout/location in flatbuffer is incorrect +// These methods are workarounds for operations such that we query the info +// directly from the TTNN tensor. 
Ideally, we should be able to get all of this +// info directly from the flatbuffer +bool isOnHost(const ::ttnn::StorageType &storageType) { + return storageType == ::tt::tt_metal::StorageType::BORROWED or + storageType == ::tt::tt_metal::StorageType::OWNED or + storageType == ::tt::tt_metal::StorageType::MULTI_DEVICE_HOST; +} + +bool isOnDevice(const ::ttnn::StorageType &storageType) { + return storageType == ::tt::tt_metal::StorageType::DEVICE or + storageType == ::tt::tt_metal::StorageType::MULTI_DEVICE; +} + +bool isValidTileShape(const ::tt::target::Dim2d *shape) { + return (shape->x() == 1 and shape->y() == 1) or + (shape->x() == 32 and shape->y() == 32); +} + +::ttnn::DataType toTTNNDataType(::tt::target::DataType dataType) { + switch (dataType) { + case ::tt::target::DataType::Float32: + return ::ttnn::DataType::FLOAT32; + case ::tt::target::DataType::BFloat16: + return ::ttnn::DataType::BFLOAT16; + case ::tt::target::DataType::BFP_BFloat8: + return ::ttnn::DataType::BFLOAT8_B; + case ::tt::target::DataType::BFP_BFloat4: + return ::ttnn::DataType::BFLOAT4_B; + case ::tt::target::DataType::UInt32: + return ::ttnn::DataType::UINT32; + case ::tt::target::DataType::UInt16: + return ::ttnn::DataType::UINT16; + + default: + LOG_FATAL("Unsupported data type"); + } +} + +::tt::target::DataType fromTTNNDataType(::ttnn::DataType dataType) { + switch (dataType) { + case ::ttnn::DataType::FLOAT32: + return ::tt::target::DataType::Float32; + case ::ttnn::DataType::BFLOAT16: + return ::tt::target::DataType::BFloat16; + case ::ttnn::DataType::BFLOAT8_B: + return ::tt::target::DataType::BFP_BFloat8; + case ::ttnn::DataType::BFLOAT4_B: + return ::tt::target::DataType::BFP_BFloat4; + case ::ttnn::DataType::UINT32: + return ::tt::target::DataType::UInt32; + case ::ttnn::DataType::UINT16: + return ::tt::target::DataType::UInt16; + + default: + LOG_FATAL("Unsupported data type"); + } +} + +::ttnn::Layout toTTNNLayout(::tt::target::TensorLayout layout) { + switch (layout) { + case ::tt::target::TensorLayout::Tile: + return ::ttnn::Layout::TILE; + case ::tt::target::TensorLayout::RowMajor: + return ::ttnn::Layout::ROW_MAJOR; + default: + LOG_FATAL("Unsupported layout"); + } +} + +::ttnn::TensorMemoryLayout +toTTNNTensorMemoryLayout(::tt::target::TensorMemoryLayout tensorMemoryLayout) { + + switch (tensorMemoryLayout) { + case ::tt::target::TensorMemoryLayout::Interleaved: + return ::ttnn::TensorMemoryLayout::INTERLEAVED; + case ::tt::target::TensorMemoryLayout::SingleBank: + return ::ttnn::TensorMemoryLayout::SINGLE_BANK; + case ::tt::target::TensorMemoryLayout::HeightSharded: + return ::ttnn::TensorMemoryLayout::HEIGHT_SHARDED; + case ::tt::target::TensorMemoryLayout::WidthSharded: + return ::ttnn::TensorMemoryLayout::WIDTH_SHARDED; + case ::tt::target::TensorMemoryLayout::BlockSharded: + return ::ttnn::TensorMemoryLayout::BLOCK_SHARDED; + case ::tt::target::TensorMemoryLayout::None: + LOG_FATAL("Unsupported tensor memory layout None"); + } +} + +// This method will be deprecated in favor of method below +// +::tt::tt_metal::BufferType +toTTNNBufferType(::tt::target::MemorySpace memorySpace) { + switch (memorySpace) { + case ::tt::target::MemorySpace::System: + case ::tt::target::MemorySpace::SystemMMIO: + return ::tt::tt_metal::BufferType::SYSTEM_MEMORY; + case ::tt::target::MemorySpace::DeviceDRAM: + return ::tt::tt_metal::BufferType::DRAM; + case ::tt::target::MemorySpace::DeviceL1: + return ::tt::tt_metal::BufferType::L1; + } +} + +// Prefer to use this method +// +::ttnn::BufferType 
toTTNNBufferType(::tt::target::BufferType bufferType) { + + switch (bufferType) { + case ::tt::target::BufferType::DRAM: + return ::ttnn::BufferType::DRAM; + case ::tt::target::BufferType::L1: + return ::ttnn::BufferType::L1; + case ::tt::target::BufferType::SystemMemory: + return ::ttnn::BufferType::SYSTEM_MEMORY; + case ::tt::target::BufferType::L1Small: + return ::ttnn::BufferType::L1_SMALL; + case ::tt::target::BufferType::Trace: + return ::ttnn::BufferType::TRACE; + } +}; + +std::vector +toShapeFromFBShape(const flatbuffers::Vector &vec) { + return std::vector(vec.begin(), vec.end()); +} + +::ttnn::Layout +inferLayoutFromTileShape(const ::tt::target::TensorRef *tensorRef) { + const ::tt::target::Dim2d *tileShape = + tensorRef->desc()->layout()->memory_desc()->tile_shape(); + LOG_ASSERT(isValidTileShape(tileShape)); + if (tileShape->x() == 1 and tileShape->y() == 1) { + return ::ttnn::Layout::ROW_MAJOR; + } + return ::ttnn::Layout::TILE; +} + +CoreRangeSet +toCoreRangeSet(const ::flatbuffers::Vector + *coreRangeSet) { + std::set coreRanges; + for (::tt::target::Dim2dRange const *coreRange : *coreRangeSet) { + CoreCoord start(coreRange->loc().x(), coreRange->loc().y()); + // End is inclusive + CoreCoord end(coreRange->loc().x() + coreRange->size().x() - 1, + coreRange->loc().y() + coreRange->size().y() - 1); + + coreRanges.emplace(start, end); + } + return CoreRangeSet(coreRanges); +} + +::tt::tt_metal::MemoryConfig +createMemoryConfig(const ::tt::target::TensorRef *tensorRef) { + const ::tt::target::LayoutDesc *layout = tensorRef->desc()->layout(); + const ::tt::target::TensorMemoryLayout targetMemoryLayout = + layout->memory_desc()->memory_layout(); + const ::tt::target::MemorySpace targetMemorySpace = + layout->memory_desc()->memory_space(); + const ::flatbuffers::Vector + *targetCoreRangeSet = layout->core_range_set(); + const ::flatbuffers::Vector *targetShardShape = + layout->memory_desc()->shape(); + const ::tt::target::Dim2d *tileShape = layout->memory_desc()->tile_shape(); + + LOG_ASSERT(targetCoreRangeSet->size() == 1, + "Currently only single core range/grid is supported"); + + LOG_ASSERT(targetShardShape->size() == 2, + "Only 2D shard shape is supported in TTNN backend"); + + LOG_ASSERT(::tt::runtime::ttnn::utils::isValidTileShape(tileShape), + "Invalid tile shape"); + + CoreRangeSet ttnnCoreRangeSet = toCoreRangeSet(targetCoreRangeSet); + std::array ttnnShardShape; + std::copy(targetShardShape->begin(), targetShardShape->end(), + ttnnShardShape.begin()); + + ttnnShardShape[0] *= tileShape->y(); + ttnnShardShape[1] *= tileShape->x(); + + ::tt::tt_metal::TensorMemoryLayout ttnnMemLayout = + toTTNNTensorMemoryLayout(targetMemoryLayout); + + ::tt::tt_metal::BufferType ttnnBufferType = + toTTNNBufferType(targetMemorySpace); + + ::tt::tt_metal::ShardSpec shardSpec( + ttnnCoreRangeSet, ttnnShardShape, + ::tt::tt_metal::ShardOrientation::ROW_MAJOR, false); + + std::optional<::tt::tt_metal::ShardSpec> shardSpecOpt = + ttnnMemLayout == tt_metal::TensorMemoryLayout::INTERLEAVED + ? 
std::nullopt + : std::make_optional(shardSpec); + + ::tt::tt_metal::MemoryConfig memoryConfig{.memory_layout = ttnnMemLayout, + .buffer_type = ttnnBufferType, + .shard_spec = shardSpecOpt}; + return memoryConfig; +} + +Tensor createRuntimeTensorFromTTNN(const ::ttnn::Tensor &tensor) { + auto tensorPtr = std::make_shared<::ttnn::Tensor>(tensor); + return Tensor(std::static_pointer_cast(tensorPtr), nullptr, + DeviceRuntime::TTNN); +} + +} // namespace tt::runtime::ttnn::utils diff --git a/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.h b/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.h index 75b22d1145..353195b8df 100644 --- a/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.h +++ b/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.h @@ -6,127 +6,50 @@ #define TT_RUNTIME_TTNN_UTILS_H #include "flatbuffers/vector.h" -#include "tt_metal/impl/buffers/buffer.hpp" +#include "tt/runtime/detail/ttnn.h" #include "ttmlir/Target/Common/types_generated.h" #include "ttmlir/Target/TTNN/Target.h" -#include "ttnn/types.hpp" namespace tt::runtime::ttnn::utils { -inline bool isValidTileShape(const ::tt::target::Dim2d *shape) { - return (shape->x() == 1 and shape->y() == 1) or - (shape->x() == 32 and shape->y() == 32); -} - -inline ::ttnn::DataType toTTNNDataType(::tt::target::DataType dataType) { - switch (dataType) { - case ::tt::target::DataType::Float32: - return ::ttnn::DataType::FLOAT32; - case ::tt::target::DataType::BFloat16: - return ::ttnn::DataType::BFLOAT16; - case ::tt::target::DataType::BFP_BFloat8: - return ::ttnn::DataType::BFLOAT8_B; - case ::tt::target::DataType::BFP_BFloat4: - return ::ttnn::DataType::BFLOAT4_B; - case ::tt::target::DataType::UInt32: - return ::ttnn::DataType::UINT32; - case ::tt::target::DataType::UInt16: - return ::ttnn::DataType::UINT16; - - default: - throw std::runtime_error("Unsupported data type"); - } -} - -inline ::tt::target::DataType fromTTNNDataType(::ttnn::DataType dataType) { - switch (dataType) { - case ::ttnn::DataType::FLOAT32: - return ::tt::target::DataType::Float32; - case ::ttnn::DataType::BFLOAT16: - return ::tt::target::DataType::BFloat16; - case ::ttnn::DataType::BFLOAT8_B: - return ::tt::target::DataType::BFP_BFloat8; - case ::ttnn::DataType::BFLOAT4_B: - return ::tt::target::DataType::BFP_BFloat4; - case ::ttnn::DataType::UINT32: - return ::tt::target::DataType::UInt32; - case ::ttnn::DataType::UINT16: - return ::tt::target::DataType::UInt16; - - default: - throw std::runtime_error("Unsupported data type"); - } -} - -inline ::ttnn::Layout toTTNNLayout(::tt::target::TensorLayout layout) { - switch (layout) { - case ::tt::target::TensorLayout::Tile: - return ::ttnn::Layout::TILE; - case ::tt::target::TensorLayout::RowMajor: - return ::ttnn::Layout::ROW_MAJOR; - default: - throw std::runtime_error("Unsupported layout"); - } -} - -inline ::ttnn::TensorMemoryLayout -toTTNNTensorMemoryLayout(::tt::target::TensorMemoryLayout tensorMemoryLayout) { - - switch (tensorMemoryLayout) { - case ::tt::target::TensorMemoryLayout::Interleaved: - return ::ttnn::TensorMemoryLayout::INTERLEAVED; - case ::tt::target::TensorMemoryLayout::SingleBank: - return ::ttnn::TensorMemoryLayout::SINGLE_BANK; - case ::tt::target::TensorMemoryLayout::HeightSharded: - return ::ttnn::TensorMemoryLayout::HEIGHT_SHARDED; - case ::tt::target::TensorMemoryLayout::WidthSharded: - return ::ttnn::TensorMemoryLayout::WIDTH_SHARDED; - case ::tt::target::TensorMemoryLayout::BlockSharded: - return ::ttnn::TensorMemoryLayout::BLOCK_SHARDED; - case ::tt::target::TensorMemoryLayout::None: - 
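Stepping back to createMemoryConfig() above: the shard shape read from the flatbuffer is scaled by the tile dimensions before the ShardSpec is built, i.e. it is treated as a count of tiles per shard. A small worked example of that step; the values and the array element type are illustrative only:

// Flatbuffer shard shape of {2, 4} tiles with a 32x32 tile shape:
std::array<std::uint32_t, 2> ttnnShardShape = {2, 4};
ttnnShardShape[0] *= 32;   // tileShape->y(): 2 tiles tall -> 64 element rows per shard
ttnnShardShape[1] *= 32;   // tileShape->x(): 4 tiles wide -> 128 element columns per shard
// ShardSpec is then built from {64, 128}, the converted core range set and ROW_MAJOR orientation.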
assert(false && - "Unsupported tensor memory layout TensorMemoryLayout::None"); - } -} +bool isOnHost(const ::ttnn::StorageType &storageType); + +bool isOnDevice(const ::ttnn::StorageType &storageType); + +bool isValidTileShape(const ::tt::target::Dim2d *shape); + +::ttnn::DataType toTTNNDataType(::tt::target::DataType dataType); + +::tt::target::DataType fromTTNNDataType(::ttnn::DataType dataType); + +::ttnn::Layout toTTNNLayout(::tt::target::TensorLayout layout); + +::ttnn::TensorMemoryLayout +toTTNNTensorMemoryLayout(::tt::target::TensorMemoryLayout tensorMemoryLayout); // This method will be deprecated in favor of method below // -inline ::tt::tt_metal::BufferType -toTTNNBufferType(::tt::target::MemorySpace memorySpace) { - switch (memorySpace) { - case ::tt::target::MemorySpace::System: - case ::tt::target::MemorySpace::SystemMMIO: - return ::tt::tt_metal::BufferType::SYSTEM_MEMORY; - case ::tt::target::MemorySpace::DeviceDRAM: - return ::tt::tt_metal::BufferType::DRAM; - case ::tt::target::MemorySpace::DeviceL1: - return ::tt::tt_metal::BufferType::L1; - } -} +::tt::tt_metal::BufferType +toTTNNBufferType(::tt::target::MemorySpace memorySpace); // Prefer to use this method // -inline ::ttnn::BufferType -toTTNNBufferType(::tt::target::BufferType bufferType) { - - switch (bufferType) { - case ::tt::target::BufferType::DRAM: - return ::ttnn::BufferType::DRAM; - case ::tt::target::BufferType::L1: - return ::ttnn::BufferType::L1; - case ::tt::target::BufferType::SystemMemory: - return ::ttnn::BufferType::SYSTEM_MEMORY; - case ::tt::target::BufferType::L1Small: - return ::ttnn::BufferType::L1_SMALL; - case ::tt::target::BufferType::Trace: - return ::ttnn::BufferType::TRACE; - } -}; - -inline std::vector -toShapeFromFBShape(const flatbuffers::Vector &vec) { - return std::vector(vec.begin(), vec.end()); -} +::ttnn::BufferType toTTNNBufferType(::tt::target::BufferType bufferType); + +std::vector +toShapeFromFBShape(const flatbuffers::Vector &vec); + +::ttnn::Layout +inferLayoutFromTileShape(const ::tt::target::TensorRef *tensorRef); + +CoreRangeSet +toCoreRangeSet(const ::flatbuffers::Vector + *coreRangeSet); + +::tt::tt_metal::MemoryConfig +createMemoryConfig(const ::tt::target::TensorRef *tensorRef); + +Tensor createRuntimeTensorFromTTNN(const ::ttnn::Tensor &tensor); } // namespace tt::runtime::ttnn::utils diff --git a/runtime/lib/ttnn/operations/CMakeLists.txt b/runtime/lib/ttnn/operations/CMakeLists.txt index 38115803f0..4d18e3f1ce 100644 --- a/runtime/lib/ttnn/operations/CMakeLists.txt +++ b/runtime/lib/ttnn/operations/CMakeLists.txt @@ -46,12 +46,13 @@ target_include_directories(TTRuntimeTTNNOps PUBLIC ${PROJECT_SOURCE_DIR}/runtime/lib/ttnn/operations/include ${PROJECT_BINARY_DIR}/include/ttmlir/Target/Common ) + target_include_directories(TTRuntimeTTNNOps SYSTEM PUBLIC "$") -target_link_libraries(TTRuntimeTTNNOps PUBLIC TTNN_LIBRARY) +target_link_libraries(TTRuntimeTTNNOps PUBLIC TTNN_LIBRARY TTRuntimeTTNNHelpers) if (TT_RUNTIME_ENABLE_PERF_TRACE) target_link_libraries(TTRuntimeTTNNOps PUBLIC TRACY_LIBRARY) endif() -add_dependencies(TTRuntimeTTNNOps TTNN_LIBRARY tt-metal FBS_GENERATION) +add_dependencies(TTRuntimeTTNNOps TTNN_LIBRARY tt-metal FBS_GENERATION TTRuntimeTTNNHelpers) diff --git a/runtime/lib/ttnn/operations/ccl/all_gather.cpp b/runtime/lib/ttnn/operations/ccl/all_gather.cpp index 37bf7427bf..eee27e7bab 100644 --- a/runtime/lib/ttnn/operations/ccl/all_gather.cpp +++ b/runtime/lib/ttnn/operations/ccl/all_gather.cpp @@ -5,6 +5,7 @@ #include "all_gather.h" #include 
"tt/runtime/detail/ttnn.h" #include "tt/runtime/ttnn/operations/utils.h" +#include "tt/runtime/ttnn/utils.h" namespace tt::runtime::ttnn::operations::ccl { void run(const ::tt::target::ttnn::AllGatherOp *op, ProgramContext &context) { @@ -13,7 +14,7 @@ void run(const ::tt::target::ttnn::AllGatherOp *op, ProgramContext &context) { int32_t dim = op->dim(); int32_t num_links = op->num_links(); ::tt::tt_metal::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); ::ttnn::Tensor out = ::ttnn::all_gather(input, dim, num_links, outputMemoryConfig); tensorPool.insert_or_assign(op->out()->global_id(), out); diff --git a/runtime/lib/ttnn/operations/conv/conv2d.cpp b/runtime/lib/ttnn/operations/conv/conv2d.cpp index e6670c1131..5e00b929e7 100644 --- a/runtime/lib/ttnn/operations/conv/conv2d.cpp +++ b/runtime/lib/ttnn/operations/conv/conv2d.cpp @@ -6,6 +6,7 @@ #include "tt/runtime/detail/logger.h" #include "tt/runtime/detail/ttnn.h" #include "tt/runtime/ttnn/operations/utils.h" +#include "tt/runtime/ttnn/utils.h" #include "ttmlir/Target/TTNN/program_generated.h" #include "ttnn/types.hpp" @@ -23,7 +24,8 @@ void run(const ::tt::target::ttnn::Conv2dOp *op, ProgramContext &context) { auto config = ::ttnn::operations::conv::conv2d::Conv2dConfig(); config.dtype = utils::getDataType(op->input()); config.weights_dtype = utils::getDataType(op->weight()); - ::ttnn::MemoryConfig outMemConfig = utils::createMemoryConfig(op->out()); + ::ttnn::MemoryConfig outMemConfig = + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); DeviceVariant targetDevice = context.getTargetDevice(op->device()->global_id()); ::ttnn::Tensor out = std::visit( diff --git a/runtime/lib/ttnn/operations/creation/arange.cpp b/runtime/lib/ttnn/operations/creation/arange.cpp index 446cdf72ad..8ddb199136 100644 --- a/runtime/lib/ttnn/operations/creation/arange.cpp +++ b/runtime/lib/ttnn/operations/creation/arange.cpp @@ -41,6 +41,6 @@ void run(const ::tt::target::ttnn::ArangeOp *op, ProgramContext &context) { ::ttnn::Tensor out = ::ttnn::arange(op->start(), op->end(), op->step(), dtype, device, memoryConfig); - utils::updateTensorPool(tensorPool, out, op->out()->global_id()); + tensorPool.insert_or_assign(op->out()->global_id(), out); } } // namespace tt::runtime::ttnn::operations::creation diff --git a/runtime/lib/ttnn/operations/creation/empty.cpp b/runtime/lib/ttnn/operations/creation/empty.cpp index bed68e0f14..d504a798b2 100644 --- a/runtime/lib/ttnn/operations/creation/empty.cpp +++ b/runtime/lib/ttnn/operations/creation/empty.cpp @@ -62,11 +62,12 @@ createEmptyOnMultiDevice(ProgramContext &context, EmptyTensorConfig &config, ::tt::tt_metal::DistributedTensorConfig strategy = config.distributedTensorConfig(); std::vector<::ttnn::Tensor> tensorShards; - tensorShards.resize(config.numShards); - std::generate_n( - tensorShards.begin(), config.numShards, [&config]() -> ::ttnn::Tensor { - return ::ttnn::zeros(config.shape, config.dtype, config.layout); - }); + tensorShards.reserve(config.numShards); + std::generate_n(std::back_inserter(tensorShards), config.numShards, + [&config]() -> ::ttnn::Tensor { + return ::ttnn::zeros(config.shape, config.dtype, + config.layout); + }); ::ttnn::Tensor out = ::ttnn::distributed::api::create_multi_device_tensor( tensorShards, ::tt::tt_metal::StorageType::MULTI_DEVICE_HOST, strategy); if (deviceRef) { @@ -101,6 +102,6 @@ void run(const ::tt::target::ttnn::EmptyOp *op, ProgramContext &context) { } else { 
LOG_FATAL("Unsupported num shards"); } - utils::updateTensorPool(tensorPool, out, op->out()->global_id()); + tensorPool.insert_or_assign(op->out()->global_id(), out); } } // namespace tt::runtime::ttnn::operations::creation diff --git a/runtime/lib/ttnn/operations/creation/full.cpp b/runtime/lib/ttnn/operations/creation/full.cpp index 6a224f935d..7f6a6c0b6f 100644 --- a/runtime/lib/ttnn/operations/creation/full.cpp +++ b/runtime/lib/ttnn/operations/creation/full.cpp @@ -26,7 +26,7 @@ struct FullTensorConfig { fillValue(op->fill_value()), numShards(op->num_shards()), strategy(op->strategy()) { - layout = utils::inferLayoutFromTileShape(op->out()); + layout = ::tt::runtime::ttnn::utils::inferLayoutFromTileShape(op->out()); // TODO(bug #272), determine correct layout by tile shape in the future // currently tile shape is not set correctly, so as a workaround, hardcode @@ -42,8 +42,7 @@ struct FullTensorConfig { } if (!utils::inSystemMemory(op->out())) { - memoryConfig = - ::tt::runtime::ttnn::operations::utils::createMemoryConfig(op->out()); + memoryConfig = ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); } validate(); } @@ -72,8 +71,8 @@ createFullOnMultiDevice(ProgramContext &context, FullTensorConfig &config, ::tt::tt_metal::DistributedTensorConfig strategy = config.distributedTensorConfig(); std::vector<::ttnn::Tensor> tensorShards; - tensorShards.resize(config.numShards); - std::generate_n(tensorShards.begin(), config.numShards, + tensorShards.reserve(config.numShards); + std::generate_n(std::back_inserter(tensorShards), config.numShards, [&config]() -> ::ttnn::Tensor { return ::ttnn::full(config.shape, config.fillValue, config.dtype, config.layout); @@ -116,6 +115,6 @@ void run(const ::tt::target::ttnn::FullOp *op, ProgramContext &context) { } else { LOG_FATAL("Unsupported num shards"); } - utils::updateTensorPool(tensorPool, out, op->out()->global_id()); + tensorPool.insert_or_assign(op->out()->global_id(), out); } } // namespace tt::runtime::ttnn::operations::creation diff --git a/runtime/lib/ttnn/operations/data_movement/transpose.cpp b/runtime/lib/ttnn/operations/data_movement/transpose.cpp index ef8dcf1b13..c86c0ee10a 100644 --- a/runtime/lib/ttnn/operations/data_movement/transpose.cpp +++ b/runtime/lib/ttnn/operations/data_movement/transpose.cpp @@ -6,6 +6,7 @@ #include "tt/runtime/detail/logger.h" #include "tt/runtime/detail/ttnn.h" #include "tt/runtime/ttnn/operations/utils.h" +#include "tt/runtime/ttnn/utils.h" namespace tt::runtime::ttnn::operations::data_movement { void run(const ::tt::target::ttnn::TransposeOp *op, ProgramContext &context) { @@ -15,7 +16,7 @@ void run(const ::tt::target::ttnn::TransposeOp *op, ProgramContext &context) { int32_t dim0 = op->dim0(); int32_t dim1 = op->dim1(); ::tt::tt_metal::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); ::ttnn::Tensor out = ::ttnn::transpose(in, dim0, dim1, outputMemoryConfig); tensorPool.insert_or_assign(op->out()->global_id(), out); } diff --git a/runtime/lib/ttnn/operations/deletion/deallocate.cpp b/runtime/lib/ttnn/operations/deletion/deallocate.cpp index 6204945b34..e871a9ea64 100644 --- a/runtime/lib/ttnn/operations/deletion/deallocate.cpp +++ b/runtime/lib/ttnn/operations/deletion/deallocate.cpp @@ -11,13 +11,6 @@ void run(const ::tt::target::ttnn::DeallocateOp *op, ProgramContext &context) { ::ttnn::Tensor &tensor = tensorPool.at(op->in()->global_id()); DEBUG_ASSERT(tensor.is_allocated()); ::ttnn::deallocate(tensor, 
op->force()); - - // The tensor should be deallocated after the deallocate call. - // Still this assert may be hit in the future for multidevice/async ttnn - // support. In that case, we will reevaluate the assert/dealloc behaviour and - // adjust it accordingly. - // - DEBUG_ASSERT(!tensor.is_allocated()); tensorPool.erase(op->in()->global_id()); } } // namespace tt::runtime::ttnn::operations::deletion diff --git a/runtime/lib/ttnn/operations/eltwise/binary/binary.cpp b/runtime/lib/ttnn/operations/eltwise/binary/binary.cpp index 5913971192..ff47bdcdd8 100644 --- a/runtime/lib/ttnn/operations/eltwise/binary/binary.cpp +++ b/runtime/lib/ttnn/operations/eltwise/binary/binary.cpp @@ -6,6 +6,7 @@ #include "tt/runtime/detail/ttnn.h" #include "tt/runtime/ttnn/operations/eltwise/binary/utils.h" #include "tt/runtime/ttnn/operations/utils.h" +#include "tt/runtime/ttnn/utils.h" #include "ttnn/operations/eltwise/binary/binary_composite.hpp" namespace tt::runtime::ttnn::operations::binary { @@ -26,7 +27,7 @@ static void runEltwiseBinaryOp( ::ttnn::DataType outputDataType = utils::getDataType(op->out()); ::tt::tt_metal::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); ::ttnn::Tensor out = ttnnOp(*lhs, *rhs, outputDataType, outputMemoryConfig, std::nullopt, std::nullopt, std::nullopt); diff --git a/runtime/lib/ttnn/operations/eltwise/binary/binary_composite.cpp b/runtime/lib/ttnn/operations/eltwise/binary/binary_composite.cpp index 5c1d056f99..921b542ed2 100644 --- a/runtime/lib/ttnn/operations/eltwise/binary/binary_composite.cpp +++ b/runtime/lib/ttnn/operations/eltwise/binary/binary_composite.cpp @@ -6,6 +6,7 @@ #include "tt/runtime/detail/ttnn.h" #include "tt/runtime/ttnn/operations/eltwise/binary/utils.h" #include "tt/runtime/ttnn/operations/utils.h" +#include "tt/runtime/ttnn/utils.h" namespace tt::runtime::ttnn::operations::binary::composite { @@ -20,7 +21,7 @@ static void runEltwiseBinaryCompositeOp( getEltwiseBinaryOpInputTensors(op, tensorPool, &lhs, &rhs); ::tt::tt_metal::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); ::ttnn::Tensor out = ttnnOp(*lhs, *rhs, outputMemoryConfig); tensorPool.insert_or_assign(op->out()->global_id(), out); diff --git a/runtime/lib/ttnn/operations/eltwise/ternary/ternary.cpp b/runtime/lib/ttnn/operations/eltwise/ternary/ternary.cpp index 6afde5d663..44f1413898 100644 --- a/runtime/lib/ttnn/operations/eltwise/ternary/ternary.cpp +++ b/runtime/lib/ttnn/operations/eltwise/ternary/ternary.cpp @@ -22,7 +22,7 @@ static void runEltwiseTernaryWhereOp( getEltwiseTernaryOpInputTensors(op, tensorPool, &first, &second, &third); ::tt::tt_metal::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); ::ttnn::Tensor out = ttnnOp(*first, *second, *third, outputMemoryConfig); tensorPool.insert_or_assign(op->out()->global_id(), out); diff --git a/runtime/lib/ttnn/operations/eltwise/unary/unary.cpp b/runtime/lib/ttnn/operations/eltwise/unary/unary.cpp index 5a09b43a9f..50c53f8dbe 100644 --- a/runtime/lib/ttnn/operations/eltwise/unary/unary.cpp +++ b/runtime/lib/ttnn/operations/eltwise/unary/unary.cpp @@ -6,6 +6,7 @@ #include "tt/runtime/detail/ttnn.h" #include "tt/runtime/ttnn/operations/eltwise/unary/utils.h" #include "tt/runtime/ttnn/operations/utils.h" +#include "tt/runtime/ttnn/utils.h" #include 
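The per-op hunks in this part of the patch are all instances of the same mechanical change: the output MemoryConfig now comes from the shared ::tt::runtime::ttnn::utils helpers instead of the per-operations utils, and results are registered back into the pool. In sketch form, a typical handler after the refactor looks roughly like this; SomeOp, some_op and the getTensorPool() accessor are placeholders, not names taken from the patch:

void run(const ::tt::target::ttnn::SomeOp *op, ProgramContext &context) {
  ProgramTensorPool &tensorPool = context.getTensorPool();   // accessor name assumed
  const ::ttnn::Tensor &in = tensorPool.at(op->in()->global_id());
  ::tt::tt_metal::MemoryConfig outputMemoryConfig =
      ::tt::runtime::ttnn::utils::createMemoryConfig(op->out());
  ::ttnn::Tensor out = ::ttnn::some_op(in, outputMemoryConfig); // placeholder ttnn call
  tensorPool.insert_or_assign(op->out()->global_id(), out);
}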
"ttmlir/Target/TTNN/program_generated.h" #include "ttnn/operations/copy.hpp" @@ -22,7 +23,7 @@ static void runEltwiseUnaryOp( getEltwiseUnaryOpInputTensor(op, tensorPool, &in); ::tt::tt_metal::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); ::ttnn::Tensor out = ttnnOp(*in, outputMemoryConfig, std::nullopt); tensorPool.insert_or_assign(op->out()->global_id(), out); @@ -39,7 +40,7 @@ static void runEltwiseUnaryWithFastAndApproximateModeOp( getEltwiseUnaryOpInputTensor(op, tensorPool, &in); ::tt::tt_metal::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); ::ttnn::Tensor out = ttnnOp(*in, false /* parameter */, outputMemoryConfig, std::nullopt); @@ -56,7 +57,7 @@ static void runEltwiseUnaryWithFloatParameterOp( float parameter = op->params_as_EltwiseOpWithFloatParams()->parameter(); ::tt::tt_metal::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); ::ttnn::Tensor out = ttnnOp(*in, parameter, outputMemoryConfig); tensorPool.insert_or_assign(op->out()->global_id(), out); } diff --git a/runtime/lib/ttnn/operations/eltwise/unary/unary_composite.cpp b/runtime/lib/ttnn/operations/eltwise/unary/unary_composite.cpp index fd378d5a26..31514f0fe5 100644 --- a/runtime/lib/ttnn/operations/eltwise/unary/unary_composite.cpp +++ b/runtime/lib/ttnn/operations/eltwise/unary/unary_composite.cpp @@ -6,6 +6,7 @@ #include "tt/runtime/detail/ttnn.h" #include "tt/runtime/ttnn/operations/eltwise/unary/utils.h" #include "tt/runtime/ttnn/operations/utils.h" +#include "tt/runtime/ttnn/utils.h" #include "ttnn/operations/eltwise/unary/unary_composite.hpp" namespace tt::runtime::ttnn::operations::unary::composite { @@ -20,27 +21,26 @@ static void runEltwiseUnaryCompositeOp( getEltwiseUnaryOpInputTensor(op, tensorPool, &in); ::tt::tt_metal::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); ::ttnn::Tensor out = ttnnOp(*in, outputMemoryConfig); tensorPool.insert_or_assign(op->out()->global_id(), out); } -static void runEltwiseUnaryCompositeClampOP( +static void runEltwiseUnaryCompositeClampOp( const ::tt::target::ttnn::EltwiseOp *op, ProgramTensorPool &tensorPool, - std::function<::ttnn::Tensor(const ::ttnn::Tensor &, float, float, - const ::tt::tt_metal::MemoryConfig &)> - ttnnOp) { + const std::function<::ttnn::Tensor(const ::ttnn::Tensor &, float, float, + const ::tt::tt_metal::MemoryConfig &)> + &ttnnOp) { ::ttnn::Tensor *in = nullptr; getEltwiseUnaryOpInputTensor(op, tensorPool, &in); float min = op->params_as_ClampOpParams()->min(); float max = op->params_as_ClampOpParams()->max(); ::tt::tt_metal::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); ::ttnn::Tensor out = ttnnOp(*in, min, max, outputMemoryConfig); tensorPool.insert_or_assign(op->out()->global_id(), out); - return; } void run(const ::tt::target::ttnn::EltwiseOp *op, ProgramContext &context) { @@ -51,7 +51,7 @@ void run(const ::tt::target::ttnn::EltwiseOp *op, ProgramContext &context) { break; } case ::tt::target::ttnn::EltwiseOpType::Clamp: { - runEltwiseUnaryCompositeClampOP(op, tensorPool, ::ttnn::clamp); + runEltwiseUnaryCompositeClampOp(op, tensorPool, ::ttnn::clamp); break; } case ::tt::target::ttnn::EltwiseOpType::Log1p: { diff --git 
a/runtime/lib/ttnn/operations/embedding/embedding.cpp b/runtime/lib/ttnn/operations/embedding/embedding.cpp index 47b27ca9ac..511d8256de 100644 --- a/runtime/lib/ttnn/operations/embedding/embedding.cpp +++ b/runtime/lib/ttnn/operations/embedding/embedding.cpp @@ -6,6 +6,7 @@ #include "tt/runtime/detail/logger.h" #include "tt/runtime/detail/ttnn.h" #include "tt/runtime/ttnn/operations/utils.h" +#include "tt/runtime/ttnn/utils.h" namespace tt::runtime::ttnn::operations::embedding { void run(const ::tt::target::ttnn::EmbeddingOp *op, ProgramContext &context) { @@ -24,7 +25,7 @@ void run(const ::tt::target::ttnn::EmbeddingOp *op, ProgramContext &context) { auto embeddingsType = ::ttnn::operations::embedding::EmbeddingsType::GENERIC; ::ttnn::DataType outputDataType = utils::getDataType(op->out()); ::ttnn::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); ::ttnn::Tensor out = ::ttnn::embedding(input, weight, padToken, layout, embeddingsType, outputDataType, outputMemoryConfig); diff --git a/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.cpp b/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.cpp index c595fe26bc..60ee2ddc2b 100644 --- a/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.cpp +++ b/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.cpp @@ -7,25 +7,6 @@ namespace tt::runtime::ttnn::operations::utils { -// TODO (bug #701) -// Currently the memory layout/location in flatbuffer is incorrect -// These methods are workarounds such that we query the info directly from the -// TTNN tensor Ideally, we should be able to get all of this info directly from -// the flatbuffer -bool isOnHost(const ::ttnn::Tensor &tensor) { - // Currently only supports borrowed or owned host storage - return tensor.storage_type() == ::tt::tt_metal::StorageType::BORROWED or - tensor.storage_type() == ::tt::tt_metal::StorageType::OWNED or - tensor.storage_type() == - ::tt::tt_metal::StorageType::MULTI_DEVICE_HOST; -} - -bool isOnDevice(const ::ttnn::Tensor &tensor) { - // Currently only supports single device storage - return tensor.storage_type() == ::tt::tt_metal::StorageType::DEVICE or - tensor.storage_type() == ::tt::tt_metal::StorageType::MULTI_DEVICE; -} - bool isTilized(const ::tt::target::TensorRef *tensorRef) { const ::tt::target::Dim2d *tileShape = tensorRef->desc()->layout()->memory_desc()->tile_shape(); @@ -43,96 +24,11 @@ bool inSystemMemory(const ::tt::target::TensorRef *tensorRef) { targetMemorySpace == ::tt::target::MemorySpace::SystemMMIO; } -void updateTensorPool(ProgramTensorPool &tensorPool, - const ::ttnn::Tensor &tensor, uint32_t outputGlobalId) { - if (tensorPool.isUserOutput(outputGlobalId)) { - tensorPool.copyTensorToUserOutput(outputGlobalId, tensor); - } else { - tensorPool.insert_or_assign(outputGlobalId, tensor); - } -} - ::ttnn::DataType getDataType(const ::tt::target::TensorRef *tensorRef) { return ::tt::runtime::ttnn::utils::toTTNNDataType( tensorRef->desc()->layout()->memory_desc()->data_type()); } -::ttnn::Layout -inferLayoutFromTileShape(const ::tt::target::TensorRef *tensorRef) { - const ::tt::target::Dim2d *tileShape = - tensorRef->desc()->layout()->memory_desc()->tile_shape(); - LOG_ASSERT(::tt::runtime::ttnn::utils::isValidTileShape(tileShape)); - if (tileShape->x() == 1 and tileShape->y() == 1) { - return ::ttnn::Layout::ROW_MAJOR; - } - return ::ttnn::Layout::TILE; -} - -CoreRangeSet -toCoreRangeSet(const 
::flatbuffers::Vector - *coreRangeSet) { - std::set coreRanges; - for (::tt::target::Dim2dRange const *coreRange : *coreRangeSet) { - CoreCoord start(coreRange->loc().x(), coreRange->loc().y()); - // End is inclusive - CoreCoord end(coreRange->loc().x() + coreRange->size().x() - 1, - coreRange->loc().y() + coreRange->size().y() - 1); - - coreRanges.emplace(start, end); - } - return CoreRangeSet(coreRanges); -} - -// This method will soon be deprecated, prefer to use the method below -// -::tt::tt_metal::MemoryConfig -createMemoryConfig(const ::tt::target::TensorRef *tensorRef) { - const ::tt::target::LayoutDesc *layout = tensorRef->desc()->layout(); - const ::tt::target::TensorMemoryLayout targetMemoryLayout = - layout->memory_desc()->memory_layout(); - const ::tt::target::MemorySpace targetMemorySpace = - layout->memory_desc()->memory_space(); - const ::flatbuffers::Vector - *targetCoreRangeSet = layout->core_range_set(); - const ::flatbuffers::Vector *targetShardShape = - layout->memory_desc()->shape(); - const ::tt::target::Dim2d *tileShape = layout->memory_desc()->tile_shape(); - - LOG_ASSERT(targetCoreRangeSet->size() == 1, - "Currently only single core range/grid is supported"); - - LOG_ASSERT(targetShardShape->size() == 2, - "Only 2D shard shape is supported in TTNN backend"); - - LOG_ASSERT(::tt::runtime::ttnn::utils::isValidTileShape(tileShape), - "Invalid tile shape"); - - CoreRangeSet ttnnCoreRangeSet = toCoreRangeSet(targetCoreRangeSet); - std::array ttnnShardShape; - std::copy(targetShardShape->begin(), targetShardShape->end(), - ttnnShardShape.begin()); - - ttnnShardShape[0] *= tileShape->y(); - ttnnShardShape[1] *= tileShape->x(); - - ::tt::tt_metal::ShardSpec shardSpec( - ttnnCoreRangeSet, ttnnShardShape, - ::tt::tt_metal::ShardOrientation::ROW_MAJOR, false); - - ::tt::tt_metal::TensorMemoryLayout ttnnMemLayout = - ::tt::runtime::ttnn::utils::toTTNNTensorMemoryLayout(targetMemoryLayout); - - ::tt::tt_metal::BufferType ttnnBufferType = - ::tt::runtime::ttnn::utils::toTTNNBufferType(targetMemorySpace); - - return {ttnnMemLayout, ttnnBufferType, - ttnnMemLayout == tt_metal::TensorMemoryLayout::INTERLEAVED - ? 
std::nullopt - : std::make_optional(shardSpec)}; -} - -// Prefer to use this method over the one above -// ::tt::tt_metal::MemoryConfig createMemoryConfig(const ::tt::target::MemoryConfigDesc *memcfg, const ::tt::target::TensorRef *tensorRef) { @@ -147,7 +43,8 @@ createMemoryConfig(const ::tt::target::MemoryConfigDesc *memcfg, const ::tt::target::LayoutDesc *layout = tensorRef->desc()->layout(); const ::flatbuffers::Vector *targetCoreRangeSet = layout->core_range_set(); - CoreRangeSet ttnnCoreRangeSet = toCoreRangeSet(targetCoreRangeSet); + CoreRangeSet ttnnCoreRangeSet = + ::tt::runtime::ttnn::utils::toCoreRangeSet(targetCoreRangeSet); const ::flatbuffers::Vector *shardShape = memcfg->shard_spec()->shard_shape(); const ::tt::target::Dim2d *tileShape = layout->memory_desc()->tile_shape(); diff --git a/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.h b/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.h index b922e120a6..269e0328f9 100644 --- a/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.h +++ b/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.h @@ -13,32 +13,15 @@ namespace tt::runtime::ttnn::operations::utils { -bool isOnHost(const ::ttnn::Tensor &tensor); - -bool isOnDevice(const ::ttnn::Tensor &tensor); - bool isTilized(const ::tt::target::TensorRef *tensorRef); bool inSystemMemory(const ::tt::target::TensorRef *tensorRef); -void updateTensorPool(ProgramTensorPool &tensorPool, - const ::ttnn::Tensor &tensor, uint32_t outputGlobalId); - ::tt::target::MemorySpace getMemorySpace(const ::tt::target::TensorRef *tensorRef); ::ttnn::DataType getDataType(const ::tt::target::TensorRef *tensorRef); -::ttnn::Layout -inferLayoutFromTileShape(const ::tt::target::TensorRef *tensorRef); - -CoreRangeSet -toCoreRangeSet(const ::flatbuffers::Vector - *coreRangeSet); - -::tt::tt_metal::MemoryConfig -createMemoryConfig(const ::tt::target::TensorRef *tensorRef); - ::tt::tt_metal::MemoryConfig createMemoryConfig(const ::tt::target::MemoryConfigDesc *memcfg, const ::tt::target::TensorRef *tensorRef); diff --git a/runtime/lib/ttnn/operations/layout/from_device.cpp b/runtime/lib/ttnn/operations/layout/from_device.cpp index b6820b6ec7..e26e3be2a3 100644 --- a/runtime/lib/ttnn/operations/layout/from_device.cpp +++ b/runtime/lib/ttnn/operations/layout/from_device.cpp @@ -12,10 +12,11 @@ void run(const ::tt::target::ttnn::FromDeviceOp *op, ProgramContext &context) { ProgramTensorPool &tensorPool = context.getTensorPool(); const ::ttnn::Tensor &inputTensor = tensorPool.at(op->in()->global_id()); DEBUG_ASSERT(inputTensor.is_allocated()); - LOG_ASSERT(utils::isOnDevice(inputTensor), - "Calling ttnn::from_device on a host tensor"); + DEBUG_ASSERT( + ::tt::runtime::ttnn::utils::isOnDevice(inputTensor.storage_type()), + "Calling ttnn::from_device on a host tensor"); ::ttnn::Tensor out = ::ttnn::from_device(inputTensor); - utils::updateTensorPool(tensorPool, out, op->out()->global_id()); + tensorPool.insert_or_assign(op->out()->global_id(), out); } } // namespace tt::runtime::ttnn::operations::layout diff --git a/runtime/lib/ttnn/operations/layout/to_device.cpp b/runtime/lib/ttnn/operations/layout/to_device.cpp index 34af89f504..414afc9f05 100644 --- a/runtime/lib/ttnn/operations/layout/to_device.cpp +++ b/runtime/lib/ttnn/operations/layout/to_device.cpp @@ -14,7 +14,7 @@ void run(const ::tt::target::ttnn::ToDeviceOp *op, ProgramContext &context) { ProgramTensorPool &tensorPool = context.getTensorPool(); const ::ttnn::Tensor &inputTensor 
= tensorPool.at(op->in()->global_id()); DEBUG_ASSERT(inputTensor.is_allocated()); - DEBUG_ASSERT(utils::isOnHost(inputTensor), + DEBUG_ASSERT(::tt::runtime::ttnn::utils::isOnHost(inputTensor.storage_type()), "Calling ttnn::to_device on a device tensor"); std::optional<::ttnn::MemoryConfig> memoryConfig = std::nullopt; diff --git a/runtime/lib/ttnn/operations/layout/to_layout.cpp b/runtime/lib/ttnn/operations/layout/to_layout.cpp index 5e78a67187..bf80ef292e 100644 --- a/runtime/lib/ttnn/operations/layout/to_layout.cpp +++ b/runtime/lib/ttnn/operations/layout/to_layout.cpp @@ -57,7 +57,7 @@ void run(const ::tt::target::ttnn::ToLayoutOp *op, ProgramContext &context) { out = ::ttnn::to_layout(inputTensor, layout, dtype, memoryConfig, static_cast<::ttnn::Device *>(nullptr)); } - utils::updateTensorPool(tensorPool, out, op->out()->global_id()); + tensorPool.insert_or_assign(op->out()->global_id(), out); } } // namespace tt::runtime::ttnn::operations::layout diff --git a/runtime/lib/ttnn/operations/layout/typecast.cpp b/runtime/lib/ttnn/operations/layout/typecast.cpp index 5529c6112c..e59a64a401 100644 --- a/runtime/lib/ttnn/operations/layout/typecast.cpp +++ b/runtime/lib/ttnn/operations/layout/typecast.cpp @@ -17,7 +17,7 @@ void run(const ::tt::target::ttnn::TypecastOp *op, ProgramContext &context) { ::tt::runtime::ttnn::utils::toTTNNDataType(op->dtype()); ::ttnn::Tensor out = ::ttnn::typecast(inputTensor, targetDataType); - utils::updateTensorPool(tensorPool, out, op->out()->global_id()); + tensorPool.insert_or_assign(op->out()->global_id(), out); } } // namespace tt::runtime::ttnn::operations::layout diff --git a/runtime/lib/ttnn/operations/matmul/matmul.cpp b/runtime/lib/ttnn/operations/matmul/matmul.cpp index a25102d9af..896797d59c 100644 --- a/runtime/lib/ttnn/operations/matmul/matmul.cpp +++ b/runtime/lib/ttnn/operations/matmul/matmul.cpp @@ -6,6 +6,7 @@ #include "tt/runtime/detail/logger.h" #include "tt/runtime/detail/ttnn.h" #include "tt/runtime/ttnn/operations/utils.h" +#include "tt/runtime/ttnn/utils.h" #include namespace tt::runtime::ttnn::operations::matmul { @@ -18,7 +19,7 @@ void run(const ::tt::target::ttnn::MatmulOp *op, ProgramContext &context) { DEBUG_ASSERT(rhs.is_allocated()); ::ttnn::DataType outputDataType = utils::getDataType(op->out()); ::tt::tt_metal::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); const std::optional memoryConfig = std::make_optional(outputMemoryConfig); @@ -49,7 +50,7 @@ void run(const ::tt::target::ttnn::LinearOp *op, ProgramContext &context) { ::ttnn::DataType outputDataType = utils::getDataType(op->out()); ::tt::tt_metal::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); const std::optional memoryConfig = std::make_optional(outputMemoryConfig); diff --git a/runtime/lib/ttnn/operations/normalization/softmax.cpp b/runtime/lib/ttnn/operations/normalization/softmax.cpp index a83358567c..432f920956 100644 --- a/runtime/lib/ttnn/operations/normalization/softmax.cpp +++ b/runtime/lib/ttnn/operations/normalization/softmax.cpp @@ -6,6 +6,7 @@ #include "tt/runtime/detail/logger.h" #include "tt/runtime/detail/ttnn.h" #include "tt/runtime/ttnn/operations/utils.h" +#include "tt/runtime/ttnn/utils.h" namespace tt::runtime::ttnn::operations::normalization { void run(const ::tt::target::ttnn::SoftmaxOp *op, ProgramContext &context) { @@ -14,7 +15,7 @@ void run(const 
::tt::target::ttnn::SoftmaxOp *op, ProgramContext &context) { DEBUG_ASSERT(in.is_allocated()); int32_t dimension = op->dimension(); ::tt::tt_metal::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); ::ttnn::Tensor out = ::ttnn::softmax(in, dimension, outputMemoryConfig); tensorPool.insert_or_assign(op->out()->global_id(), out); } diff --git a/runtime/lib/ttnn/operations/pool/maxpool2d.cpp b/runtime/lib/ttnn/operations/pool/maxpool2d.cpp index 4fc6fca87f..c405a86f1c 100644 --- a/runtime/lib/ttnn/operations/pool/maxpool2d.cpp +++ b/runtime/lib/ttnn/operations/pool/maxpool2d.cpp @@ -61,7 +61,8 @@ void run(const ::tt::target::ttnn::MaxPool2dOp *op, ProgramContext &context) { }, targetDevice); } - ::ttnn::MemoryConfig outMemConfig = utils::createMemoryConfig(op->out()); + ::ttnn::MemoryConfig outMemConfig = + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); ::ttnn::Tensor out = operation.invoke( 0, input, op->batch_size(), op->input_height(), op->input_width(), op->channels(), {op->kernel_height(), op->kernel_width()}, diff --git a/runtime/lib/ttnn/operations/reduction/reduction.cpp b/runtime/lib/ttnn/operations/reduction/reduction.cpp index 3af46efc9c..a74373ee9f 100644 --- a/runtime/lib/ttnn/operations/reduction/reduction.cpp +++ b/runtime/lib/ttnn/operations/reduction/reduction.cpp @@ -6,6 +6,7 @@ #include "tt/runtime/detail/logger.h" #include "tt/runtime/detail/ttnn.h" #include "tt/runtime/ttnn/operations/utils.h" +#include "tt/runtime/ttnn/utils.h" namespace tt::runtime::ttnn::operations::reduction { static void runReductionOp( @@ -17,7 +18,7 @@ static void runReductionOp( const std::optional<::ttnn::DeviceComputeKernelConfig> &, float)> &ttnnOp) { ::tt::tt_metal::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); const ::ttnn::Tensor &in = tensorPool.at(op->in()->global_id()); DEBUG_ASSERT(in.is_allocated()); diff --git a/runtime/lib/ttnn/program.cpp b/runtime/lib/ttnn/program.cpp index 3aab3a94cd..a45c2de9a0 100644 --- a/runtime/lib/ttnn/program.cpp +++ b/runtime/lib/ttnn/program.cpp @@ -30,6 +30,7 @@ #include "tt/runtime/detail/debug.h" #include "tt/runtime/detail/logger.h" #include "tt/runtime/ttnn/types.h" +#include "tt/runtime/ttnn/utils.h" #include "tt/runtime/utils.h" #include "ttmlir/Target/TTNN/program_generated.h" @@ -49,36 +50,25 @@ void tracyLogOpLocation(const ::tt::target::ttnn::Operation *op) { static ::tt::target::ttnn::TTNNBinary const *getBinary(Flatbuffer binary) { bool isTTNN = ::tt::target::ttnn::SizePrefixedTTNNBinaryBufferHasIdentifier( binary.handle.get()); - if (not isTTNN) { - throw std::runtime_error("Unsupported binary format"); - } + LOG_ASSERT(isTTNN, "Unsupported binary format"); return ::tt::target::ttnn::GetSizePrefixedTTNNBinary(binary.handle.get()); } class ProgramExecutor { public: - ProgramExecutor(Binary &executableHandle, const TensorMap &liveTensors, - const std::unordered_set &programInputs, - const std::unordered_set &programOutputs, - ::ttnn::MeshDevice *meshDevice) + ProgramExecutor( + const Binary &executableHandle, + const std::unordered_map &liveTensors, + const std::vector &programInputs, + const std::vector &programOutputs, + ::ttnn::MeshDevice *meshDevice) : executableHandle(executableHandle), context(ProgramContext(liveTensors, programInputs, programOutputs, meshDevice)) {} void runCallback(Binary &executableHandle, const ::tt::target::ttnn::Operation *opContext, 
- ProgramContext *programContext) { - if (auto callback = debug::Hooks::get().getOperatorCallback(); callback) { - std::shared_ptr programContextPtr = - ::tt::runtime::utils::unsafe_borrow_shared(programContext); - std::shared_ptr opContextPtr = - ::tt::runtime::utils::unsafe_borrow_shared( - const_cast<::tt::target::ttnn::Operation *>(opContext)); - (*callback)(executableHandle, - CallbackContext(programContextPtr, DeviceRuntime::TTNN), - OpContext(opContextPtr, DeviceRuntime::TTNN)); - } - } + ProgramContext *programContext); void execute(const ::tt::target::ttnn::Program *program) { for (const ::tt::target::ttnn::Operation *op : *program->operations()) { @@ -91,6 +81,9 @@ class ProgramExecutor { } ProgramContext &getContext() { return context; } + std::vector gatherOutputTensors() { + return context.getTensorPool().gatherOutputTensors(); + } private: Binary executableHandle; @@ -99,6 +92,21 @@ class ProgramExecutor { void runEltwiseOperation(const ::tt::target::ttnn::EltwiseOp *op); }; +void ProgramExecutor::runCallback( + Binary &executableHandle, const ::tt::target::ttnn::Operation *opContext, + ProgramContext *programContext) { + if (auto callback = debug::Hooks::get().getOperatorCallback(); callback) { + std::shared_ptr programContextPtr = + ::tt::runtime::utils::unsafe_borrow_shared(programContext); + std::shared_ptr opContextPtr = + ::tt::runtime::utils::unsafe_borrow_shared( + const_cast<::tt::target::ttnn::Operation *>(opContext)); + (*callback)(executableHandle, + CallbackContext(programContextPtr, DeviceRuntime::TTNN), + OpContext(opContextPtr, DeviceRuntime::TTNN)); + } +} + void ProgramExecutor::runEltwiseOperation( const ::tt::target::ttnn::EltwiseOp *op) { auto runUnaryOp = [&]() { @@ -211,6 +219,26 @@ void ProgramExecutor::runOperation(const ::tt::target::ttnn::Operation *op) { } // Nop is single input, output tensor where input is returned as output. 
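Taken together, the executor changes above and the runProgram rework below implement a simple contract: caller tensors are bound to the program's input global_ids, every operation is executed, and the outputs are gathered from the tensor pool and returned instead of being copied into preallocated buffers; single-tensor nop programs (the case described in the comment above) short-circuit this path. A condensed, illustrative sketch of that contract, with error checks and the nop shortcut omitted (the helper name runProgramSketch and the spelled-out template arguments are only for illustration; the actual code follows):

// Illustrative summary of the new execution path; see the real runProgram below.
std::vector<Tensor> runProgramSketch(::ttnn::MeshDevice &meshDevice,
                                     Binary executableHandle,
                                     std::uint32_t programIndex,
                                     std::vector<::ttnn::Tensor *> const &inputs) {
  const ::tt::target::ttnn::Program *program =
      getBinary(executableHandle)->programs()->Get(programIndex);
  std::unordered_map<uint32_t, ::ttnn::Tensor *> liveTensors;
  std::vector<uint32_t> programInputs;
  std::vector<uint32_t> programOutputs;
  // Bind each caller-provided tensor to the matching program input id.
  for (uint32_t i = 0; i < inputs.size(); ++i) {
    liveTensors.try_emplace(program->inputs()->Get(i)->global_id(), inputs[i]);
    programInputs.push_back(program->inputs()->Get(i)->global_id());
  }
  // Outputs are only recorded by id; the tensors are produced during execution.
  for (const ::tt::target::TensorRef *output : *program->outputs()) {
    programOutputs.push_back(output->global_id());
  }
  ProgramExecutor executor(executableHandle, liveTensors, programInputs,
                           programOutputs, &meshDevice);
  executor.execute(program);
  // Freshly gathered outputs go back to the caller; nothing is written into
  // caller-owned output buffers on this path.
  return executor.gatherOutputTensors();
}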
+static bool isNopProgram(const ::tt::target::ttnn::Program *program) { + return program->inputs()->size() == 1 && program->outputs()->size() == 1 && + program->inputs()->Get(0)->global_id() == + program->outputs()->Get(0)->global_id(); +} + +static ::ttnn::Tensor +handleNopProgram(::tt::target::ttnn::Program const *program, + std::vector<::ttnn::Tensor *> const &inputs) { + const ::ttnn::Tensor &input = *inputs[0]; + ::ttnn::Tensor output = + ::ttnn::zeros(input.get_shape(), input.get_dtype(), input.get_layout()); + const void *src = ::tt::tt_metal::get_raw_host_data_ptr(input); + void *dst = ::tt::tt_metal::get_raw_host_data_ptr(output); + std::memcpy(dst, src, input.volume() * input.element_size()); + return output; +} + +namespace legacy { + static bool handleNopProgram(::tt::target::ttnn::Program const *program, std::vector<::ttnn::Tensor *> const &inputs, std::vector<::ttnn::Tensor *> const &outputs) { @@ -239,8 +267,8 @@ void runProgram(::ttnn::MeshDevice &meshDevice, Binary &executableHandle, if (handleNopProgram(program, inputs, outputs)) { return; } - TensorMap liveTensors; - std::unordered_set programInputs; + std::unordered_map liveTensors; + std::vector programInputs; int inputIndex = 0; LOG_ASSERT(program->inputs()->size() == inputs.size(), "Program input size mismatch: ", program->inputs()->size(), @@ -249,21 +277,69 @@ void runProgram(::ttnn::MeshDevice &meshDevice, Binary &executableHandle, auto [iter, inserted] = liveTensors.try_emplace(input->global_id(), inputs[inputIndex++]); LOG_ASSERT(inserted, "Duplicate input tensor"); - programInputs.emplace(input->global_id()); + programInputs.push_back(input->global_id()); } int outputIndex = 0; - std::unordered_set programOutputs; + std::vector programOutputs; LOG_ASSERT(program->outputs()->size() == outputs.size()); for (::tt::target::TensorRef const *output : *program->outputs()) { auto [iter, inserted] = liveTensors.try_emplace(output->global_id(), outputs[outputIndex++]); LOG_ASSERT(inserted, "Duplicate output tensor"); - programOutputs.emplace(output->global_id()); + programOutputs.push_back(output->global_id()); + } + ProgramExecutor executor(executableHandle, liveTensors, programInputs, + programOutputs, &meshDevice); + executor.execute(program); + outputIndex = 0; + for (uint32_t outputId : programOutputs) { + const ::ttnn::Tensor &src = + executor.getContext().getTensorPool().at(outputId); + const ::ttnn::Tensor &dst = *(outputs[outputIndex++]); + size_t srcSize = src.volume() * src.element_size(); + size_t dstSize = dst.volume() * dst.element_size(); + LOG_ASSERT(srcSize == dstSize, "Output tensor size mismatch"); + const void *srcPtr = ::tt::tt_metal::get_raw_host_data_ptr(src); + void *dstPtr = ::tt::tt_metal::get_raw_host_data_ptr(dst); + std::memcpy(dstPtr, srcPtr, dstSize); + } +} +} // namespace legacy + +std::vector runProgram(::ttnn::MeshDevice &meshDevice, + Binary executableHandle, + std::uint32_t programIndex, + std::vector<::ttnn::Tensor *> const &inputs) { + ::tt::target::ttnn::TTNNBinary const &fbb = *getBinary(executableHandle); + ::tt::target::ttnn::Program const *program = + fbb.programs()->Get(programIndex); + if (isNopProgram(program)) { + Tensor out = + utils::createRuntimeTensorFromTTNN(handleNopProgram(program, inputs)); + return {out}; + } + std::unordered_map liveTensors; + std::vector programInputs; + int inputIndex = 0; + LOG_ASSERT(program->inputs()->size() == inputs.size(), + "Program input size mismatch: ", program->inputs()->size(), + " != ", inputs.size()); + for (::tt::target::TensorRef 
const *input : *program->inputs()) { + auto [iter, inserted] = + liveTensors.try_emplace(input->global_id(), inputs[inputIndex++]); + LOG_ASSERT(inserted, "Duplicate input tensor"); + programInputs.push_back(input->global_id()); + } + std::vector programOutputs; + for (::tt::target::TensorRef const *output : *program->outputs()) { + programOutputs.push_back(output->global_id()); } ProgramExecutor executor(executableHandle, liveTensors, programInputs, programOutputs, &meshDevice); executor.execute(program); + std::vector outputTensors = executor.gatherOutputTensors(); + return outputTensors; } } // namespace tt::runtime::ttnn diff --git a/runtime/lib/ttnn/runtime.cpp b/runtime/lib/ttnn/runtime.cpp index 2dfc077884..466bf318bc 100644 --- a/runtime/lib/ttnn/runtime.cpp +++ b/runtime/lib/ttnn/runtime.cpp @@ -1,7 +1,6 @@ // SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC // // SPDX-License-Identifier: Apache-2.0 -#include "tt/runtime/runtime.h" #include "tt/runtime/detail/debug.h" #include "tt/runtime/detail/logger.h" #include "tt/runtime/detail/ttnn.h" @@ -21,16 +20,28 @@ using ::tt::tt_metal::DistributedTensorConfig; using ::tt::tt_metal::OwnedStorage; using ::tt::tt_metal::raise_unsupported_storage; +template +static OwnedStorage createOwnedStorage(ElementType *ptr, + std::uint32_t numElements) { + ::tt::tt_metal::owned_buffer::Buffer buffer; + if (ptr != nullptr) { + auto data = std::vector(ptr, ptr + numElements); + buffer = ::tt::tt_metal::owned_buffer::create(std::move(data)); + } else { + buffer = ::tt::tt_metal::owned_buffer::create(numElements); + } + return OwnedStorage(std::move(buffer)); +} + template static StorageType createStorage(ElementType *ptr, std::uint32_t numElements) { if constexpr (std::is_same_v) { + LOG_ASSERT(ptr != nullptr, "Cannot create borrowed storage from nullptr"); return BorrowedStorage( ::tt::tt_metal::borrowed_buffer::Buffer(ptr, numElements), [] {}, [] {}); } else if constexpr (std::is_same_v) { - auto data = std::vector(ptr, ptr + numElements); - auto buffer = ::tt::tt_metal::owned_buffer::create(std::move(data)); - return OwnedStorage(std::move(buffer)); + return createOwnedStorage(ptr, numElements); } else { raise_unsupported_storage(); } @@ -76,6 +87,21 @@ static Tensor createNullTensor() { return Tensor(nullptr, nullptr, DeviceRuntime::TTNN); } +static DeviceVariant getTargetDevice(::ttnn::MeshDevice &meshDevice) { + if (meshDevice.num_devices() == 1) { + return std::ref(*(meshDevice.get_device_index(0))); + } + return std::ref(meshDevice); +} + +static ::tt::target::ttnn::TTNNBinary const *getBinary(Flatbuffer binary) { + bool isTTNN = ::tt::target::ttnn::SizePrefixedTTNNBinaryBufferHasIdentifier( + binary.handle.get()); + LOG_ASSERT(isTTNN, "Unsupported binary format"); + return ::tt::target::ttnn::GetSizePrefixedTTNNBinary(binary.handle.get()); +} + +// Create a borrowed tensor from user-owned data Tensor createTensor(std::shared_ptr data, std::vector const &shape, std::vector const &stride, @@ -89,10 +115,11 @@ Tensor createTensor(std::shared_ptr data, createStorage(data.get(), numElements, dataType), ::ttnn::Shape(small_vector_shape), utils::toTTNNDataType(dataType), ::ttnn::Layout::ROW_MAJOR); - return Tensor(std::static_pointer_cast(tensor), data, + return Tensor(std::static_pointer_cast(tensor), nullptr, DeviceRuntime::TTNN); } +// Create a owned multi-device host tensor from user-owned data Tensor createTensor(std::vector> &data, std::vector const &shape, @@ -100,8 +127,8 @@ createTensor(std::vector> &data, ::tt::target::DataType 
dataType, std::unordered_map const &strategy) { std::vector<::ttnn::Tensor> tensorShards; - tensorShards.resize(data.size()); - std::transform(data.begin(), data.end(), tensorShards.begin(), + tensorShards.reserve(data.size()); + std::transform(data.begin(), data.end(), std::back_inserter(tensorShards), [&](std::shared_ptr &dataShard) -> ::ttnn::Tensor { return createOwnedTensor(dataShard, shape, stride, itemsize, dataType); @@ -112,13 +139,35 @@ createTensor(std::vector> &data, ::ttnn::distributed::api::create_multi_device_tensor( tensorShards, ::tt::tt_metal::StorageType::MULTI_DEVICE_HOST, distributionStrategy)); - std::shared_ptr>> borrowedData = - std::make_shared>>(data); - return Tensor(std::static_pointer_cast(tensor), - std::static_pointer_cast(borrowedData), + return Tensor(std::static_pointer_cast(tensor), nullptr, DeviceRuntime::TTNN); } +// Create an owned empty tensor on host/device +Tensor createTensor(Device device, Layout layout, + std::vector const &shape, + std::vector const &stride, + std::uint32_t itemsize) { + const LayoutDesc &layoutDesc = layout.as(DeviceRuntime::TTNN); + if (layoutDesc.isOnHost()) { + ::ttnn::Tensor tensor = + createOwnedTensor(nullptr, shape, stride, itemsize, + utils::fromTTNNDataType(layoutDesc.dataType)); + Tensor out = utils::createRuntimeTensorFromTTNN(tensor); + return toLayout(out, device, layout); + } + DeviceVariant targetDevice = + getTargetDevice(device.as<::ttnn::MeshDevice>(DeviceRuntime::TTNN)); + ::ttnn::Tensor tensor = std::visit( + [&](auto &&device) -> ::ttnn::Tensor { + return ::ttnn::operations::core::allocate_tensor_on_device( + ::ttnn::Shape(shape), layoutDesc.dataType, layoutDesc.layout, + &(device.get()), layoutDesc.memoryConfig); + }, + targetDevice); + return utils::createRuntimeTensorFromTTNN(tensor); +} + tt::target::DataType getTensorDataType(Tensor tensor) { const ::ttnn::Tensor &nnTensor = tensor.as<::ttnn::Tensor>(DeviceRuntime::TTNN); @@ -166,34 +215,110 @@ void deallocateBuffers(Device deviceHandle) { } } -Event submit(Device deviceHandle, Binary executableHandle, - std::uint32_t programIndex, - std::vector const &inputHandles, - std::vector const &outputHandles) { - ::ttnn::MeshDevice &meshDevice = - deviceHandle.as<::ttnn::MeshDevice>(DeviceRuntime::TTNN); - std::vector<::ttnn::Tensor *> inputs; - inputs.reserve(inputHandles.size()); - for (auto &input : inputHandles) { - LOG_ASSERT(input.matchesRuntime(DeviceRuntime::TTNN)); - inputs.push_back(static_cast<::ttnn::Tensor *>(input.handle.get())); +void wait(Event event) { + // Nothing to do for ttnn runtime + LOG_ASSERT(event.matchesRuntime(DeviceRuntime::TTNN)); +} + +void wait(Tensor tensor) { + LOG_ASSERT(tensor.matchesRuntime(DeviceRuntime::TTNN), + "Expected ttnn tensor"); + ::tt::runtime::ttnn::wait(tensor.event); +} + +void wait(std::vector const &tensors) { + for (const Tensor &tensor : tensors) { + ::tt::runtime::ttnn::wait(tensor); } +} - std::vector<::ttnn::Tensor *> outputs; - outputs.reserve(outputHandles.size()); - for (auto &output : outputHandles) { - LOG_ASSERT(output.matchesRuntime(DeviceRuntime::TTNN)); - outputs.push_back(static_cast<::ttnn::Tensor *>(output.handle.get())); +Tensor toHost(Tensor tensor, bool untilize) { + const ::ttnn::Tensor &deviceTensor = + tensor.as<::ttnn::Tensor>(DeviceRuntime::TTNN); + std::shared_ptr<::ttnn::Tensor> hostTensor = + std::make_shared<::ttnn::Tensor>(::ttnn::from_device(deviceTensor)); + + if (untilize) { + hostTensor = std::make_shared<::ttnn::Tensor>(::ttnn::to_layout( + *hostTensor, 
::ttnn::Layout::ROW_MAJOR, std::nullopt, std::nullopt, + static_cast<::ttnn::Device *>(nullptr))); } - tt::runtime::ttnn::runProgram(meshDevice, executableHandle, programIndex, - inputs, outputs); - return Event(nullptr, DeviceRuntime::TTNN); + return Tensor(std::static_pointer_cast(hostTensor), nullptr, + DeviceRuntime::TTNN); } -void wait(Event event) { - // Not implemented - LOG_ASSERT(event.matchesRuntime(DeviceRuntime::TTNN)); +Tensor toLayout(Tensor tensor, Device device, Layout layout) { + const ::ttnn::Tensor &ttnnTensor = + tensor.as<::ttnn::Tensor>(DeviceRuntime::TTNN); + const ::ttnn::Layout &inputLayout = ttnnTensor.get_layout(); + const ::ttnn::DataType &inputDataType = ttnnTensor.get_dtype(); + LayoutDesc inputLayoutDesc(::ttnn::BufferType::SYSTEM_MEMORY, inputLayout, + inputDataType, std::nullopt); + + const LayoutDesc &outputLayoutDesc = + layout.as(DeviceRuntime::TTNN); + + ::ttnn::MeshDevice &meshDevice = + device.as<::ttnn::MeshDevice>(DeviceRuntime::TTNN); + DeviceVariant targetDevice = getTargetDevice(meshDevice); + LayoutConverter converter(inputLayoutDesc, outputLayoutDesc); + std::shared_ptr<::ttnn::Tensor> out = std::make_shared<::ttnn::Tensor>( + converter.convertTensorLayout(ttnnTensor, targetDevice)); + + return Tensor(std::static_pointer_cast(out), nullptr, + DeviceRuntime::TTNN); +} + +Layout getLayout(Binary executableHandle, std::uint32_t programIndex, + std::uint32_t inputIndex) { + const ::tt::target::ttnn::TTNNBinary &fbb = *getBinary(executableHandle); + LOG_ASSERT(programIndex < fbb.programs()->size(), "Invalid program index"); + const ::tt::target::ttnn::Program *program = + fbb.programs()->Get(programIndex); + LOG_ASSERT(inputIndex < program->inputs()->size(), "Invalid input index"); + const ::tt::target::TensorRef *input = program->inputs()->Get(inputIndex); + + ::ttnn::BufferType inputBufferType = utils::toTTNNBufferType( + input->desc()->layout()->memory_desc()->memory_space()); + ::ttnn::Layout inputLayout = utils::inferLayoutFromTileShape(input); + ::ttnn::DataType inputDataType = utils::toTTNNDataType( + input->desc()->layout()->memory_desc()->data_type()); + std::optional<::ttnn::MemoryConfig> inputMemoryConfig = std::nullopt; + if (inputBufferType != ::ttnn::BufferType::SYSTEM_MEMORY) { + inputMemoryConfig = utils::createMemoryConfig(input); + } + + std::shared_ptr layoutDesc = std::make_shared( + inputBufferType, inputLayout, inputDataType, inputMemoryConfig); + + return Layout(std::static_pointer_cast(layoutDesc), + DeviceRuntime::TTNN); +} + +void memcpy(Tensor dst, Tensor src) { + ::ttnn::Tensor &dstTensor = dst.as<::ttnn::Tensor>(DeviceRuntime::TTNN); + const ::ttnn::Tensor &srcTensor = src.as<::ttnn::Tensor>(DeviceRuntime::TTNN); + LOG_ASSERT(srcTensor.volume() * srcTensor.element_size() == + dstTensor.volume() * dstTensor.element_size(), + "Input output tensor size mismatch in memcpy: ", + srcTensor.volume(), " * ", srcTensor.element_size(), + " != ", dstTensor.volume(), " * ", dstTensor.element_size()); + + if (utils::isOnHost(srcTensor.storage_type()) and + utils::isOnHost(dstTensor.storage_type())) { + void *dstPtr = ::tt::tt_metal::get_raw_host_data_ptr(dstTensor); + void *srcPtr = ::tt::tt_metal::get_raw_host_data_ptr(srcTensor); + size_t size = srcTensor.volume() * srcTensor.element_size(); + std::memcpy(dstPtr, srcPtr, size); + } else { + ::tt::tt_metal::memcpy(dstTensor, srcTensor); + } +} + +void deallocateTensor(Tensor &tensor, bool force) { + ::ttnn::Tensor &ttnnTensor = tensor.as<::ttnn::Tensor>(DeviceRuntime::TTNN); + 
::ttnn::deallocate(ttnnTensor, force); } std::string getOpDebugString(OpContext opContextHandle) { @@ -305,7 +430,7 @@ Tensor getOpOutputTensor(OpContext opContextHandle, return createNullTensor(); } default: { - throw std::runtime_error("Unsupported operation type"); + LOG_FATAL("Unsupported operation type"); } } @@ -332,12 +457,13 @@ Tensor getOpOutputTensor(OpContext opContextHandle, outCopy.shape().value, ::ttnn::DataType::FLOAT32, ::ttnn::Layout::ROW_MAJOR); - return Tensor(std::static_pointer_cast(tensor), data, + return Tensor(std::static_pointer_cast(tensor), nullptr, DeviceRuntime::TTNN); } std::vector getTensorData(Tensor tensor) { - ::ttnn::Tensor *nnTensor = static_cast<::ttnn::Tensor *>(tensor.handle.get()); + const ::ttnn::Tensor *nnTensor = + static_cast<::ttnn::Tensor *>(tensor.handle.get()); if (nnTensor == nullptr) { return {}; } @@ -347,4 +473,62 @@ std::vector getTensorData(Tensor tensor) { static_cast(dataPtr) + nnTensor->volume()); } +namespace legacy { + +Event submit(Device deviceHandle, Binary executableHandle, + std::uint32_t programIndex, + std::vector const &inputHandles, + std::vector const &outputHandles) { + ::ttnn::MeshDevice &meshDevice = + deviceHandle.as<::ttnn::MeshDevice>(DeviceRuntime::TTNN); + std::vector<::ttnn::Tensor *> inputs; + inputs.reserve(inputHandles.size()); + for (auto &input : inputHandles) { + LOG_ASSERT(input.matchesRuntime(DeviceRuntime::TTNN)); + inputs.push_back(static_cast<::ttnn::Tensor *>(input.handle.get())); + } + + std::vector<::ttnn::Tensor *> outputs; + outputs.reserve(outputHandles.size()); + for (auto &output : outputHandles) { + LOG_ASSERT(output.matchesRuntime(DeviceRuntime::TTNN)); + outputs.push_back(static_cast<::ttnn::Tensor *>(output.handle.get())); + } + + tt::runtime::ttnn::legacy::runProgram(meshDevice, executableHandle, + programIndex, inputs, outputs); + return Event(nullptr, DeviceRuntime::TTNN); +} +} // namespace legacy + +std::vector submit(Device deviceHandle, Binary executableHandle, + std::uint32_t programIndex, + std::vector const &inputHandles) { + ::ttnn::MeshDevice &meshDevice = + deviceHandle.as<::ttnn::MeshDevice>(DeviceRuntime::TTNN); + + // Convert input tensors to the layout expected by the program + std::vector inputsWithLayout; + inputsWithLayout.reserve(inputHandles.size()); + std::transform( + inputHandles.begin(), inputHandles.end(), + std::back_inserter(inputsWithLayout), [&](const Tensor &input) -> Tensor { + Layout inputLayout = ::tt::runtime::ttnn::getLayout( + executableHandle, programIndex, inputsWithLayout.size()); + return ::tt::runtime::ttnn::toLayout(input, deviceHandle, inputLayout); + }); + + std::vector<::ttnn::Tensor *> ttnnInputs; + ttnnInputs.reserve(inputsWithLayout.size()); + std::transform(inputsWithLayout.begin(), inputsWithLayout.end(), + std::back_inserter(ttnnInputs), + [](Tensor &input) -> ::ttnn::Tensor * { + return &input.as<::ttnn::Tensor>(DeviceRuntime::TTNN); + }); + + std::vector outputs = ::tt::runtime::ttnn::runProgram( + meshDevice, executableHandle, programIndex, ttnnInputs); + return outputs; +} + } // namespace tt::runtime::ttnn diff --git a/runtime/test/CMakeLists.txt b/runtime/test/CMakeLists.txt index e4a7adc406..f55a6c7615 100644 --- a/runtime/test/CMakeLists.txt +++ b/runtime/test/CMakeLists.txt @@ -1,7 +1,31 @@ +if (NOT TTMLIR_ENABLE_RUNTIME_TESTS) + add_library(TTRuntimeTTNNTestHelpers INTERFACE) + return() +endif() + if (NOT TTMLIR_ENABLE_RUNTIME OR (NOT TT_RUNTIME_ENABLE_TTNN AND NOT TT_RUNTIME_ENABLE_TTMETAL)) message(FATAL_ERROR "Runtime tests 
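On the public API side, the pieces above compose into a host-to-device round trip: query the layout a program expects for an input, convert the caller's host tensor to it, submit, and copy the result back out. A minimal usage sketch, assuming program 0 with a single input and a single output (the function name runOnce and its argument names are invented for the example; the calls themselves mirror the API shown in this change):

// Illustrative only: drives the output-returning submit path end to end.
void runOnce(::tt::runtime::Device device, ::tt::runtime::Binary fbb,
             ::tt::runtime::Tensor hostInput, ::tt::runtime::Tensor hostOutput) {
  // Convert the input to the layout program 0 expects for input 0.
  ::tt::runtime::Layout inputLayout = ::tt::runtime::getLayout(fbb, 0, 0);
  ::tt::runtime::Tensor deviceInput =
      ::tt::runtime::toLayout(hostInput, device, inputLayout);

  // The new overload returns output tensors rather than filling
  // caller-preallocated ones.
  std::vector<::tt::runtime::Tensor> outputs =
      ::tt::runtime::submit(device, fbb, /*programIndex=*/0, {deviceInput});
  ::tt::runtime::wait(outputs);

  // Untilize on host, then copy the data into the caller's output tensor.
  ::tt::runtime::Tensor host =
      ::tt::runtime::toHost(outputs[0], /*untilize=*/true);
  ::tt::runtime::deallocateTensor(outputs[0], /*force=*/true);
  ::tt::runtime::memcpy(hostOutput, host);
  ::tt::runtime::deallocateTensor(host, /*force=*/true);
}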
require -DTTMLIR_ENABLE_RUNTIME=ON and at least one backend runtime to be enabled") endif() +if (NOT TT_RUNTIME_ENABLE_TTNN) + add_library(TTRuntimeTTNNTestHelpers INTERFACE) +else() + add_library(TTRuntimeTTNNTestHelpers + STATIC + ${CMAKE_CURRENT_SOURCE_DIR}/include/tt/runtime/ttnn/test/utils.cpp + ) + set_property(TARGET TTRuntimeTTNNTestHelpers PROPERTY CXX_STANDARD 20) + target_compile_options(TTRuntimeTTNNTestHelpers PUBLIC -mavx -mavx2 -fsized-deallocation) + target_include_directories(TTRuntimeTTNNTestHelpers PUBLIC + ${PROJECT_SOURCE_DIR}/runtime/include + ${PROJECT_SOURCE_DIR}/runtime/lib/ttnn/include + ${PROJECT_BINARY_DIR}/include/ttmlir/Target/Common + ) + target_include_directories(TTRuntimeTTNNTestHelpers SYSTEM PUBLIC "$") + add_dependencies(TTRuntimeTTNNTestHelpers TTRuntime tt-metal FBS_GENERATION) + target_link_libraries(TTRuntimeTTNNTestHelpers PUBLIC TTRuntime TTNN_LIBRARY) +endif() + enable_testing() include(FetchContent) FetchContent_Declare( diff --git a/runtime/test/include/tt/runtime/ttnn/test/utils.cpp b/runtime/test/include/tt/runtime/ttnn/test/utils.cpp new file mode 100644 index 0000000000..e0cc969b7c --- /dev/null +++ b/runtime/test/include/tt/runtime/ttnn/test/utils.cpp @@ -0,0 +1,50 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "tt/runtime/test/utils.h" +#include "tt/runtime/detail/logger.h" +#include "tt/runtime/runtime.h" +#include "tt/runtime/ttnn/types.h" +#include "tt/runtime/ttnn/utils.h" +#include "tt/runtime/types.h" + +namespace tt::runtime::ttnn::test { +using ::tt::runtime::DeviceRuntime; +Layout getDramInterleavedTileLayout(::tt::target::DataType dataType) { + LOG_ASSERT(getCurrentRuntime() == DeviceRuntime::TTNN); + ::ttnn::DataType ttnnDataType = + ::tt::runtime::ttnn::utils::toTTNNDataType(dataType); + ::tt::runtime::ttnn::LayoutDesc layoutDesc(::ttnn::BufferType::DRAM, + ::ttnn::Layout::TILE, ttnnDataType, + std::nullopt); + return Layout( + std::static_pointer_cast( + std::make_shared<::tt::runtime::ttnn::LayoutDesc>(layoutDesc)), + ::tt::runtime::DeviceRuntime::TTNN); +} +Layout getDramInterleavedRowMajorLayout(::tt::target::DataType dataType) { + LOG_ASSERT(getCurrentRuntime() == DeviceRuntime::TTNN); + ::ttnn::DataType ttnnDataType = + ::tt::runtime::ttnn::utils::toTTNNDataType(dataType); + ::tt::runtime::ttnn::LayoutDesc layoutDesc(::ttnn::BufferType::DRAM, + ::ttnn::Layout::ROW_MAJOR, + ttnnDataType, std::nullopt); + return Layout( + std::static_pointer_cast( + std::make_shared<::tt::runtime::ttnn::LayoutDesc>(layoutDesc)), + ::tt::runtime::DeviceRuntime::TTNN); +} +::tt::runtime::Layout getHostRowMajorLayout(::tt::target::DataType dataType) { + LOG_ASSERT(getCurrentRuntime() == DeviceRuntime::TTNN); + ::ttnn::DataType ttnnDataType = + ::tt::runtime::ttnn::utils::toTTNNDataType(dataType); + ::tt::runtime::ttnn::LayoutDesc layoutDesc(::ttnn::BufferType::SYSTEM_MEMORY, + ::ttnn::Layout::ROW_MAJOR, + ttnnDataType, std::nullopt); + return Layout( + std::static_pointer_cast( + std::make_shared<::tt::runtime::ttnn::LayoutDesc>(layoutDesc)), + ::tt::runtime::DeviceRuntime::TTNN); +} +} // namespace tt::runtime::ttnn::test diff --git a/runtime/test/python/ttnn/conftest.py b/runtime/test/python/ttnn/conftest.py new file mode 100644 index 0000000000..854cb42a39 --- /dev/null +++ b/runtime/test/python/ttnn/conftest.py @@ -0,0 +1,25 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +try: + import ttrt +except (ImportError, 
ModuleNotFoundError): + raise ImportError( + "Error: runtime python tests require ttrt to built and installed. Please run `cmake --build build -- ttrt`" + ) +import ttrt.runtime +from ttrt.common.api import API +from utils import Helper +import pytest + + +@pytest.fixture(autouse=True, scope="module") +def initialize(): + API.initialize_apis() + ttrt.runtime.set_current_runtime(ttrt.runtime.DeviceRuntime.TTNN) + + +@pytest.fixture(scope="module") +def helper(): + helper = Helper() + yield helper diff --git a/runtime/test/python/ttnn/test_runtime_api.py b/runtime/test/python/ttnn/test_runtime_api.py new file mode 100644 index 0000000000..fe914d0c9a --- /dev/null +++ b/runtime/test/python/ttnn/test_runtime_api.py @@ -0,0 +1,160 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +import pytest +import ttrt +import ttrt.runtime +import torch +from ttrt.common.util import * +from utils import TT_MLIR_HOME, Helper, DeviceContext, assert_pcc + + +@pytest.mark.parametrize("shape", [(64, 128)]) +@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16]) +def test_to_layout(helper: Helper, shape, dtype, request): + helper.initialize(request.node.name) + helper.check_constraints() + torch_input_tensor = torch.randn(shape, dtype=dtype) + torch_result_tensor = torch.zeros(shape, dtype=dtype) + runtime_dtype = Binary.Program.to_data_type(dtype) + runtime_input_tensor = ttrt.runtime.create_tensor( + torch_input_tensor.data_ptr(), + list(torch_input_tensor.shape), + list(torch_input_tensor.stride()), + torch_input_tensor.element_size(), + runtime_dtype, + ) + runtime_output_tensor = ttrt.runtime.create_tensor( + torch_result_tensor.data_ptr(), + list(torch_result_tensor.shape), + list(torch_result_tensor.stride()), + torch_result_tensor.element_size(), + runtime_dtype, + ) + device_layout = ttrt.runtime.testing.get_dram_interleaved_tile_layout(runtime_dtype) + host_layout = ttrt.runtime.testing.get_host_row_major_layout(runtime_dtype) + with DeviceContext([helper.query.device_ids[0]]) as device: + device_tensor = ttrt.runtime.to_layout( + runtime_input_tensor, device, device_layout + ) + host_tensor = ttrt.runtime.to_layout(device_tensor, device, host_layout) + ttrt.runtime.deallocate_tensor(device_tensor, force=True) + ttrt.runtime.memcpy(runtime_output_tensor, host_tensor) + ttrt.runtime.deallocate_tensor(host_tensor, force=True) + + lambda: assert_pcc(torch_input_tensor, torch_result_tensor, threshold=0.999) + helper.teardown() + + +@pytest.mark.parametrize("shape", [(64, 128)]) +@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16]) +def test_create_tensor_memcpy(helper: Helper, shape, dtype, request): + helper.initialize(request.node.name) + helper.check_constraints() + torch_input_tensor = torch.randn(shape, dtype=dtype) + torch_result_tensor = torch.zeros(shape, dtype=dtype) + runtime_dtype = Binary.Program.to_data_type(dtype) + runtime_input_tensor = ttrt.runtime.create_tensor( + torch_input_tensor.data_ptr(), + list(torch_input_tensor.shape), + list(torch_input_tensor.stride()), + torch_input_tensor.element_size(), + runtime_dtype, + ) + runtime_output_tensor = ttrt.runtime.create_tensor( + torch_result_tensor.data_ptr(), + list(torch_result_tensor.shape), + list(torch_result_tensor.stride()), + torch_result_tensor.element_size(), + runtime_dtype, + ) + device_layout = ttrt.runtime.testing.get_dram_interleaved_row_major_layout( + runtime_dtype + ) + with DeviceContext([helper.query.device_ids[0]]) as device: + device_tensor = 
ttrt.runtime.create_empty_tensor( + device, + device_layout, + list(torch_input_tensor.shape), + list(torch_input_tensor.stride()), + torch_input_tensor.element_size(), + ) + ttrt.runtime.memcpy(device_tensor, runtime_input_tensor) + host_tensor = ttrt.runtime.to_host(device_tensor, untilize=True) + ttrt.runtime.deallocate_tensor(device_tensor, force=True) + ttrt.runtime.memcpy(runtime_output_tensor, host_tensor) + ttrt.runtime.deallocate_tensor(host_tensor, force=True) + lambda: assert_pcc(torch_input_tensor, torch_result_tensor, threshold=0.999) + helper.teardown() + + +def test_runtime_stitching_eltwise_binary_op_chain(helper: Helper, request): + binary_path = f"{TT_MLIR_HOME}/build/test/ttmlir/Runtime/TTNN/runtime_stitching/Output/eltwise_binary_op_chain.mlir.tmp.ttnn" + helper.initialize(request.node.name, binary_path) + helper.check_constraints() + first_program: Binary.Program = helper.binary.get_program(0) + assert first_program.num_inputs() == 2 + inputs_torch = [] + inputs_runtime = [] + input_layouts = [] + for i in first_program.program["inputs"]: + torch_tensor = torch.randn( + i["desc"]["shape"], + dtype=Binary.Program.from_data_type( + i["desc"]["layout"]["memory_desc"]["data_type"] + ), + ) + runtime_dtype = Binary.Program.to_data_type(torch_tensor.dtype) + inputs_torch.append(torch_tensor) + runtime_tensor = ttrt.runtime.create_tensor( + torch_tensor.data_ptr(), + list(torch_tensor.shape), + list(torch_tensor.stride()), + torch_tensor.element_size(), + runtime_dtype, + ) + inputs_runtime.append(runtime_tensor) + input_layouts.append( + ttrt.runtime.testing.get_dram_interleaved_row_major_layout(runtime_dtype) + ) + + activations, weights = inputs_runtime + activations_layout, weights_layout = input_layouts + with DeviceContext([helper.query.device_ids[0]]) as device: + activations = ttrt.runtime.to_layout(activations, device, activations_layout) + weights = ttrt.runtime.to_layout(weights, device, weights_layout) + program_indices = list(range(helper.binary.get_num_programs())) + for program_index in program_indices: + program = helper.binary.get_program(program_index) + assert program.num_inputs() == 2 and program.num_outputs() == 1 + outputs = ttrt.runtime.submit( + device, helper.binary.fbb, program_index, [activations, weights] + ) + activations = ttrt.runtime.to_layout(outputs[0], device, activations_layout) + ttrt.runtime.deallocate_tensor(outputs[0], force=True) + activations = ttrt.runtime.to_host(activations, untilize=True) + ttrt.runtime.deallocate_tensor(weights, force=True) + + last_program: Binary.Program = helper.binary.get_program(program_indices[-1]) + torch_result_tensor = torch.randn( + last_program.program["outputs"][0]["desc"]["shape"], + dtype=Binary.Program.from_data_type( + last_program.program["outputs"][0]["desc"]["layout"]["memory_desc"][ + "data_type" + ] + ), + ) + runtime_result_tensor = ttrt.runtime.create_tensor( + torch_result_tensor.data_ptr(), + list(torch_result_tensor.shape), + list(torch_result_tensor.stride()), + torch_result_tensor.element_size(), + Binary.Program.to_data_type(torch_result_tensor.dtype), + ) + ttrt.runtime.memcpy(runtime_result_tensor, activations) + golden = ( + (inputs_torch[0] + inputs_torch[1]).mul(inputs_torch[1]).sub(inputs_torch[1]) + ) + assert_pcc(golden, torch_result_tensor, threshold=0.999), program_index + helper.teardown() diff --git a/runtime/test/python/ttnn/utils.py b/runtime/test/python/ttnn/utils.py new file mode 100644 index 0000000000..6596811fff --- /dev/null +++ b/runtime/test/python/ttnn/utils.py 
@@ -0,0 +1,66 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +import os +import ttrt +import ttrt.runtime +import torch +from ttrt.common.query import Query +from ttrt.common.util import * + +TT_MLIR_HOME = os.environ.get("TT_MLIR_HOME", "") + + +class Helper: + def __init__(self, logger=None): + self.artifacts_dir = f"{os.getcwd()}/ttrt-artifacts" + self.logger = logger if logger is not None else Logger() + self.logging = self.logger.get_logger() + self.file_manager = FileManager(self.logger) + self.artifacts = Artifacts( + self.logger, self.file_manager, artifacts_folder_path=self.artifacts_dir + ) + self.query = Query({"--quiet": True}, self.logger, self.artifacts) + self.query() + self.test_name = None + self.binary_path = None + self.binary = None + + def initialize(self, test_name, binary_path=None): + self.test_name = test_name + if binary_path: + self.binary_path = binary_path + self.binary = Binary(self.logger, self.file_manager, binary_path) + + def teardown(self): + self.test_name = None + self.binary_path = None + self.binary = None + + def check_constraints(self): + if not self.binary: + return + self.binary.check_version() + self.binary.check_system_desc(self.query) + + +class DeviceContext: + def __init__(self, device_ids): + self.device = ttrt.runtime.open_device(device_ids) + + def __enter__(self): + return self.device + + def __exit__(self, exc_type, exc_value, traceback): + ttrt.runtime.close_device(self.device) + + +def assert_tensors_match(tensor1, tensor2): + assert torch.allclose(tensor1, tensor2) + + +def assert_pcc(x, y, threshold=0.99): + combined = torch.stack([x.flatten(), y.flatten()]) + pcc = torch.corrcoef(combined)[0, 1].item() + assert pcc >= threshold, f"Expected pcc {pcc} >= {threshold}" diff --git a/runtime/test/ttnn/test_subtract.cpp b/runtime/test/ttnn/test_subtract.cpp index 00aebe20fb..995b95665c 100644 --- a/runtime/test/ttnn/test_subtract.cpp +++ b/runtime/test/ttnn/test_subtract.cpp @@ -21,12 +21,13 @@ TEST(TTNNSubtract, Equal) { const char *fbPath = std::getenv("TTMLIR_SUBTRACT_FB_PATH"); assert(fbPath && "Path to subtract flatbuffer must be provided"); ::tt::runtime::Binary fbb = ::tt::runtime::Binary::loadFromPath(fbPath); - EXPECT_EQ(fbb.getFileIdentifier(), "TTNN"); + ASSERT_EQ(fbb.getFileIdentifier(), "TTNN"); ::tt::runtime::setCompatibleRuntime(fbb); std::vector<::tt::runtime::TensorDesc> inputDescs = fbb.getProgramInputs(0); + assert(inputDescs.size() == 2); std::vector<::tt::runtime::TensorDesc> outputDescs = fbb.getProgramOutputs(0); - std::vector<::tt::runtime::Tensor> inputTensors, outputTensors; - + assert(outputDescs.size() == 1); + std::vector<::tt::runtime::Tensor> inputTensors; std::uint32_t tensorSize = inputDescs[0].itemsize; for (const int dim : inputDescs[0].shape) { tensorSize *= dim; @@ -38,26 +39,27 @@ TEST(TTNNSubtract, Equal) { std::memset(data.get(), 1, tensorSize); inputTensors.emplace_back(::tt::runtime::createTensor(data, desc)); } - for (const auto &desc : outputDescs) { - std::shared_ptr data = - ::tt::runtime::utils::malloc_shared(tensorSize); - // Set to wrong value on purpose here - std::memset(data.get(), 1, tensorSize); - outputTensors.emplace_back(::tt::runtime::createTensor(data, desc)); - } + + std::shared_ptr outputDataPtr = + ::tt::runtime::utils::malloc_shared(tensorSize); + // Set to wrong value on purpose here + std::memset(outputDataPtr.get(), 1, tensorSize); + ::tt::runtime::Tensor outputTensor = + ::tt::runtime::createTensor(outputDataPtr, 
outputDescs[0]); size_t numDevices = ::tt::runtime::getNumAvailableDevices(); std::vector deviceIds(numDevices); std::iota(deviceIds.begin(), deviceIds.end(), 0); - auto device = ::tt::runtime::openDevice(deviceIds); - auto ev = ::tt::runtime::submit(device, fbb, 0, inputTensors, outputTensors); + auto device = ::tt::runtime::openDevice({deviceIds[0]}); + std::vector<::tt::runtime::Tensor> output = + ::tt::runtime::submit(device, fbb, 0, inputTensors); ::tt::runtime::closeDevice(device); - + assert(output.size() == 1); std::shared_ptr expected = ::tt::runtime::utils::malloc_shared(tensorSize); std::memset(expected.get(), 0, tensorSize); - for (const auto &outputTensor : outputTensors) { - EXPECT_EQ(std::memcmp(outputTensor.data.get(), expected.get(), tensorSize), - 0); - } + ::tt::runtime::Tensor submitOutput = output[0]; + ASSERT_NE(std::memcmp(outputDataPtr.get(), expected.get(), tensorSize), 0); + ::tt::runtime::memcpy(outputTensor, submitOutput); + ASSERT_EQ(std::memcmp(outputDataPtr.get(), expected.get(), tensorSize), 0); } diff --git a/runtime/tools/python/CMakeLists.txt b/runtime/tools/python/CMakeLists.txt index 353ebbe7df..966ee9681a 100644 --- a/runtime/tools/python/CMakeLists.txt +++ b/runtime/tools/python/CMakeLists.txt @@ -9,6 +9,7 @@ add_custom_target(ttrt COMMAND TTMLIR_ENABLE_RUNTIME=${TTMLIR_ENABLE_RUNTIME} TT_RUNTIME_ENABLE_TTNN=${TT_RUNTIME_ENABLE_TTNN} TT_RUNTIME_ENABLE_TTMETAL=${TT_RUNTIME_ENABLE_TTMETAL} + TTMLIR_ENABLE_RUNTIME_TESTS=${TTMLIR_ENABLE_RUNTIME_TESTS} TT_RUNTIME_ENABLE_PERF_TRACE=${TT_RUNTIME_ENABLE_PERF_TRACE} TT_RUNTIME_DEBUG=${TT_RUNTIME_DEBUG} TT_RUNTIME_WORKAROUNDS=${TT_RUNTIME_WORKAROUNDS} diff --git a/runtime/tools/python/setup.py b/runtime/tools/python/setup.py index f5d148578b..e227835029 100644 --- a/runtime/tools/python/setup.py +++ b/runtime/tools/python/setup.py @@ -31,6 +31,7 @@ enable_runtime = os.environ.get("TTMLIR_ENABLE_RUNTIME", "OFF") == "ON" enable_ttnn = os.environ.get("TT_RUNTIME_ENABLE_TTNN", "OFF") == "ON" enable_ttmetal = os.environ.get("TT_RUNTIME_ENABLE_TTMETAL", "OFF") == "ON" +enable_runtime_tests = os.environ.get("TTMLIR_ENABLE_RUNTIME_TESTS", "OFF") == "ON" enable_perf = os.environ.get("TT_RUNTIME_ENABLE_PERF_TRACE", "OFF") == "ON" debug_runtime = os.environ.get("TT_RUNTIME_DEBUG", "OFF") == "ON" configure_workarounds_runtime = os.environ.get("TT_RUNTIME_WORKAROUNDS", "OFF") == "ON" @@ -64,7 +65,15 @@ linklibs = ["TTBinary"] if enable_ttnn: runlibs += ["_ttnn.so"] - linklibs += ["TTRuntimeTTNN", "TTRuntimeTTNNOps", ":_ttnn.so"] + linklibs += [ + "TTRuntimeTTNN", + "TTRuntimeTTNNOps", + "TTRuntimeTTNNHelpers", + ":_ttnn.so", + ] + +if enable_ttnn and enable_runtime_tests: + linklibs += ["TTRuntimeTTNNTestHelpers"] if enable_ttmetal: runlibs += ["libtt_metal.so"] @@ -237,6 +246,7 @@ def package_files(directory): f"{ttmlir_build_dir}/runtime/lib/ttnn", f"{ttmlir_build_dir}/runtime/lib/ttnn/operations", f"{ttmlir_build_dir}/runtime/lib/ttmetal", + f"{ttmlir_build_dir}/runtime/test", f"{toolchain}/lib", f"{ttmlir_build_dir}/runtime/tools/python/ttrt/runtime", f"{metaldir}/lib", @@ -248,6 +258,7 @@ def package_files(directory): "TT_RUNTIME_WORKAROUNDS", "1" if configure_workarounds_runtime else "0", ), + ("TTMLIR_ENABLE_RUNTIME_TESTS", "1" if enable_runtime_tests else "0"), ], ) ) diff --git a/runtime/tools/python/ttrt/common/run.py b/runtime/tools/python/ttrt/common/run.py index c2ae10ac9a..be9711587f 100644 --- a/runtime/tools/python/ttrt/common/run.py +++ b/runtime/tools/python/ttrt/common/run.py @@ -380,6 +380,7 @@ def 
_execute(binaries): self.logging.debug(f"setting torch manual seed={self['--seed']}") torch.manual_seed(self["--seed"]) ttrt.runtime.set_compatible_runtime(binaries[0].fbb) + current_runtime = ttrt.runtime.get_current_runtime() self.logging.debug(f"opening devices={self.query.device_ids}") device = ttrt.runtime.open_device(self.query.device_ids) @@ -459,20 +460,43 @@ def _execute(binaries): self.logging.debug( f"starting loop={loop+1}/{self['--loops']} for binary={bin.file_path}" ) + if ( + current_runtime + == ttrt.runtime.DeviceRuntime.TTMetal + ): + event = ttrt.runtime.submit( + device, + bin.fbb, + program_index, + total_inputs[loop], + total_outputs[loop], + ) - event = ttrt.runtime.submit( - device, - bin.fbb, - program_index, - total_inputs[loop], - total_outputs[loop], - ) + elif current_runtime == ttrt.runtime.DeviceRuntime.TTNN: + runtime_outputs = ttrt.runtime.submit( + device, + bin.fbb, + program_index, + total_inputs[loop], + ) + ttrt.runtime.wait(runtime_outputs) + for i, runtime_output_tensor in enumerate( + runtime_outputs + ): + ttrt.runtime.memcpy( + total_outputs[loop][i], + runtime_output_tensor, + ) + ttrt.runtime.deallocate_tensor( + runtime_output_tensor, force=True + ) self.logging.debug( f"finished loop={loop+1}/{self['--loops']} for binary={bin.file_path}" ) - ttrt.runtime.wait(event) + if event is not None: + ttrt.runtime.wait(event) if self["--identity"]: self.logging.debug( diff --git a/runtime/tools/python/ttrt/common/util.py b/runtime/tools/python/ttrt/common/util.py index 370643e7d3..45e0a9db95 100644 --- a/runtime/tools/python/ttrt/common/util.py +++ b/runtime/tools/python/ttrt/common/util.py @@ -586,6 +586,12 @@ def __init__(self, index, program): self.input_tensors = [] self.output_tensors = [] + def num_inputs(self): + return len(self.program["inputs"]) + + def num_outputs(self): + return len(self.program["outputs"]) + def populate_inputs(self, init_fn, golden_inputs=[]): if len(golden_inputs) > 0: assert len(golden_inputs) == len(self.program["inputs"]) diff --git a/runtime/tools/python/ttrt/runtime/__init__.py b/runtime/tools/python/ttrt/runtime/__init__.py index 642b0401f5..0376c07b58 100644 --- a/runtime/tools/python/ttrt/runtime/__init__.py +++ b/runtime/tools/python/ttrt/runtime/__init__.py @@ -12,19 +12,33 @@ DebugEnv, DebugHooks, get_current_runtime, + set_current_runtime, set_compatible_runtime, get_current_system_desc, open_device, close_device, submit, create_tensor, + create_empty_tensor, create_multi_device_tensor, wait, + to_host, + to_layout, + get_layout, get_op_output_tensor, get_op_debug_str, + memcpy, + deallocate_tensor, WorkaroundEnv, ) except ModuleNotFoundError: raise ImportError( "Error: Project was not built with runtime enabled, rebuild with: -DTTMLIR_ENABLE_RUNTIME=ON" ) + +try: + from ._C import testing +except ImportError: + print( + "Warning: not importing testing submodule since project was not built with runtime testing enabled. 
To enable, rebuild with: -DTTMLIR_ENABLE_RUNTIME_TESTS=ON" + ) diff --git a/runtime/tools/python/ttrt/runtime/module.cpp b/runtime/tools/python/ttrt/runtime/module.cpp index c0378727c0..e1db607c53 100644 --- a/runtime/tools/python/ttrt/runtime/module.cpp +++ b/runtime/tools/python/ttrt/runtime/module.cpp @@ -8,6 +8,9 @@ #include "tt/runtime/detail/workarounds.h" #include "tt/runtime/runtime.h" #include "tt/runtime/utils.h" +#if defined(TTMLIR_ENABLE_RUNTIME_TESTS) && TTMLIR_ENABLE_RUNTIME_TESTS == 1 +#include "tt/runtime/test/utils.h" +#endif #include #include @@ -22,6 +25,7 @@ PYBIND11_MODULE(_C, m) { .def("deallocate_buffers", &tt::runtime::detail::deallocateBuffers); py::class_(m, "Event"); py::class_(m, "Tensor"); + py::class_(m, "Layout"); py::class_(m, "OpContext"); py::class_(m, "CallbackContext"); py::enum_<::tt::target::DataType>(m, "DataType") @@ -48,6 +52,8 @@ PYBIND11_MODULE(_C, m) { m.def("set_compatible_runtime", &tt::runtime::setCompatibleRuntime, py::arg("binary"), "Set the backend device runtime type to match the binary"); + m.def("set_current_runtime", &tt::runtime::setCurrentRuntime, + py::arg("runtime"), "Set the backend device runtime type"); m.def("get_current_system_desc", &tt::runtime::getCurrentSystemDesc, "Get the current system descriptor"); m.def( @@ -61,6 +67,15 @@ PYBIND11_MODULE(_C, m) { shape, stride, itemsize, dataType); }, "Create a tensor with borrowed memory"); + m.def( + "create_empty_tensor", + [](::tt::runtime::Device device, ::tt::runtime::Layout layout, + std::vector const &shape, + std::vector const &stride, std::uint32_t itemsize) { + return tt::runtime::createTensor(device, layout, shape, stride, + itemsize); + }, + "Create an empty tensor with the specified layout"); m.def( "create_multi_device_tensor", [](std::vector &ptrs, @@ -69,8 +84,8 @@ PYBIND11_MODULE(_C, m) { ::tt::target::DataType dataType, std::unordered_map const &strategy) { std::vector> data; - data.resize(ptrs.size()); - std::transform(ptrs.begin(), ptrs.end(), data.begin(), + data.reserve(ptrs.size()); + std::transform(ptrs.begin(), ptrs.end(), std::back_inserter(data), [](std::uintptr_t ptr) { return ::tt::runtime::utils::unsafe_borrow_shared( reinterpret_cast(ptr)); @@ -85,10 +100,50 @@ PYBIND11_MODULE(_C, m) { py::arg("num_hw_cqs") = size_t{1}, "Open a mesh of devices for execution"); m.def("close_device", &tt::runtime::closeDevice, "Close a mesh device"); - m.def("submit", &tt::runtime::submit, py::arg("device"), - py::arg("executable"), py::arg("program_index"), py::arg("inputs"), - py::arg("outputs"), "Submit a binary for execution"); - m.def("wait", &tt::runtime::wait, py::arg("event")); + m.def("to_host", &tt::runtime::toHost, py::arg("tensor"), + py::arg("untilize") = false, "Copy the tensor to the host"); + m.def("to_layout", &tt::runtime::toLayout, py::arg("tensor"), + py::arg("device"), py::arg("layout"), + "Create a copy of the tensor with the specified layout"); + m.def("get_layout", &tt::runtime::getLayout, py::arg("executable"), + py::arg("program_index"), py::arg("input_index"), + "Get the layout of the input tensor"); + m.def( + "submit", + [](::tt::runtime::Device device, ::tt::runtime::Binary executable, + std::uint32_t programIndex, + const std::vector<::tt::runtime::Tensor> &inputs) + -> std::vector<::tt::runtime::Tensor> { + return ::tt::runtime::submit(device, executable, programIndex, inputs); + }, + py::arg("device"), py::arg("executable"), py::arg("program_index"), + py::arg("inputs"), + "Submit a ttnn binary for execution, returns a vector of output 
tensors"); + m.def( + "submit", + [](::tt::runtime::Device device, ::tt::runtime::Binary executable, + std::uint32_t programIndex, + const std::vector<::tt::runtime::Tensor> &inputs, + const std::vector<::tt::runtime::Tensor> &outputs) + -> ::tt::runtime::Event { + return ::tt::runtime::submit(device, executable, programIndex, inputs, + outputs); + }, + py::arg("device"), py::arg("executable"), py::arg("program_index"), + py::arg("inputs"), py::arg("outputs"), + "Submit a ttmetal binary for execution. returns event wrapper"); + m.def( + "wait", [](::tt::runtime::Event event) { ::tt::runtime::wait(event); }, + py::arg("event")); + m.def( + "wait", [](::tt::runtime::Tensor tensor) { ::tt::runtime::wait(tensor); }, + py::arg("tensor")); + m.def( + "wait", + [](const std::vector<::tt::runtime::Tensor> &tensors) { + ::tt::runtime::wait(tensors); + }, + py::arg("tensors")); m.def( "get_op_output_tensor", [](tt::runtime::OpContext &opContextHandle, @@ -102,7 +157,15 @@ PYBIND11_MODULE(_C, m) { "Get the debug string of the op"); m.def("get_op_loc_info", &tt::runtime::getOpLocInfo, "Get the location info of the op"); - + m.def( + "memcpy", + [](::tt::runtime::Tensor dst, ::tt::runtime::Tensor src) { + ::tt::runtime::memcpy(dst, src); + }, + py::arg("dst"), py::arg("src"), + "Copy the data from src tensor to dst tensor"); + m.def("deallocate_tensor", &tt::runtime::deallocateTensor, py::arg("tensor"), + py::arg("force") = false, "Deallocate the tensor memory"); py::class_(m, "DebugEnv") .def_static("get", &tt::runtime::debug::Env::get) .def("__str__", [](const tt::runtime::debug::Env &env) { @@ -138,4 +201,17 @@ PYBIND11_MODULE(_C, m) { os << env; return os.str(); }); + +#if defined(TTMLIR_ENABLE_RUNTIME_TESTS) && TTMLIR_ENABLE_RUNTIME_TESTS == 1 + auto testing = m.def_submodule("testing"); + testing.def("get_dram_interleaved_tile_layout", + &tt::runtime::ttnn::test::getDramInterleavedTileLayout, + py::arg("dtype"), "Get dram interleaved tile layout"); + testing.def("get_dram_interleaved_row_major_layout", + &tt::runtime::ttnn::test::getDramInterleavedRowMajorLayout, + py::arg("dtype"), "Get dram interleaved row major layout"); + testing.def("get_host_row_major_layout", + &tt::runtime::ttnn::test::getHostRowMajorLayout, py::arg("dtype"), + "Get host row major layout"); +#endif } diff --git a/test/ttmlir/Dialect/TTNN/eltwise/unary/isfinite/simple_isfinite.mlir b/test/ttmlir/Dialect/TTNN/eltwise/unary/isfinite/simple_isfinite.mlir index e819e68f4b..3089da6692 100644 --- a/test/ttmlir/Dialect/TTNN/eltwise/unary/isfinite/simple_isfinite.mlir +++ b/test/ttmlir/Dialect/TTNN/eltwise/unary/isfinite/simple_isfinite.mlir @@ -1,15 +1,15 @@ // RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline %s | FileCheck %s #any_device = #tt.operand_constraint module attributes {} { - func.func @is_finite(%arg0: tensor<64x128xf32>) -> tensor<64x128xbf16> { + func.func @is_finite(%arg0: tensor<64x128xbf16>) -> tensor<64x128xbf16> { // CHECK: %[[C:.*]] = "ttnn.empty" // CHECK-SAME: [[TENSOR:tensor<64x128xbf16,]] %0 = tensor.empty() : tensor<64x128xbf16> // CHECK: %[[C:.*]] = "ttnn.isfinite" - // CHECK-SAME: tensor<64x128xf32, + // CHECK-SAME: tensor<64x128xbf16, // CHECK-SAME: [[TENSOR]] // CHECK-SAME: -> [[TENSOR]] - %1 = "ttir.isfinite"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xbf16>) -> tensor<64x128xbf16> + %1 = "ttir.isfinite"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : 
(tensor<64x128xbf16>, tensor<64x128xbf16>) -> tensor<64x128xbf16> return %1 : tensor<64x128xbf16> } } diff --git a/test/ttmlir/Runtime/TTNN/runtime_stitching/eltwise_binary_op_chain.mlir b/test/ttmlir/Runtime/TTNN/runtime_stitching/eltwise_binary_op_chain.mlir new file mode 100644 index 0000000000..97690df780 --- /dev/null +++ b/test/ttmlir/Runtime/TTNN/runtime_stitching/eltwise_binary_op_chain.mlir @@ -0,0 +1,49 @@ +// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" %s > %t.mlir +// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn + +// TODO: this is a workaround for compiler assuming input tensors are always on host. The ideal is to directly compile ttir graphs. +#device = #tt.device (0, d0, d1)>, l1Map = (d0, d1)[s0, s1] -> (0, d0 floordiv s0, d1 floordiv s1, (d0 mod s0) * s1 + d1 mod s1), dramMap = (d0, d1)[s0, s1] -> (0, 0, ((((d0 floordiv s0) * 8 + d1 floordiv s1) * (s1 * s0) + (d0 mod s0) * s1 + d1 mod s1) floordiv 8192) mod 12, (((d0 floordiv s0) * 8 + d1 floordiv s1) * (s1 * s0) + (d0 mod s0) * s1 + d1 mod s1) floordiv 98304 + (((d0 floordiv s0) * 8 + d1 floordiv s1) * (s1 * s0) + (d0 mod s0) * s1 + d1 mod s1) mod 8192), meshShape = , chipIds = [0]> +#system_memory = #ttnn.buffer_type +#dram = #ttnn.buffer_type +#ttnn_layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xbf16, #system_memory>> +#ttnn_layout1 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #dram>, interleaved> +#ttnn_layout2 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xbf16, #dram>, interleaved> + +module attributes {tt.device = #device} { + func.func @add(%arg0: tensor<64x128xbf16, #ttnn_layout1>, %arg1: tensor<64x128xbf16, #ttnn_layout1>) -> tensor<64x128xbf16, #ttnn_layout> { + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0) <{layout = #ttnn.layout}> : (tensor<64x128xbf16, #ttnn_layout1>) -> tensor<64x128xbf16, #ttnn_layout1> + %2 = "ttnn.to_layout"(%arg1) <{layout = #ttnn.layout}> : (tensor<64x128xbf16, #ttnn_layout1>) -> tensor<64x128xbf16, #ttnn_layout1> + %3 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, #dram, <<64x128>>>, shape = #ttnn.shape<64x128>}> : (!tt.device<#device>) -> tensor<64x128xbf16, #ttnn_layout2> + %4 = "ttnn.add"(%1, %2, %3) <{operandSegmentSizes = array}> : (tensor<64x128xbf16, #ttnn_layout1>, tensor<64x128xbf16, #ttnn_layout1>, tensor<64x128xbf16, #ttnn_layout2>) -> tensor<64x128xbf16, #ttnn_layout2> + %5 = "ttnn.from_device"(%4) : (tensor<64x128xbf16, #ttnn_layout2>) -> tensor<64x128xbf16, #ttnn_layout> + %6 = "ttnn.to_layout"(%5) <{layout = #ttnn.layout}> : (tensor<64x128xbf16, #ttnn_layout>) -> tensor<64x128xbf16, #ttnn_layout> + return %6 : tensor<64x128xbf16, #ttnn_layout> + } +} + +module attributes {tt.device = #device} { + func.func @multiply(%arg0: tensor<64x128xbf16, #ttnn_layout1>, %arg1: tensor<64x128xbf16, #ttnn_layout1>) -> tensor<64x128xbf16, #ttnn_layout> { + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0) <{layout = #ttnn.layout}> : (tensor<64x128xbf16, #ttnn_layout1>) -> tensor<64x128xbf16, #ttnn_layout1> + %2 = "ttnn.to_layout"(%arg1) <{layout = #ttnn.layout}> : (tensor<64x128xbf16, #ttnn_layout1>) -> tensor<64x128xbf16, #ttnn_layout1> + %3 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, #dram, <<64x128>>>, shape = 
#ttnn.shape<64x128>}> : (!tt.device<#device>) -> tensor<64x128xbf16, #ttnn_layout2> + %4 = "ttnn.multiply"(%1, %2, %3) <{operandSegmentSizes = array}> : (tensor<64x128xbf16, #ttnn_layout1>, tensor<64x128xbf16, #ttnn_layout1>, tensor<64x128xbf16, #ttnn_layout2>) -> tensor<64x128xbf16, #ttnn_layout2> + %5 = "ttnn.from_device"(%4) : (tensor<64x128xbf16, #ttnn_layout2>) -> tensor<64x128xbf16, #ttnn_layout> + %6 = "ttnn.to_layout"(%5) <{layout = #ttnn.layout}> : (tensor<64x128xbf16, #ttnn_layout>) -> tensor<64x128xbf16, #ttnn_layout> + return %6 : tensor<64x128xbf16, #ttnn_layout> + } +} + +module attributes {tt.device = #device} { + func.func @subtract(%arg0: tensor<64x128xbf16, #ttnn_layout1>, %arg1: tensor<64x128xbf16, #ttnn_layout1>) -> tensor<64x128xbf16, #ttnn_layout> { + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0) <{layout = #ttnn.layout}> : (tensor<64x128xbf16, #ttnn_layout1>) -> tensor<64x128xbf16, #ttnn_layout1> + %2 = "ttnn.to_layout"(%arg1) <{layout = #ttnn.layout}> : (tensor<64x128xbf16, #ttnn_layout1>) -> tensor<64x128xbf16, #ttnn_layout1> + %3 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, #dram, <<64x128>>>, shape = #ttnn.shape<64x128>}> : (!tt.device<#device>) -> tensor<64x128xbf16, #ttnn_layout2> + %4 = "ttnn.subtract"(%1, %2, %3) <{operandSegmentSizes = array}> : (tensor<64x128xbf16, #ttnn_layout1>, tensor<64x128xbf16, #ttnn_layout1>, tensor<64x128xbf16, #ttnn_layout2>) -> tensor<64x128xbf16, #ttnn_layout2> + %5 = "ttnn.from_device"(%4) : (tensor<64x128xbf16, #ttnn_layout2>) -> tensor<64x128xbf16, #ttnn_layout> + %6 = "ttnn.to_layout"(%5) <{layout = #ttnn.layout}> : (tensor<64x128xbf16, #ttnn_layout>) -> tensor<64x128xbf16, #ttnn_layout> + return %6 : tensor<64x128xbf16, #ttnn_layout> + } +} diff --git a/test/ttmlir/Silicon/StableHLO/Unary/isfinite_op.mlir b/test/ttmlir/Silicon/StableHLO/Unary/isfinite_op.mlir index 04b9f1fefb..35682c8c0b 100644 --- a/test/ttmlir/Silicon/StableHLO/Unary/isfinite_op.mlir +++ b/test/ttmlir/Silicon/StableHLO/Unary/isfinite_op.mlir @@ -7,14 +7,14 @@ // RUN: FileCheck --input-file=%t.mlir %s module @jit_eltwise_isfinite attributes {} { - func.func public @test_isfinite(%arg0: tensor<64x128xf32>) -> tensor<64x128xi1> { + func.func public @test_isfinite(%arg0: tensor<64x128xbf16>) -> tensor<64x128xi1> { // CHECK-LABEL: func.func public @test_isfinite // CHECK: ttnn.empty // CHECK: ttnn.isfinite - // CHECK-SAME: tensor<64x128xf32, + // CHECK-SAME: tensor<64x128xbf16, // CHECK-SAME: tensor<64x128xbf16, // CHECK-SAME: -> tensor<64x128xbf16, - %0 = stablehlo.is_finite %arg0 : (tensor<64x128xf32>) -> tensor<64x128xi1> + %0 = stablehlo.is_finite %arg0 : (tensor<64x128xbf16>) -> tensor<64x128xi1> return %0 : tensor<64x128xi1> } } diff --git a/test/ttmlir/Silicon/StableHLO/select_op.mlir b/test/ttmlir/Silicon/StableHLO/select_op.mlir index 23b7182ce0..1cdc5e9d05 100644 --- a/test/ttmlir/Silicon/StableHLO/select_op.mlir +++ b/test/ttmlir/Silicon/StableHLO/select_op.mlir @@ -6,23 +6,23 @@ // RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn module @jit_eltwise_select attributes {} { - func.func public @test_select(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> { + func.func public @test_select(%arg0: tensor<64x128xbf16>, %arg1: tensor<64x128xbf16>) -> tensor<64x128xbf16> { // CHECK-LABEL: func.func public @test_select // CHECK: tensor.empty // CHECK: [[EQ:{{0-9}}+]] = "ttnn.eq" - // 
CHECK-SAME: tensor<64x128xf32 - // CHECK-SAME: tensor<64x128xf32 + // CHECK-SAME: tensor<64x128xbf16 + // CHECK-SAME: tensor<64x128xbf16 // CHECK-SAME: tensor<64x128xbf16 // CHECK-SAME: -> tensor<64x128xbf16 - %0 = stablehlo.compare EQ, %arg0, %arg1 : (tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xi1> + %0 = stablehlo.compare EQ, %arg0, %arg1 : (tensor<64x128xbf16>, tensor<64x128xbf16>) -> tensor<64x128xi1> // CHECK: ttnn.where // CHECK-SAME: [[EQ]] // CHECK-SAME: tensor<64x128xbf16 - // CHECK-SAME: tensor<64x128xf32 - // CHECK-SAME: tensor<64x128xf32 - // CHECK-SAME: tensor<64x128xf32 - // CHECK-SAME: -> tensor<64x128xf32 - %1 = stablehlo.select %0, %arg0, %arg1 : (tensor<64x128xi1>, tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> - return %1 : tensor<64x128xf32> + // CHECK-SAME: tensor<64x128xbf16 + // CHECK-SAME: tensor<64x128xbf16 + // CHECK-SAME: tensor<64x128xbf16 + // CHECK-SAME: -> tensor<64x128xbf16 + %1 = stablehlo.select %0, %arg0, %arg1 : (tensor<64x128xi1>, tensor<64x128xbf16>, tensor<64x128xbf16>) -> tensor<64x128xbf16> + return %1 : tensor<64x128xbf16> } } diff --git a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_isfinite.mlir b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_isfinite.mlir index ce0146be40..f1489a5ebd 100644 --- a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_isfinite.mlir +++ b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_isfinite.mlir @@ -4,14 +4,14 @@ #any_device = #tt.operand_constraint #any_device_tile = #tt.operand_constraint -func.func @is_finite(%arg0: tensor<64x128xf32>) -> tensor<64x128xbf16> { +func.func @is_finite(%arg0: tensor<64x128xbf16>) -> tensor<64x128xbf16> { // CHECK: %[[C:.*]] = "ttnn.empty" // CHECK-SAME: [[TENSOR:tensor<64x128xbf16,]] %0 = tensor.empty() : tensor<64x128xbf16> // CHECK: %[[C:.*]] = "ttnn.isfinite" - // CHECK-SAME: tensor<64x128xf32, + // CHECK-SAME: tensor<64x128xbf16, // CHECK-SAME: [[TENSOR]] // CHECK-SAME: -> [[TENSOR]] - %1 = "ttir.isfinite"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xbf16>) -> tensor<64x128xbf16> + %1 = "ttir.isfinite"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xbf16>, tensor<64x128xbf16>) -> tensor<64x128xbf16> return %1 : tensor<64x128xbf16> } diff --git a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_le.mlir b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_le.mlir deleted file mode 100644 index 79de8c062d..0000000000 --- a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_le.mlir +++ /dev/null @@ -1,21 +0,0 @@ -// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir -// RUN: FileCheck %s --input-file=%t.mlir -// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn - -#any_device = #tt.operand_constraint -#any_device_tile = #tt.operand_constraint - -module attributes {} { - func.func @less_equal(%arg0: tensor<13x31xf32>, %arg1: tensor<13x31xf32>) -> tensor<13x31xf32> { - // CHECK: %[[C:.*]] = "ttnn.empty - // CHECK-SAME: [[TENSOR:tensor<13x31xf32,]] - %0 = tensor.empty() : tensor<13x31xf32> - // CHECK: %[[C:.*]] = "ttnn.le" - // CHECK-SAME: [[TENSOR]] - // CHECK-SAME: [[TENSOR]] - // CHECK-SAME: [[TENSOR]] - // CHECK-SAME: -> [[TENSOR]] - %1 = "ttir.le"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<13x31xf32>, tensor<13x31xf32>, tensor<13x31xf32>) -> tensor<13x31xf32> - return %1 : tensor<13x31xf32> - } -} 
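For orientation, the run.py and module.cpp changes above split submission into two paths (TTMetal submit returns an event; TTNN submit returns output tensors that must be waited on, copied out, and deallocated). The following is a minimal sketch of how the new bindings are driven; it is not part of the patch. The function name and the surrounding plumbing (device, bin, program_index, input/output lists) are illustrative stand-ins for what ttrt's execute loop already sets up, and only ttrt.runtime calls that appear in this patch are used.

import ttrt.runtime

def run_program(device, bin, program_index, inputs, outputs):
    current_runtime = ttrt.runtime.get_current_runtime()
    if current_runtime == ttrt.runtime.DeviceRuntime.TTMetal:
        # TTMetal submit fills the caller-provided output buffers and
        # returns an event to synchronize on.
        event = ttrt.runtime.submit(
            device, bin.fbb, program_index, inputs, outputs
        )
        ttrt.runtime.wait(event)
    elif current_runtime == ttrt.runtime.DeviceRuntime.TTNN:
        # TTNN submit returns freshly allocated output tensors; wait on
        # them, copy into the caller's buffers, then free device memory.
        runtime_outputs = ttrt.runtime.submit(
            device, bin.fbb, program_index, inputs
        )
        ttrt.runtime.wait(runtime_outputs)
        for i, runtime_output in enumerate(runtime_outputs):
            ttrt.runtime.memcpy(outputs[i], runtime_output)
            ttrt.runtime.deallocate_tensor(runtime_output, force=True)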
diff --git a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_where.mlir b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_where.mlir index 3bed0528c6..647f94e61e 100644 --- a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_where.mlir +++ b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_where.mlir @@ -4,13 +4,13 @@ #any_device = #tt.operand_constraint #any_device_tile = #tt.operand_constraint -func.func @test_where(%arg0: tensor<13x37xf32>, %arg1: tensor<13x37xf32>) -> tensor<13x37xf32> { +func.func @test_where(%arg0: tensor<13x37xbf16>, %arg1: tensor<13x37xbf16>) -> tensor<13x37xbf16> { %0 = tensor.empty() : tensor<13x37xbf16> - %1 = "ttir.eq"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<13x37xf32>, tensor<13x37xf32>, tensor<13x37xbf16>) -> tensor<13x37xbf16> - %2 = tensor.empty() : tensor<13x37xf32> - %3 = "ttir.where"(%1, %arg0, %arg1, %2) <{operandSegmentSizes = array, operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<13x37xbf16>, tensor<13x37xf32>, tensor<13x37xf32>, tensor<13x37xf32>) -> tensor<13x37xf32> + %1 = "ttir.eq"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<13x37xbf16>, tensor<13x37xbf16>, tensor<13x37xbf16>) -> tensor<13x37xbf16> + %2 = tensor.empty() : tensor<13x37xbf16> + %3 = "ttir.where"(%1, %arg0, %arg1, %2) <{operandSegmentSizes = array, operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<13x37xbf16>, tensor<13x37xbf16>, tensor<13x37xbf16>, tensor<13x37xbf16>) -> tensor<13x37xbf16> // CHECK: %[[EMPTY:.*]] = "ttnn.empty"{{.*}} // CHECK: %[[VAL1:[0-9]+]] = "ttnn.eq"(%{{[0-9]+}}, %{{[0-9]+}}, %[[EMPTY]]) // CHECK: %{{[0-9]+}} = "ttnn.where"(%[[VAL1]], %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}) - return %3 : tensor<13x37xf32> + return %3 : tensor<13x37xbf16> } diff --git a/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir b/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir index b7912d4c19..2674a66fdf 100644 --- a/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir +++ b/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir @@ -65,15 +65,15 @@ func.func @floor(%arg0: tensor<64x128xf32>) -> tensor<64x128xf32> { return %1 : tensor<64x128xf32> } -func.func @is_finite(%arg0: tensor<64x128xf32>) -> tensor<64x128xbf16> { +func.func @is_finite(%arg0: tensor<64x128xbf16>) -> tensor<64x128xbf16> { // CHECK: %[[C:.*]] = "ttnn.empty" // CHECK-SAME: [[TENSOR:tensor<64x128xbf16,]] %0 = tensor.empty() : tensor<64x128xbf16> // CHECK: %[[C:.*]] = "ttnn.isfinite" - // CHECK-SAME: tensor<64x128xf32, + // CHECK-SAME: tensor<64x128xbf16, // CHECK-SAME: [[TENSOR]] // CHECK-SAME: -> [[TENSOR]] - %1 = "ttir.isfinite"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xbf16>) -> tensor<64x128xbf16> + %1 = "ttir.isfinite"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xbf16>, tensor<64x128xbf16>) -> tensor<64x128xbf16> return %1 : tensor<64x128xbf16> } @@ -278,15 +278,15 @@ func.func @get_dimension_size(%arg0: tensor<13x21x3xf32>) -> tensor<1xi32> { // CHECK: return [[VAL]] : tensor<1xi32, {{.*}}> } -func.func @test_where(%arg0: tensor<13x37xf32>, %arg1: tensor<13x37xf32>) -> tensor<13x37xf32> { +func.func @test_where(%arg0: tensor<13x37xbf16>, %arg1: tensor<13x37xbf16>) -> tensor<13x37xbf16> { %0 = 
tensor.empty() : tensor<13x37xbf16> - %1 = "ttir.eq"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<13x37xf32>, tensor<13x37xf32>, tensor<13x37xbf16>) -> tensor<13x37xbf16> - %2 = tensor.empty() : tensor<13x37xf32> - %3 = "ttir.where"(%1, %arg0, %arg1, %2) <{operandSegmentSizes = array, operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<13x37xbf16>, tensor<13x37xf32>, tensor<13x37xf32>, tensor<13x37xf32>) -> tensor<13x37xf32> + %1 = "ttir.eq"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<13x37xbf16>, tensor<13x37xbf16>, tensor<13x37xbf16>) -> tensor<13x37xbf16> + %2 = tensor.empty() : tensor<13x37xbf16> + %3 = "ttir.where"(%1, %arg0, %arg1, %2) <{operandSegmentSizes = array, operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<13x37xbf16>, tensor<13x37xbf16>, tensor<13x37xbf16>, tensor<13x37xbf16>) -> tensor<13x37xbf16> // CHECK: %[[EMPTY:.*]] = "ttnn.empty"{{.*}} // CHECK: %[[VAL1:[0-9]+]] = "ttnn.eq"(%{{[0-9]+}}, %{{[0-9]+}}, %[[EMPTY]]) // CHECK: %{{[0-9]+}} = "ttnn.where"(%[[VAL1]], %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}) - return %3 : tensor<13x37xf32> + return %3 : tensor<13x37xbf16> } func.func @gelu(%arg0: tensor<64x128xf32>) -> tensor<64x128xf32> { From 59cd3d383087e81bf2471cdb0a21744f6fccb3de Mon Sep 17 00:00:00 2001 From: Kyle Mabee <118925087+kmabeeTT@users.noreply.github.com> Date: Tue, 3 Dec 2024 22:22:12 -0500 Subject: [PATCH 47/84] Remove hanging simple_max.mlir test until figured out, hanging Dec3 issue #1491 (#1490) --- test/ttmlir/Silicon/TTMetal/simple_max.mlir | 13 ------------- 1 file changed, 13 deletions(-) delete mode 100644 test/ttmlir/Silicon/TTMetal/simple_max.mlir diff --git a/test/ttmlir/Silicon/TTMetal/simple_max.mlir b/test/ttmlir/Silicon/TTMetal/simple_max.mlir deleted file mode 100644 index 92bdbe72c7..0000000000 --- a/test/ttmlir/Silicon/TTMetal/simple_max.mlir +++ /dev/null @@ -1,13 +0,0 @@ -// RUN: ttmlir-opt --ttir-to-ttmetal-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir -// RUN: FileCheck %s --input-file=%t.mlir -// RUN: ttmlir-translate --ttmetal-to-flatbuffer %t.mlir > %t.ttm - -#any_device = #tt.operand_constraint - -func.func @maximum(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> { - // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] - %0 = tensor.empty() : tensor<64x128xf32> - // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] - %1 = "ttir.maximum"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> - return %1 : tensor<64x128xf32> -} From 02daac6e9c0736063863e5596bb7fa4f0be1b32b Mon Sep 17 00:00:00 2001 From: Vladimir Milosevic <157983820+vmilosevic@users.noreply.github.com> Date: Wed, 4 Dec 2024 04:55:08 +0100 Subject: [PATCH 48/84] Uplift third_party/tt-metal to 6a524ab817aeb09c273e37254f39ad8124ddf2f8 2024-12-03 (#1463) * Uplift third_party/tt-metal to 6a524ab817aeb09c273e37254f39ad8124ddf2f8 2024-12-03 * reflect tt-metal refactor to uplift tt-metal - fixed build error due to refactor in https://github.com/tenstorrent/tt-metal/commit/3584ac9512fef17e96f7e838bff1fd12c97798fb --------- Co-authored-by: kmitrovicTT <169657397+kmitrovicTT@users.noreply.github.com> 
Co-authored-by: Brata Choudhury --- runtime/include/tt/runtime/detail/ttnn.h | 2 +- runtime/lib/ttnn/operations/pool/maxpool2d.cpp | 6 ++++-- third_party/CMakeLists.txt | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/runtime/include/tt/runtime/detail/ttnn.h b/runtime/include/tt/runtime/detail/ttnn.h index e7b8fbcf21..31b979e139 100644 --- a/runtime/include/tt/runtime/detail/ttnn.h +++ b/runtime/include/tt/runtime/detail/ttnn.h @@ -25,7 +25,7 @@ #include "ttnn/operations/embedding/embedding.hpp" #include "ttnn/operations/matmul/matmul.hpp" #include "ttnn/operations/normalization/softmax/softmax.hpp" -#include "ttnn/operations/pool/maxpool/max_pool2d.hpp" +#include "ttnn/operations/pool/generic/generic_pools.hpp" #include "ttnn/operations/reduction/generic/generic_reductions.hpp" #include "ttnn/tensor/host_buffer/functions.hpp" #include "ttnn/tensor/tensor.hpp" diff --git a/runtime/lib/ttnn/operations/pool/maxpool2d.cpp b/runtime/lib/ttnn/operations/pool/maxpool2d.cpp index c405a86f1c..ddbe639c74 100644 --- a/runtime/lib/ttnn/operations/pool/maxpool2d.cpp +++ b/runtime/lib/ttnn/operations/pool/maxpool2d.cpp @@ -47,8 +47,10 @@ preshardForMaxPool2d(const ::tt::target::ttnn::MaxPool2dOp *op, void run(const ::tt::target::ttnn::MaxPool2dOp *op, ProgramContext &context) { ProgramTensorPool &tensorPool = context.getTensorPool(); - const ::ttnn::operations::pool::MaxPool2DOp operation = - ::ttnn::operations::pool::MaxPool2DOp(); + const ::ttnn::operations::pool::Pool2DOp< + ::ttnn::operations::pool::Pool2DType::MAX_POOL2D> + operation = ::ttnn::operations::pool::Pool2DOp< + ::ttnn::operations::pool::Pool2DType::MAX_POOL2D>(); ::ttnn::Tensor input = tensorPool.at(op->in()->global_id()); DEBUG_ASSERT(input.is_allocated()); diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt index bf28aebc9f..4c814fdc8d 100644 --- a/third_party/CMakeLists.txt +++ b/third_party/CMakeLists.txt @@ -1,6 +1,6 @@ include(ExternalProject) -set(TT_METAL_VERSION "ab3dc0c4f5c3ce9722261c878970bfa92a212fc9") +set(TT_METAL_VERSION "6a524ab817aeb09c273e37254f39ad8124ddf2f8") if ("$ENV{ARCH_NAME}" STREQUAL "grayskull") set(ARCH_NAME "grayskull") From f794b4c729b7fa48b788fa6a9cacefe3e281457e Mon Sep 17 00:00:00 2001 From: Filip Bajraktari Date: Wed, 4 Dec 2024 11:14:50 +0100 Subject: [PATCH 49/84] [Optimizer] Greedy solution for join nodes in L1 Interleaved policy (#1162) Version 2.0 of L1 Interleaved policy --- .../Dialect/TTNN/Analysis/L1ChainConfig.h | 2 +- .../TTNN/Analysis/L1InterleavedPolicy.h | 97 +++++ .../TTNN/Analysis/MemoryLayoutAnalysis.h | 2 +- .../ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.td | 3 + .../Dialect/TTNN/Pipelines/TTNNPipelines.h | 3 +- .../Utils/MemoryLayoutAnalysisParams.h | 6 +- .../Dialect/TTNN/Utils/OptimizerOverrides.h | 2 +- include/ttmlir/Scheduler/Scheduler.h | 4 + lib/Dialect/TTNN/Analysis/CMakeLists.txt | 2 +- .../TTNN/Analysis/L1InterleavedPolicy.cpp | 399 +++++++++++++----- .../TTNN/Analysis/LegalGridAnalysis.cpp | 8 + .../TTNN/Analysis/MemoryLayoutAnalysis.cpp | 9 +- lib/Dialect/TTNN/IR/TTNNOpsAttrs.cpp | 25 +- lib/Dialect/TTNN/Transforms/Optimizer.cpp | 4 + lib/Scheduler/Scheduler.cpp | 7 +- .../all_l1_interleaved_policy.mlir | 19 +- .../l1_interleaved_policy/fork_join.mlir | 45 ++ .../mnist_l1_interleaved.mlir | 8 +- .../simple_join_tests/dram_ABC_l1_None.mlir | 28 ++ .../simple_join_tests/dram_AB_l1_C.mlir | 31 ++ .../simple_join_tests/dram_AC_l1_B.mlir | 30 ++ .../simple_join_tests/dram_A_l1_BC.mlir | 30 ++ .../simple_join_tests/dram_BC_l1_A.mlir | 30 ++ 
.../simple_join_tests/dram_B_l1_AC.mlir | 30 ++ .../simple_join_tests/dram_C_l1_AB.mlir | 31 ++ .../simple_join_tests/dram_None_l1_ABC.mlir | 29 ++ .../l1_interleaved_policy/single_op.mlir | 10 + .../Silicon/TTNN/optimizer/large_tensors.mlir | 19 - test/unittests/Optimizer/CMakeLists.txt | 2 + .../Optimizer/TestL1InterleavedPolicy.cpp | 193 +++++++++ 30 files changed, 956 insertions(+), 152 deletions(-) rename include/ttmlir/Dialect/{TT => TTNN}/Utils/MemoryLayoutAnalysisParams.h (88%) rename test/ttmlir/{Silicon/TTNN/optimizer => Dialect/TTNN/optimizer/l1_interleaved_policy}/all_l1_interleaved_policy.mlir (79%) create mode 100644 test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/fork_join.mlir rename test/ttmlir/{Silicon/TTNN/optimizer => Dialect/TTNN/optimizer/l1_interleaved_policy}/mnist_l1_interleaved.mlir (88%) create mode 100644 test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_ABC_l1_None.mlir create mode 100644 test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_AB_l1_C.mlir create mode 100644 test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_AC_l1_B.mlir create mode 100644 test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_A_l1_BC.mlir create mode 100644 test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_BC_l1_A.mlir create mode 100644 test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_B_l1_AC.mlir create mode 100644 test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_C_l1_AB.mlir create mode 100644 test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_None_l1_ABC.mlir create mode 100644 test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/single_op.mlir delete mode 100644 test/ttmlir/Silicon/TTNN/optimizer/large_tensors.mlir create mode 100644 test/unittests/Optimizer/TestL1InterleavedPolicy.cpp diff --git a/include/ttmlir/Dialect/TTNN/Analysis/L1ChainConfig.h b/include/ttmlir/Dialect/TTNN/Analysis/L1ChainConfig.h index 3c57ca66b7..b8aee2e4ea 100644 --- a/include/ttmlir/Dialect/TTNN/Analysis/L1ChainConfig.h +++ b/include/ttmlir/Dialect/TTNN/Analysis/L1ChainConfig.h @@ -58,7 +58,7 @@ class L1ChainConfig { std::unordered_set &memReconfigEdges); bool isEmpty() { return opL1MemSpecs.empty(); } - void addOpL1MemSpec(OpL1MemSpec &&spec) { + void addOpL1MemSpec(OpL1MemSpec spec) { assert(state == L1ChainState::InBuild); l1ChainedOps.insert(spec.op); opL1MemSpecs.push_back(std::move(spec)); diff --git a/include/ttmlir/Dialect/TTNN/Analysis/L1InterleavedPolicy.h b/include/ttmlir/Dialect/TTNN/Analysis/L1InterleavedPolicy.h index f453e9a1d3..2392cd7c9c 100644 --- a/include/ttmlir/Dialect/TTNN/Analysis/L1InterleavedPolicy.h +++ b/include/ttmlir/Dialect/TTNN/Analysis/L1InterleavedPolicy.h @@ -8,10 +8,43 @@ #include "mlir/Dialect/Func/IR/FuncOps.h" #include "ttmlir/Dialect/TTNN/Analysis/L1ChainConfig.h" #include "ttmlir/Dialect/TTNN/Analysis/MemoryLayoutAnalysisPolicy.h" +#include "ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.h" namespace mlir::tt::ttnn { class L1InterleavedPolicy : public MemoryLayoutAnalysisPolicy { +public: + struct OpMemSpec { + TTNNLayoutAttr layout; + // Minimum L1 memory usage required for scheduling the op + // given the layouts of all the ops that are already scheduled. + // + uint64_t requiredL1Usage; + }; + + // This struct is holding information about the greedily choosen + // configuration of the @baseOp: 1) layouts and 2) precedence. 
+ // + // The @layouts represents the mapping between the op and its choosen + // layout. All the ops that are included in the @layouts map must be + // either @baseOp or its operand with legal L1 Interleaved output layout + // at the moment of analyzing the @baseOp. + // + // The @precedence represents the order of the op's operands in which they + // should be scheduled. Only op's operands that are included in the @layouts + // map are included in the @precedence. + // + struct OpConfig { + Operation *baseOp; + llvm::DenseMap layouts; + llvm::SmallVector precedence; + }; + + struct L1Usage { + size_t outputL1Usage; + size_t requiredL1Usage; + }; + public: L1InterleavedPolicy( Operation *rootOp, std::vector &l1ChainConfigs, @@ -22,7 +55,71 @@ class L1InterleavedPolicy : public MemoryLayoutAnalysisPolicy { : MemoryLayoutAnalysisPolicy(rootOp, l1ChainConfigs, legalLayouts, schedule, usableL1CacheSize) {} + /** + * Retrieve the greedy OpConfig for the given base operation + * and its opsL1Usage map. + * + * @param baseOp The base operation for which the greedy configuration is + * being determined. + * @param opsL1Usage A map between the operation and its output L1 usage. All + * operations included in the opsL1Usage map must be either the baseOp or its + * operand with a legal L1 Interleaved output layout at the time of analyzing + * the baseOp. + * @return The greedy OpConfig for the baseOp. + */ + OpConfig getGreedyConfig(Operation *baseOp, + llvm::DenseMap &opsL1Usage); + void run() final; + +private: + // Check if the op is analyzable. Op is analyzable if it has at least one + // legal layout. + bool isAnalyzable(Operation *op); + + // Fetch op's DRAM layout from legalLayouts. + bool hasDRAMBufferType(Operation *op); + TTNNLayoutAttr getDRAMLayout(Operation *op); + + // Fetch op's L1 Interleaved layout from legalLayouts. + bool hasL1BufferType(Operation *op); + TTNNLayoutAttr getL1InterleavedLayout(Operation *op); + + size_t getAvailableL1CacheSize() const { + // Figure out this const based on exec data, but will be replaced + // with API. + // + constexpr float tensorL1UsageCap = 0.75; + return tensorL1UsageCap * usableL1CacheSize; + } + + // Precedence schedule map for each operation. It contains the order + // in which operands need to be executed for each op. 
+ llvm::DenseMap> precedenceMap; + + llvm::DenseSet visitedOps; + void buildSchedule(mlir::Operation *op, func::FuncOp &func) { + + // Schedule all the precedents of the current operation + // + visitedOps.insert(op); + for (Operation *precedent : precedenceMap[op]) { + if (!visitedOps.count(precedent)) { + buildSchedule(precedent, func); + } + } + + (*schedule)[func].push_back(op); + } + + void constructSchedule(func::FuncOp &func) { + func->walk([&](Operation *op) { + if (op->hasTrait()) { + Operation *outputOp = op->getOperand(0).getDefiningOp(); + buildSchedule(outputOp, func); + } + }); + } }; } // namespace mlir::tt::ttnn diff --git a/include/ttmlir/Dialect/TTNN/Analysis/MemoryLayoutAnalysis.h b/include/ttmlir/Dialect/TTNN/Analysis/MemoryLayoutAnalysis.h index e8b6038154..bc6284c3a0 100644 --- a/include/ttmlir/Dialect/TTNN/Analysis/MemoryLayoutAnalysis.h +++ b/include/ttmlir/Dialect/TTNN/Analysis/MemoryLayoutAnalysis.h @@ -6,10 +6,10 @@ #define TTMLIR_DIALECT_TTNN_ANALYSIS_MEMORYLAYOUTANALYSIS_H #include "mlir/Dialect/Func/IR/FuncOps.h" -#include "ttmlir/Dialect/TT/Utils/MemoryLayoutAnalysisParams.h" #include "ttmlir/Dialect/TTNN/Analysis/Edge.h" #include "ttmlir/Dialect/TTNN/Analysis/L1ChainConfig.h" #include "ttmlir/Dialect/TTNN/Analysis/TTNNAnalysis.h" +#include "ttmlir/Dialect/TTNN/Utils/MemoryLayoutAnalysisParams.h" namespace mlir::tt::ttnn { diff --git a/include/ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.td b/include/ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.td index e45fba0031..7d5b10abb0 100644 --- a/include/ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.td +++ b/include/ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.td @@ -148,6 +148,9 @@ def TTNN_TTNNLayoutAttr: TTNN_Attr<"TTNNLayout", "ttnn_layout"> { bool hasShardedTensorMemoryLayout() const; bool hasShardedL1TensorMemoryLayout() const; bool hasInterleavedL1TensorMemoryLayout() const; + bool hasInterleavedDRAMTensorMemoryLayout() const; + bool hasL1BufferType() const; + bool hasDRAMBufferType() const; bool isTiled() const; Layout getLayout() const; Type getElementType() const; diff --git a/include/ttmlir/Dialect/TTNN/Pipelines/TTNNPipelines.h b/include/ttmlir/Dialect/TTNN/Pipelines/TTNNPipelines.h index 636d5f6238..58206039bb 100644 --- a/include/ttmlir/Dialect/TTNN/Pipelines/TTNNPipelines.h +++ b/include/ttmlir/Dialect/TTNN/Pipelines/TTNNPipelines.h @@ -5,9 +5,8 @@ #ifndef TTMLIR_DIALECT_TTNN_PIPELINES_TTNNPIPELINES_H #define TTMLIR_DIALECT_TTNN_PIPELINES_TTNNPIPELINES_H -#include "ttmlir/Dialect/TT/Utils/MemoryLayoutAnalysisParams.h" +#include "ttmlir/Dialect/TTNN/Utils/MemoryLayoutAnalysisParams.h" #include "ttmlir/Dialect/TTNN/Utils/PassOverrides.h" -#include "ttmlir/Dialect/TTNN/Utils/Utils.h" #include "mlir/Pass/PassOptions.h" diff --git a/include/ttmlir/Dialect/TT/Utils/MemoryLayoutAnalysisParams.h b/include/ttmlir/Dialect/TTNN/Utils/MemoryLayoutAnalysisParams.h similarity index 88% rename from include/ttmlir/Dialect/TT/Utils/MemoryLayoutAnalysisParams.h rename to include/ttmlir/Dialect/TTNN/Utils/MemoryLayoutAnalysisParams.h index 4a44e883da..5275e2340d 100644 --- a/include/ttmlir/Dialect/TT/Utils/MemoryLayoutAnalysisParams.h +++ b/include/ttmlir/Dialect/TTNN/Utils/MemoryLayoutAnalysisParams.h @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#ifndef TTMLIR_DIALECT_TT_UTILS_MEMORYLAYOUTANALYSISPARAMS_H -#define TTMLIR_DIALECT_TT_UTILS_MEMORYLAYOUTANALYSISPARAMS_H +#ifndef TTMLIR_DIALECT_TTNN_UTILS_MEMORYLAYOUTANALYSISPARAMS_H +#define TTMLIR_DIALECT_TTNN_UTILS_MEMORYLAYOUTANALYSISPARAMS_H #include #include @@ -49,4 +49,4 @@ struct 
MemoryLayoutAnalysisPolicyTypeParser } // namespace mlir::tt -#endif // TTMLIR_DIALECT_TT_UTILS_MEMORYLAYOUTANALYSISPARAMS_H +#endif // TTMLIR_DIALECT_TTNN_UTILS_MEMORYLAYOUTANALYSISPARAMS_H diff --git a/include/ttmlir/Dialect/TTNN/Utils/OptimizerOverrides.h b/include/ttmlir/Dialect/TTNN/Utils/OptimizerOverrides.h index c474106e3a..eccc62f26d 100644 --- a/include/ttmlir/Dialect/TTNN/Utils/OptimizerOverrides.h +++ b/include/ttmlir/Dialect/TTNN/Utils/OptimizerOverrides.h @@ -5,8 +5,8 @@ #ifndef TTMLIR_DIALECT_TTNN_UTILS_OPTIMIZEROVERRIDES_H #define TTMLIR_DIALECT_TTNN_UTILS_OPTIMIZEROVERRIDES_H -#include "ttmlir/Dialect/TT/Utils/MemoryLayoutAnalysisParams.h" #include "ttmlir/Dialect/TTNN/Pipelines/TTNNPipelines.h" +#include "ttmlir/Dialect/TTNN/Utils/MemoryLayoutAnalysisParams.h" #include "ttmlir/Dialect/TTNN/Utils/PassOverrides.h" namespace mlir::tt::ttnn { diff --git a/include/ttmlir/Scheduler/Scheduler.h b/include/ttmlir/Scheduler/Scheduler.h index 817271fdc9..5d41163311 100644 --- a/include/ttmlir/Scheduler/Scheduler.h +++ b/include/ttmlir/Scheduler/Scheduler.h @@ -23,6 +23,10 @@ class Scheduler { // Method to get the next set of schedulable operations llvm::SmallVector getScheduleableOps(); + // Method to check if an operation is either a TTIR op or a + // TTNN scheduleable op. + bool isTTShedulableOp(mlir::Operation *op); + // Method to check if an operation can be scheduled bool canSchedule(mlir::Operation *op); diff --git a/lib/Dialect/TTNN/Analysis/CMakeLists.txt b/lib/Dialect/TTNN/Analysis/CMakeLists.txt index 996064d791..640702f71c 100644 --- a/lib/Dialect/TTNN/Analysis/CMakeLists.txt +++ b/lib/Dialect/TTNN/Analysis/CMakeLists.txt @@ -15,6 +15,6 @@ add_mlir_dialect_library(MLIRTTNNAnalysis MLIRTTNNPassesIncGen MLIRTTOpsIncGen - LINK_LIBS + LINK_LIBS PUBLIC MLIRScheduler ) diff --git a/lib/Dialect/TTNN/Analysis/L1InterleavedPolicy.cpp b/lib/Dialect/TTNN/Analysis/L1InterleavedPolicy.cpp index c0b3ff102f..23c1b306ab 100644 --- a/lib/Dialect/TTNN/Analysis/L1InterleavedPolicy.cpp +++ b/lib/Dialect/TTNN/Analysis/L1InterleavedPolicy.cpp @@ -3,19 +3,23 @@ // SPDX-License-Identifier: Apache-2.0 #include "ttmlir/Dialect/TTNN/Analysis/L1InterleavedPolicy.h" -#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h" -#include "ttmlir/Dialect/TTNN/IR/TTNNOps.h" +#include "ttmlir/Dialect/TTNN/Analysis/L1ChainConfig.h" #include "ttmlir/Scheduler/Scheduler.h" namespace mlir::tt::ttnn { -uint64_t getOpOutputLayoutUsage( - Operation *op, - llvm::DenseMap> &legalLayouts, - DeviceAttr &deviceAttr) { - TTNNLayoutAttr opLayout = legalLayouts.lookup(op).front(); - assert(opLayout.hasInterleavedL1TensorMemoryLayout()); +uint64_t getOpOutputL1Usage(Operation *op, TTNNLayoutAttr opLayout, + DeviceAttr &deviceAttr) { + // In case the opLayout is not in L1 memory space, L1 memory usage is 0. + // + if (opLayout.hasDRAMBufferType()) { + return 0; + } + // L1 memory usage of the ops without output tensors cannot be calculated. + // So far, this is only false for ttnn.get_device op. 
+ // + assert(mlir::isa(op->getResult(0).getType())); llvm::ArrayRef opOutputTensorShape = mlir::cast(op->getResult(0).getType()).getShape(); @@ -24,132 +28,327 @@ uint64_t getOpOutputLayoutUsage( return opL1OutputUsage; } -void L1InterleavedPolicy::run() { - rootOp->walk([&](func::FuncOp func) { - DeviceAttr deviceAttr = getCurrentScopeDevice(func); - mlir::tt::scheduler::Scheduler scheduler(&func); - llvm::SmallVector scheduleableOps; - llvm::DenseMap selectedOpLayout; - Operation *currentOp = nullptr; +L1InterleavedPolicy::OpConfig L1InterleavedPolicy::getGreedyConfig( + Operation *baseOp, llvm::DenseMap &opsL1Usage) { + uint64_t numOfOps, bitIndex, currentMask; + uint64_t currentL1Usage, optimalL1Usage; + llvm::DenseMap optimalLayouts; + llvm::SmallVector optimalPrecedence; + + constexpr uint64_t maxNumOfOps = sizeof(numOfOps) * 8; + numOfOps = opsL1Usage.size(); + assert(numOfOps <= maxNumOfOps); + + optimalL1Usage = 0; + for (currentMask = 0; currentMask < (1 << numOfOps); currentMask++) { + std::bitset bitset(currentMask); + llvm::DenseMap currentLayouts; + llvm::SmallVector currentPrecedence, optimalL1Precedence, + L1Precedence; - // TODO(fbajraktari): Add algorithm description. Currently, the algorithm - // is the same as for DFSharding policy, but works only for L1 interleaved. + // Calculate the L1 usage of the current configuration. // - l1ChainConfigs->push_back(L1ChainConfig()); - while (scheduler.hasUnscheduledOps()) { - scheduleableOps = scheduler.getScheduleableOps(); + currentL1Usage = 0; + bitIndex = 0; + for (const auto &[op, l1Usage] : opsL1Usage) { + if (bitset[bitIndex]) { + // In case we have an operand with L1 interleaved layout, we need to + // figure out its schedule among the other operands with L1 interleaved + // layout. Therefore, we insert all of them into the L1Precedence where + // calculate the optimal L1Precedence and then concatenate it with the + // currentPrecedence. + // + currentL1Usage += l1Usage.outputL1Usage; + currentLayouts[op] = getL1InterleavedLayout(op); + + // Skip the baseOp. + // + if (baseOp != op) { + L1Precedence.push_back(op); + } + } else { + // It is optimal to first schedule all ops with DRAM output layout. + // Therefore, we can directly insert them into the + // currentOptimalPrecedence. + // + currentLayouts[op] = getDRAMLayout(op); - // Before starting a l1 chain, schedule layout/memory management ops - // first until they are exhausted from schedulable ops. + // Skip the baseOp. + // + if (baseOp != op) { + currentPrecedence.push_back(op); + } + } + bitIndex += 1; + } + + // Calculate the optimal L1Precedence. + // + bool isMaskLegal = false; + uint64_t minRequiredL1Usage = getAvailableL1CacheSize(); + + std::sort(L1Precedence.begin(), L1Precedence.end()); + do { + // Check if the current order of L1Precedence is legal. // - if (l1ChainConfigs->back().isEmpty()) { - for (auto *op : scheduleableOps) { - if (isa(op)) { - currentOp = op; - break; - } + bool isLegal = true; + uint64_t intermediateL1Usage = 0; + uint64_t intermediateRequiredL1Usage = 0; + for (Operation *op : L1Precedence) { + if (intermediateL1Usage + opsL1Usage[op].requiredL1Usage > + getAvailableL1CacheSize()) { + isLegal = false; + break; } + + intermediateRequiredL1Usage = + std::max(intermediateRequiredL1Usage, + intermediateL1Usage + opsL1Usage[op].requiredL1Usage); + intermediateL1Usage += opsL1Usage[op].outputL1Usage; } - if (currentOp == nullptr) { - currentOp = scheduleableOps[0]; + // Pick optimal L1Precedence among all legal L1Precedence. 
+ // The one that requires the least amount of L1 cache overall is + // considered optimal. + // + if (isLegal && intermediateRequiredL1Usage < minRequiredL1Usage) { + isMaskLegal = true; + minRequiredL1Usage = intermediateRequiredL1Usage; + optimalL1Precedence = L1Precedence; } + } while (std::next_permutation(L1Precedence.begin(), L1Precedence.end())); + + if (isMaskLegal && optimalL1Usage < currentL1Usage && + currentL1Usage <= getAvailableL1CacheSize()) { - // Schedule currentOp. + // Append the legal L1Precedence to the currentPrecedence and therefore + // create a complete precedence for the baseOp and currentMask. // - scheduler.scheduleOp(currentOp); + currentPrecedence.insert(currentPrecedence.end(), + optimalL1Precedence.begin(), + optimalL1Precedence.end()); - // Skip starting sharding chain if currentOp is a memory management op. + // Update the optimal configuration. // - if (l1ChainConfigs->back().isEmpty() && isa(currentOp)) { - currentOp = nullptr; - continue; - } + optimalL1Usage = currentL1Usage; + optimalLayouts = std::move(currentLayouts); + optimalPrecedence = std::move(currentPrecedence); + } + } - if (scheduler.hasUnscheduledOps()) { - scheduleableOps = scheduler.getScheduleableOps(); + // Create the optimal config. + // + OpConfig optimalConfig; + optimalConfig.baseOp = baseOp; + optimalConfig.layouts = std::move(optimalLayouts); + optimalConfig.precedence = std::move(optimalPrecedence); - // Check if currentOp has a valid successor. + return optimalConfig; +} + +void L1InterleavedPolicy::run() { + for (Operation &funcOp : rootOp->getRegion(0).getOps()) { + func::FuncOp func = dyn_cast(funcOp); + DeviceAttr deviceAttr = getCurrentScopeDevice(func); + + // Start the policy. + // + llvm::DenseMap OpMemSpecMap; + mlir::tt::scheduler::Scheduler scheduler(&func); + llvm::SmallVector scheduleableOps; + + while (scheduler.hasUnscheduledOps()) { + scheduleableOps = scheduler.getScheduleableOps(); + + for (Operation *op : scheduleableOps) { + // Schedule the op. // - Operation *nextOp = nullptr; - for (auto *op : scheduleableOps) { - for (auto operand : op->getOperands()) { - if (operand.getDefiningOp() == currentOp) { - nextOp = op; - break; - } + scheduler.scheduleOp(op); + + // Find optimal configuration for the op. + // + llvm::DenseMap opsL1Usage; + llvm::SmallVector opsPrecedence; + + // Generate optimal configuration for the current op based on the + // outputs of its operands and its legal output layouts. + // + if (isAnalyzable(op)) { + + // Create the OpMemSpec. + // + OpMemSpec OpMemSpec; + assert(hasDRAMBufferType(op)); + OpMemSpec.layout = getDRAMLayout(op); + OpMemSpec.requiredL1Usage = 0; + OpMemSpecMap[op] = OpMemSpec; + + if (op->hasOneUse() && hasL1BufferType(op)) { + L1Usage l1Usage; + l1Usage.outputL1Usage = + getOpOutputL1Usage(op, getL1InterleavedLayout(op), deviceAttr); + l1Usage.requiredL1Usage = 0; + opsL1Usage[op] = l1Usage; } } - if (nextOp) { + for (auto operand : op->getOperands()) { + // Skip block arguments (%arg0, %arg1, ...) + // + if (::llvm::isa(operand)) { + continue; + } - // V1: Check that currentOp is not fork/join op. + Operation *operandOp = operand.getDefiningOp(); + + // Skip non-analyzable operands. // - bool validForL1Interleaved = - currentOp->hasOneUse() && - legalLayouts.lookup(currentOp).size() > 0 && - legalLayouts.lookup(nextOp).size() > 0; - - if (validForL1Interleaved) { - // Figure out this const based on exec data, but will be replaced - // with API. 
+ if (isAnalyzable(operandOp)) { + TTNNLayoutAttr operandOpLayout = OpMemSpecMap[operandOp].layout; + + // Take into consideration only the operands with L1 interleaved + // memory space. // - constexpr float tensorL1UsageCap = 0.8; - uint64_t currentOpL1OutputUsage = - getOpOutputLayoutUsage(currentOp, legalLayouts, deviceAttr); - uint64_t nextOpL1OutputUsage = - getOpOutputLayoutUsage(nextOp, legalLayouts, deviceAttr); - bool l1UsageValid = (currentOpL1OutputUsage + nextOpL1OutputUsage) < - tensorL1UsageCap * usableL1CacheSize; - - if (l1UsageValid) { - selectedOpLayout[currentOp] = - legalLayouts.lookup(currentOp).front(); - - // Add currentOp to l1 chain config. - // - OpL1MemSpec shardSpec; - shardSpec.op = currentOp; - - // Hardcoded tensor split factor for now, until pipeline OP - // support is added. - // - shardSpec.tensorSplitFactor = 1; - l1ChainConfigs->back().addOpL1MemSpec(std::move(shardSpec)); - - // Update currentOp pointer. - // - currentOp = nextOp; - continue; + if (operandOpLayout.hasInterleavedL1TensorMemoryLayout()) { + L1Usage l1Usage; + l1Usage.outputL1Usage = + getOpOutputL1Usage(operandOp, operandOpLayout, deviceAttr); + l1Usage.requiredL1Usage = OpMemSpecMap[operandOp].requiredL1Usage; + opsL1Usage[operandOp] = l1Usage; + } + // In case the operand has DRAM layout, we can insert it into the + // precedence directly. If the op is analyzable, it means that it + // is definitely schedulable. + // + else { + opsPrecedence.push_back(operandOp); + } + } + // In case the operand is not analyzable, i.e. there are no legal + // layouts for this operand, we can insert it into the precedence + // directly if it is schedulable since it does not use DRAM nor L1 + // memory. + // + else { + if (scheduler.isTTShedulableOp(operandOp)) { + opsPrecedence.push_back(operandOp); } } } - currentOp = nullptr; - if (!l1ChainConfigs->back().isEmpty()) { - l1ChainConfigs->back().build(); - l1ChainConfigs->push_back(L1ChainConfig()); + // Greedily find the optimal configuration. + // + OpConfig optimalConfig = getGreedyConfig(op, opsL1Usage); + for (const auto &[op, layout] : optimalConfig.layouts) { + OpMemSpecMap[op].layout = layout; + } + + // Override op's precedence. + // + opsPrecedence.insert(opsPrecedence.end(), + optimalConfig.precedence.begin(), + optimalConfig.precedence.end()); + precedenceMap[op] = std::move(opsPrecedence); + + // Update op's requiredL1Usage if the op is analyzable. + // + if (isAnalyzable(op)) { + uint64_t intermediateRequiredL1Usage = 0; + uint64_t intermediateL1Usage = 0; + for (auto operand : op->getOperands()) { + // Skip block arguments (%arg0, %arg1, ...) + // + if (::llvm::isa(operand)) { + continue; + } + + Operation *operandOp = operand.getDefiningOp(); + + // Skip non-analyzable operands. + // + if (isAnalyzable(operandOp)) { + intermediateRequiredL1Usage = + std::max(intermediateRequiredL1Usage, + intermediateL1Usage + + OpMemSpecMap[operandOp].requiredL1Usage); + intermediateL1Usage += getOpOutputL1Usage( + operandOp, OpMemSpecMap[operandOp].layout, deviceAttr); + } + } + OpMemSpecMap[op].requiredL1Usage = std::max( + intermediateRequiredL1Usage, + intermediateL1Usage + + getOpOutputL1Usage(op, OpMemSpecMap[op].layout, deviceAttr)); } } } - if (l1ChainConfigs->back().isEmpty()) { - l1ChainConfigs->pop_back(); - } + // Construct the schedule. + // + constructSchedule(func); - // Schedule + // Build, Resolve and Complete the L1 chain. + // This implementation is only here unitl we are able to merge + // L1ChainConfigs. 
+ // TODO(fbajraktari): Fix this hack. // - (*schedule)[func] = scheduler.getSchedule(); + l1ChainConfigs->push_back(L1ChainConfig()); + llvm::DenseMap selectedOpLayout; + for (auto &OpMemSpec : OpMemSpecMap) { + OpL1MemSpec opL1MemSpec; + opL1MemSpec.op = OpMemSpec.first; + opL1MemSpec.tensorSplitFactor = 1; + selectedOpLayout[OpMemSpec.first] = OpMemSpec.second.layout; + l1ChainConfigs->back().addOpL1MemSpec(opL1MemSpec); + } + l1ChainConfigs->back().build(); + l1ChainConfigs->back().resolve(); + std::unordered_set memReconfigEdges; + l1ChainConfigs->back().complete(selectedOpLayout, memReconfigEdges); + } +} - // Resolve l1 chain configs. +bool L1InterleavedPolicy::isAnalyzable(Operation *op) { + // Skip operations that are not analyzed by the LegalGridAnalysis. + // + if (legalLayouts.count(op) > 0) { + // Skip operations that are filterd out by the MemoryLayoutAnalysis. // - for (auto &l1ChainConfig : *l1ChainConfigs) { - l1ChainConfig.resolve(); + return legalLayouts[op].size() > 0; + } + return false; +} - std::unordered_set memReconfigEdges; - l1ChainConfig.complete(selectedOpLayout, memReconfigEdges); - } - }); +bool L1InterleavedPolicy::hasDRAMBufferType(Operation *op) { + return std::find_if(legalLayouts[op].begin(), legalLayouts[op].end(), + [](TTNNLayoutAttr layout) { + return layout.hasDRAMBufferType(); + }) != legalLayouts[op].end(); +} + +TTNNLayoutAttr L1InterleavedPolicy::getDRAMLayout(Operation *op) { + assert(hasDRAMBufferType(op)); + auto dramLayoutIter = std::find_if( + legalLayouts[op].begin(), legalLayouts[op].end(), + [](TTNNLayoutAttr layout) { return layout.hasDRAMBufferType(); }); + return *dramLayoutIter; +} + +bool L1InterleavedPolicy::hasL1BufferType(Operation *op) { + return std::find_if(legalLayouts[op].begin(), legalLayouts[op].end(), + [](TTNNLayoutAttr layout) { + return layout.hasInterleavedL1TensorMemoryLayout(); + }) != legalLayouts[op].end(); +} + +TTNNLayoutAttr L1InterleavedPolicy::getL1InterleavedLayout(Operation *op) { + assert(hasL1BufferType(op)); + auto l1InterleaveLayoutIter = + std::find_if(legalLayouts[op].begin(), legalLayouts[op].end(), + [](TTNNLayoutAttr layout) { + return layout.hasInterleavedL1TensorMemoryLayout(); + }); + return *l1InterleaveLayoutIter; } } // namespace mlir::tt::ttnn diff --git a/lib/Dialect/TTNN/Analysis/LegalGridAnalysis.cpp b/lib/Dialect/TTNN/Analysis/LegalGridAnalysis.cpp index b01f4cf384..9bbbccf5ea 100644 --- a/lib/Dialect/TTNN/Analysis/LegalGridAnalysis.cpp +++ b/lib/Dialect/TTNN/Analysis/LegalGridAnalysis.cpp @@ -115,6 +115,14 @@ void LegalGridAnalysis::analysisImplementation() { return; } + if (!isa(op->getResult(0).getType())) { + return; + } + + if (llvm::isa(op)) { + return; + } + // Get output tensor type. 
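As a summary of the greedy search that L1InterleavedPolicy::getGreedyConfig implements above: for each op it enumerates every subset of {the op, its L1-capable operands} to keep in L1 interleaved memory, checks that some ordering of the L1 operands fits the budget, and keeps the subset with the largest total L1 usage. The Python sketch below is a simplified restatement, not part of the patch; op handles, the ops_l1_usage map (op -> (output L1 usage, required L1 usage)) and available_l1 are illustrative stand-ins for the corresponding C++ values, and edge cases may be resolved slightly differently than the C++ code.

from itertools import permutations

def greedy_config(base_op, ops_l1_usage, available_l1):
    # ops_l1_usage covers base_op and those operands that have a legal
    # L1 interleaved layout; everything not selected stays in DRAM.
    ops = list(ops_l1_usage)
    best_usage, best_in_l1, best_precedence = 0, set(), []
    for mask in range(1 << len(ops)):
        in_l1 = {op for i, op in enumerate(ops) if mask & (1 << i)}
        usage = sum(ops_l1_usage[op][0] for op in in_l1)
        if not (best_usage < usage <= available_l1):
            continue
        # Operands that stay in DRAM are scheduled first, in any order.
        dram_first = [op for op in ops if op != base_op and op not in in_l1]
        # Among the L1 operands, look for a legal order (peak L1 demand
        # stays within the budget) with the smallest peak requirement.
        l1_ops = [op for op in in_l1 if op != base_op]
        best_peak, best_order = available_l1, None
        for order in permutations(l1_ops):
            live, peak, legal = 0, 0, True
            for op in order:
                out_usage, required = ops_l1_usage[op]
                if live + required > available_l1:
                    legal = False
                    break
                peak = max(peak, live + required)
                live += out_usage
            if legal and peak < best_peak:
                best_peak, best_order = peak, list(order)
        if best_order is None:
            continue
        best_usage, best_in_l1 = usage, in_l1
        best_precedence = dram_first + best_order
    # Ops to keep in L1 interleaved memory, and the operand schedule.
    return best_in_l1, best_precedence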
RankedTensorType tensorType = mlir::cast(op->getResult(0).getType()); diff --git a/lib/Dialect/TTNN/Analysis/MemoryLayoutAnalysis.cpp b/lib/Dialect/TTNN/Analysis/MemoryLayoutAnalysis.cpp index a89c5842b9..f3db4ed7bf 100644 --- a/lib/Dialect/TTNN/Analysis/MemoryLayoutAnalysis.cpp +++ b/lib/Dialect/TTNN/Analysis/MemoryLayoutAnalysis.cpp @@ -5,6 +5,7 @@ #include "ttmlir/Dialect/TTNN/Analysis/MemoryLayoutAnalysis.h" #include "ttmlir/Dialect/TTNN/Analysis/DFShardingPolicy.h" #include "ttmlir/Dialect/TTNN/Analysis/L1InterleavedPolicy.h" +#include "ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.h" namespace mlir::tt::ttnn { @@ -35,14 +36,15 @@ filterShardedOnly(const llvm::DenseMap> } llvm::DenseMap> -filterL1InterleavedOnly( +filterDRAMAndL1Interleaved( const llvm::DenseMap> &legalLayouts) { llvm::DenseMap> l1InterleavedLayouts; for (const auto &opLayouts : legalLayouts) { std::vector opL1InterleavedLayouts; for (const auto &layout : opLayouts.second) { - if (layout.hasInterleavedL1TensorMemoryLayout()) { + if (layout.hasDRAMBufferType() || + layout.hasInterleavedL1TensorMemoryLayout()) { opL1InterleavedLayouts.push_back(layout); } } @@ -68,7 +70,8 @@ void MemoryLayoutAnalysis::analysisImplementation() { } case MemoryLayoutAnalysisPolicyType::L1Interleaved: { L1InterleavedPolicy l1InterleavedPolicy( - op, l1ChainConfigs, filterL1InterleavedOnly(analysisInput.legalLayouts), + op, l1ChainConfigs, + filterDRAMAndL1Interleaved(analysisInput.legalLayouts), analysisResult.schedule, analysisInput.usableL1CacheSize); l1InterleavedPolicy.run(); break; diff --git a/lib/Dialect/TTNN/IR/TTNNOpsAttrs.cpp b/lib/Dialect/TTNN/IR/TTNNOpsAttrs.cpp index 8aaae12618..10b54f418b 100644 --- a/lib/Dialect/TTNN/IR/TTNNOpsAttrs.cpp +++ b/lib/Dialect/TTNN/IR/TTNNOpsAttrs.cpp @@ -24,6 +24,11 @@ inline bool isDeviceBufferType(BufferType bufferType) { return bufferType == BufferType::DRAM || bufferType == BufferType::L1; } +// Check if tensor is in DRAM memory +inline bool isDRAMBufferType(BufferType bufferType) { + return bufferType == BufferType::DRAM; +} + // Check if tensor is in L1 memory inline bool isL1BufferType(BufferType bufferType) { return bufferType == BufferType::L1; @@ -39,6 +44,16 @@ Layout TTNNLayoutAttr::getLayout() const { return isTiled() ? 
Layout::Tile : Layout::RowMajor; } +// Check if the tensor memory buffer type is L1 +bool TTNNLayoutAttr::hasL1BufferType() const { + return isL1BufferType(getBufferType()); +} + +// Check if the tensor memory buffer type is DRAM +bool TTNNLayoutAttr::hasDRAMBufferType() const { + return isDRAMBufferType(getBufferType()); +} + // Check if the tensor memory layout is sharded bool TTNNLayoutAttr::hasShardedTensorMemoryLayout() const { return (getMemLayout() == TensorMemoryLayout::HeightSharded || @@ -48,7 +63,7 @@ bool TTNNLayoutAttr::hasShardedTensorMemoryLayout() const { // Check if the tensor memory layout is sharded in L1 memory bool TTNNLayoutAttr::hasShardedL1TensorMemoryLayout() const { - return isL1BufferType(getBufferType()) && + return hasL1BufferType() && (getMemLayout() == TensorMemoryLayout::HeightSharded || getMemLayout() == TensorMemoryLayout::WidthSharded || getMemLayout() == TensorMemoryLayout::BlockSharded); @@ -56,7 +71,13 @@ bool TTNNLayoutAttr::hasShardedL1TensorMemoryLayout() const { // Check if the tensor memory layout is interleaved and in L1 memory bool TTNNLayoutAttr::hasInterleavedL1TensorMemoryLayout() const { - return isL1BufferType(getBufferType()) && + return hasL1BufferType() && + (getMemLayout() == TensorMemoryLayout::Interleaved); +} + +// Check if the tensor memory layout is interleaved and in DRAM memory +bool TTNNLayoutAttr::hasInterleavedDRAMTensorMemoryLayout() const { + return hasDRAMBufferType() && (getMemLayout() == TensorMemoryLayout::Interleaved); } diff --git a/lib/Dialect/TTNN/Transforms/Optimizer.cpp b/lib/Dialect/TTNN/Transforms/Optimizer.cpp index e5d2f86d86..783f3ea07f 100644 --- a/lib/Dialect/TTNN/Transforms/Optimizer.cpp +++ b/lib/Dialect/TTNN/Transforms/Optimizer.cpp @@ -170,6 +170,10 @@ class TTNNOptimizer : public impl::TTNNOptimizerBase { return; } + if (llvm::isa(op)) { + return; + } + RankedTensorType tensorType = mlir::cast(op->getResult(0).getType()); LegalGridAnalysis legalGridAnalysis = diff --git a/lib/Scheduler/Scheduler.cpp b/lib/Scheduler/Scheduler.cpp index 25923fffdf..52066c5e87 100644 --- a/lib/Scheduler/Scheduler.cpp +++ b/lib/Scheduler/Scheduler.cpp @@ -12,7 +12,8 @@ namespace mlir::tt::scheduler { -bool isTTNNOp(mlir::Operation *op) { +// TTNN op is scheduleable if it is not an EmptyOp and has at least one result. 
+bool isTTNNScheduleableOp(mlir::Operation *op) { return isa(op->getDialect()) && op->getNumResults() > 0 && !llvm::isa(op); } @@ -21,8 +22,8 @@ bool isTTIROp(mlir::Operation *op) { return isa(op->getDialect()); } -bool isTTShedulableOp(mlir::Operation *op) { - return isTTNNOp(op) || isTTIROp(op); +bool Scheduler::isTTShedulableOp(mlir::Operation *op) { + return isTTNNScheduleableOp(op) || isTTIROp(op); } // Init the dependencies map of all ops which are TTIR ops diff --git a/test/ttmlir/Silicon/TTNN/optimizer/all_l1_interleaved_policy.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/all_l1_interleaved_policy.mlir similarity index 79% rename from test/ttmlir/Silicon/TTNN/optimizer/all_l1_interleaved_policy.mlir rename to test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/all_l1_interleaved_policy.mlir index 6fa884d79f..11eb41da17 100644 --- a/test/ttmlir/Silicon/TTNN/optimizer/all_l1_interleaved_policy.mlir +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/all_l1_interleaved_policy.mlir @@ -1,30 +1,27 @@ -// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path% enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s > %t.mlir -// RUN: FileCheck %s --input-file=%t.mlir -// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s #any_device = #tt.operand_constraint module attributes {} { func.func @forward(%arg0: tensor<64x128xbf16>, %arg1: tensor<128x96xbf16>, %arg2: tensor<64x96xbf16>, %arg3: tensor<96x32xbf16>, %arg4: tensor<64x32xbf16>) -> tensor<64x32xbf16> { // CHECK: #[[L1_:.*]] = #ttnn.buffer_type - // CHECK: #[[LAYOUT_6:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<{{.*}}, #l1_>, interleaved> // CHECK: #[[LAYOUT_7:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<{{.*}}, #l1_>, interleaved> - // CHECK: #[[LAYOUT_8:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<{{.*}}, #dram>, interleaved> + // CHECK: #[[LAYOUT_10:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<{{.*}}, #l1_>, interleaved> %0 = tensor.empty() : tensor<64x96xbf16> - // CHECK: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<64x96xbf16, #[[LAYOUT_6]]> + // CHECK: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<64x96xbf16, #[[LAYOUT_7]]> %1 = "ttir.matmul"(%arg0, %arg1, %0) <{operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x128xbf16>, tensor<128x96xbf16>, tensor<64x96xbf16>) -> tensor<64x96xbf16> %2 = tensor.empty() : tensor<64x96xbf16> - // CHECK: %{{.*}} = "ttnn.add"{{.*}} -> tensor<64x96xbf16, #[[LAYOUT_6]]> + // CHECK: %{{.*}} = "ttnn.add"{{.*}} -> tensor<64x96xbf16, #[[LAYOUT_7]]> %3 = "ttir.add"(%1, %arg2, %2) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x96xbf16>, tensor<64x96xbf16>, tensor<64x96xbf16>) -> tensor<64x96xbf16> %4 = tensor.empty() : tensor<64x96xbf16> - // CHECK: %{{.*}} = "ttnn.relu"{{.*}} -> tensor<64x96xbf16, #[[LAYOUT_6]]> + // CHECK: %{{.*}} = "ttnn.relu"{{.*}} -> tensor<64x96xbf16, #[[LAYOUT_7]]> %5 = "ttir.relu"(%3, %4) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x96xbf16>, tensor<64x96xbf16>) -> tensor<64x96xbf16> %6 = tensor.empty() : tensor<64x32xbf16> - // CHECK: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<64x32xbf16, 
#[[LAYOUT_7]]> + // CHECK: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<64x32xbf16, #[[LAYOUT_10]]> %7 = "ttir.matmul"(%5, %arg3, %6) <{operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x96xbf16>, tensor<96x32xbf16>, tensor<64x32xbf16>) -> tensor<64x32xbf16> %8 = tensor.empty() : tensor<64x32xbf16> - // CHECK: %{{.*}} = "ttnn.add"{{.*}} -> tensor<64x32xbf16, #[[LAYOUT_7]]> + // CHECK: %{{.*}} = "ttnn.add"{{.*}} -> tensor<64x32xbf16, #[[LAYOUT_10]]> %9 = "ttir.add"(%7, %arg4, %8) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x32xbf16>, tensor<64x32xbf16>, tensor<64x32xbf16>) -> tensor<64x32xbf16> %10 = tensor.empty() : tensor<64x32xbf16> - // CHECK: %{{.*}} = "ttnn.relu"{{.*}} -> tensor<64x32xbf16, #[[LAYOUT_8]]> + // CHECK: %{{.*}} = "ttnn.relu"{{.*}} -> tensor<64x32xbf16, #[[LAYOUT_10]]> %11 = "ttir.relu"(%9, %10) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x32xbf16>, tensor<64x32xbf16>) -> tensor<64x32xbf16> return %11 : tensor<64x32xbf16> } diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/fork_join.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/fork_join.mlir new file mode 100644 index 0000000000..fef8cdd489 --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/fork_join.mlir @@ -0,0 +1,45 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s +// +// A +// | +// B +// / \ +// C D +// | | +// | E +// \ / +// F +// | +// G +// +// This tests two things: +// 1. Output of op B (fork op) should be in DRAM. +// 2. Even though both precedence [C, E] and [E, C] for op F are legal, +// the optimizer should choose the one with lower requiredL1Usage. In +// this case, [E, C] should be chosen. 
+// +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<64x64xbf16>, %arg1: tensor<64x32xbf16>) -> tensor<64x32xbf16> { + // CHECK: #[[L1_:.*]] = #ttnn.buffer_type + // CHECK: #[[LAYOUT_3:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <8x8>, memref<8x8xbf16, #dram>, interleaved> + // CHECK: #[[LAYOUT_5:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <8x8>, memref<8x4xbf16, #l1_>, interleaved> + // CHECK: #[[LAYOUT_6:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <8x8>, memref<8x8xbf16, #l1_>, interleaved> + %0 = tensor.empty() : tensor<64x64xbf16> + // CHECK: %{{.*}} = "ttnn.relu"{{.*}} -> tensor<64x64xbf16, #[[LAYOUT_3]]> + %1 = "ttir.relu"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x64xbf16>, tensor<64x64xbf16>) -> tensor<64x64xbf16> + %2 = tensor.empty() : tensor<64x64xbf16> + %3 = "ttir.relu"(%1, %2) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x64xbf16>, tensor<64x64xbf16>) -> tensor<64x64xbf16> + %4 = tensor.empty() : tensor<64x32xbf16> + %5 = "ttir.matmul"(%1, %arg1, %4) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x64xbf16>, tensor<64x32xbf16>, tensor<64x32xbf16>) -> tensor<64x32xbf16> + %6 = tensor.empty() : tensor<64x32xbf16> + %7 = "ttir.relu"(%5, %6) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x32xbf16>, tensor<64x32xbf16>) -> tensor<64x32xbf16> + %8 = tensor.empty() : tensor<64x32xbf16> + // CHECK: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<64x32xbf16, #[[LAYOUT_5]]> + // CHECK: %{{.*}} = "ttnn.relu"{{.*}} -> tensor<64x32xbf16, #[[LAYOUT_5]]> + // CHECK: %{{.*}} = "ttnn.relu"{{.*}} -> tensor<64x64xbf16, #[[LAYOUT_6]]> + // CHECK: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<64x32xbf16, #[[LAYOUT_5]]> + %9 = "ttir.matmul"(%3, %7, %8) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x64xbf16>, tensor<64x32xbf16>, tensor<64x32xbf16>) -> tensor<64x32xbf16> + return %9 : tensor<64x32xbf16> + } +} diff --git a/test/ttmlir/Silicon/TTNN/optimizer/mnist_l1_interleaved.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/mnist_l1_interleaved.mlir similarity index 88% rename from test/ttmlir/Silicon/TTNN/optimizer/mnist_l1_interleaved.mlir rename to test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/mnist_l1_interleaved.mlir index ef6fae268e..93a19ad6e3 100644 --- a/test/ttmlir/Silicon/TTNN/optimizer/mnist_l1_interleaved.mlir +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/mnist_l1_interleaved.mlir @@ -1,13 +1,11 @@ -// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path% enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s > %t.mlir -// RUN: FileCheck %s --input-file=%t.mlir -// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s #any_device = #tt.operand_constraint #loc = loc("MNISTLinear":4294967295:0) module @"tt-forge-graph" attributes {} { func.func @main(%arg0: tensor<1x784xf32> loc("MNISTLinear":4294967295:0), %arg1: tensor<1x10xf32> loc("MNISTLinear":4294967295:0), %arg2: tensor<256x10xf32> loc("MNISTLinear":4294967295:0), %arg3: tensor<1x256xf32> 
loc("MNISTLinear":4294967295:0), %arg4: tensor<784x256xf32> loc("MNISTLinear":4294967295:0)) -> tensor<1x10xf32> { + // CHECK: #[[L1_:.*]] = #ttnn.buffer_type // CHECK: #[[LAYOUT_6:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<{{.*}}, #l1_>, interleaved> // CHECK: #[[LAYOUT_7:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<{{.*}}, #l1_>, interleaved> - // CHECK: #[[LAYOUT_8:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<{{.*}}, #dram>, interleaved> %0 = tensor.empty() : tensor<1x256xf32> loc(#loc8) // CHECK: %[[C:.*]] = "ttnn.matmul"[[C:.*]] -> tensor<1x256xf32, #[[LAYOUT_6]]> %1 = "ttir.matmul"(%arg0, %arg4, %0) <{operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<1x784xf32>, tensor<784x256xf32>, tensor<1x256xf32>) -> tensor<1x256xf32> loc(#loc8) @@ -24,7 +22,7 @@ module @"tt-forge-graph" attributes {} { // CHECK: %[[C:.*]] = "ttnn.add"[[C:.*]] -> tensor<1x10xf32, #[[LAYOUT_7]]> %9 = "ttir.add"(%7, %arg1, %8) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32> loc(#loc12) %10 = tensor.empty() : tensor<1x10xf32> loc(#loc13) - // CHECK: %{{.*}} = "ttnn.softmax"{{.*}} -> tensor<1x10xf32, #[[LAYOUT_8]]> + // CHECK: %{{.*}} = "ttnn.softmax"{{.*}} -> tensor<1x10xf32, #[[LAYOUT_7]]> %11 = "ttir.softmax"(%9, %10) <{dimension = 1 : si32, operand_constraints = [#any_device, #any_device]}> : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32> loc(#loc13) return %11 : tensor<1x10xf32> loc(#loc7) } loc(#loc) diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_ABC_l1_None.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_ABC_l1_None.mlir new file mode 100644 index 0000000000..acbb8d674a --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_ABC_l1_None.mlir @@ -0,0 +1,28 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s +// +// A B +// \ / +// C +// | +// D +// +// (A > L1) AND (B > L1) AND (C > L1) +// => +// DRAM: ABC; L1: None +// +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<8192x8192xbf16>, %arg1: tensor<8192x8192xbf16>, %arg2: tensor<8192x8192xbf16>, %arg3: tensor<8192x8192xbf16>) -> tensor<8192x8192xbf16> { + // CHECK-DAG: #[[LAYOUT_2:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<1024x1024xbf16, #dram>, interleaved> + %0 = tensor.empty() : tensor<8192x8192xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<8192x8192xbf16, #[[LAYOUT_2]]> + %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<8192x8192xbf16>, tensor<8192x8192xbf16>, tensor<8192x8192xbf16>) -> tensor<8192x8192xbf16> + %2 = tensor.empty() : tensor<8192x8192xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<8192x8192xbf16, #[[LAYOUT_2]]> + %3 = "ttir.add"(%arg2, %arg3, %2) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<8192x8192xbf16>, tensor<8192x8192xbf16>, tensor<8192x8192xbf16>) -> tensor<8192x8192xbf16> + %4 = tensor.empty() : tensor<8192x8192xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<8192x8192xbf16, #[[LAYOUT_2]]> + %5 = "ttir.matmul"(%1, %3, 
%4) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<8192x8192xbf16>, tensor<8192x8192xbf16>, tensor<8192x8192xbf16>) -> tensor<8192x8192xbf16> + return %5 : tensor<8192x8192xbf16> + } +} diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_AB_l1_C.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_AB_l1_C.mlir new file mode 100644 index 0000000000..49aebb6a4c --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_AB_l1_C.mlir @@ -0,0 +1,31 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s +// +// A B +// \ / +// C +// | +// D +// +// (A + C > L1) AND (B + C > L1) AND (A + B > L1) AND (A < C) AND (B < C) AND (C <= L1) +// => +// DRAM: AB; L1: C +// +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<5120x4096xbf16>, %arg1: tensor<5120x4096xbf16>, %arg2: tensor<4096x5120xbf16>, %arg3: tensor<4096x5120xbf16>) -> tensor<5120x5120xbf16> { + // CHECK: #[[L1_:.*]] = #ttnn.buffer_type + // CHECK-DAG: #[[LAYOUT_4:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<512x640xbf16, #dram>, interleaved> + // CHECK-DAG: #[[LAYOUT_6:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<640x512xbf16, #dram>, interleaved> + // CHECK-DAG: #[[LAYOUT_7:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<640x640xbf16, #l1_>, interleaved> + %0 = tensor.empty() : tensor<5120x4096xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<5120x4096xbf16, #[[LAYOUT_6]]> + %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<5120x4096xbf16>, tensor<5120x4096xbf16>, tensor<5120x4096xbf16>) -> tensor<5120x4096xbf16> + %2 = tensor.empty() : tensor<4096x5120xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<4096x5120xbf16, #[[LAYOUT_4]]> + %3 = "ttir.add"(%arg2, %arg3, %2) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<4096x5120xbf16>, tensor<4096x5120xbf16>, tensor<4096x5120xbf16>) -> tensor<4096x5120xbf16> + %4 = tensor.empty() : tensor<5120x5120xbf16> + // CHECK: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<5120x5120xbf16, #[[LAYOUT_7]]> + %5 = "ttir.matmul"(%1, %3, %4) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<5120x4096xbf16>, tensor<4096x5120xbf16>, tensor<5120x5120xbf16>) -> tensor<5120x5120xbf16> + return %5 : tensor<5120x5120xbf16> + } +} diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_AC_l1_B.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_AC_l1_B.mlir new file mode 100644 index 0000000000..7f41675cd4 --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_AC_l1_B.mlir @@ -0,0 +1,30 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s +// +// A B +// \ / +// C +// | +// D +// +// (A + C > L1) AND (B + C > L1) AND (A + B > L1) AND (A < B) AND (C < B) AND (B <= L1) +// => +// DRAM: AC; L1: B +// +#any_device = #tt.operand_constraint +module attributes {} { + func.func 
@forward(%arg0: tensor<4096x5120xbf16>, %arg1: tensor<4096x5120xbf16>, %arg2: tensor<5120x5120xbf16>, %arg3: tensor<5120x5120xbf16>) -> tensor<4096x5120xbf16> { + // CHECK: #[[L1_:.*]] = #ttnn.buffer_type + // CHECK-DAG: #[[LAYOUT_3:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<512x640xbf16, #dram>, interleaved> + // CHECK-DAG: #[[LAYOUT_5:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<640x640xbf16, #l1_>, interleaved> + %0 = tensor.empty() : tensor<4096x5120xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<4096x5120xbf16, #[[LAYOUT_3]]> + %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<4096x5120xbf16>, tensor<4096x5120xbf16>, tensor<4096x5120xbf16>) -> tensor<4096x5120xbf16> + %2 = tensor.empty() : tensor<5120x5120xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<5120x5120xbf16, #[[LAYOUT_5]]> + %3 = "ttir.add"(%arg2, %arg3, %2) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<5120x5120xbf16>, tensor<5120x5120xbf16>, tensor<5120x5120xbf16>) -> tensor<5120x5120xbf16> + %4 = tensor.empty() : tensor<4096x5120xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<4096x5120xbf16, #[[LAYOUT_3]]> + %5 = "ttir.matmul"(%1, %3, %4) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<4096x5120xbf16>, tensor<5120x5120xbf16>, tensor<4096x5120xbf16>) -> tensor<4096x5120xbf16> + return %5 : tensor<4096x5120xbf16> + } +} diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_A_l1_BC.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_A_l1_BC.mlir new file mode 100644 index 0000000000..7d4c923b43 --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_A_l1_BC.mlir @@ -0,0 +1,30 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s +// +// A B +// \ / +// C +// | +// D +// +// (A + B + C > L1) AND (A + C < B + C) AND (A + B < B + C) AND (B + C <= L1) +// => +// DRAM: A; L1: BC +// +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<2048x2048xbf16>, %arg1: tensor<2048x2048xbf16>, %arg2: tensor<2048x8192xbf16>, %arg3: tensor<2048x8192xbf16>) -> tensor<2048x8192xbf16> { + // CHECK: #[[L1_:.*]] = #ttnn.buffer_type + // CHECK-DAG: #[[LAYOUT_3:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<256x256xbf16, #dram>, interleaved> + // CHECK-DAG: #[[LAYOUT_5:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<256x1024xbf16, #l1_>, interleaved> + %0 = tensor.empty() : tensor<2048x2048xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<2048x2048xbf16, #[[LAYOUT_3]]> + %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<2048x2048xbf16>, tensor<2048x2048xbf16>, tensor<2048x2048xbf16>) -> tensor<2048x2048xbf16> + %2 = tensor.empty() : tensor<2048x8192xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<2048x8192xbf16, #[[LAYOUT_5]]> + %3 = "ttir.add"(%arg2, %arg3, %2) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<2048x8192xbf16>, tensor<2048x8192xbf16>, tensor<2048x8192xbf16>) -> 
tensor<2048x8192xbf16> + %4 = tensor.empty() : tensor<2048x8192xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<2048x8192xbf16, #[[LAYOUT_5]]> + %5 = "ttir.matmul"(%1, %3, %4) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<2048x2048xbf16>, tensor<2048x8192xbf16>, tensor<2048x8192xbf16>) -> tensor<2048x8192xbf16> + return %5 : tensor<2048x8192xbf16> + } +} diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_BC_l1_A.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_BC_l1_A.mlir new file mode 100644 index 0000000000..c915fadd1c --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_BC_l1_A.mlir @@ -0,0 +1,30 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s +// +// A B +// \ / +// C +// | +// D +// +// (A + C > L1) AND (B + C > L1) AND (A + B > L1) AND (B < A) AND (C < A) AND (A <= L1) +// => +// DRAM: BC; L1: A +// +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<5120x5120xbf16>, %arg1: tensor<5120x5120xbf16>, %arg2: tensor<5120x4096xbf16>, %arg3: tensor<5120x4096xbf16>) -> tensor<5120x4096xbf16> { + // CHECK: #[[L1_:.*]] = #ttnn.buffer_type + // CHECK-DAG: #[[LAYOUT_3:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<640x512xbf16, #dram>, interleaved> + // CHECK-DAG: #[[LAYOUT_5:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<640x640xbf16, #l1_>, interleaved> + %0 = tensor.empty() : tensor<5120x5120xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<5120x5120xbf16, #[[LAYOUT_5]]> + %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<5120x5120xbf16>, tensor<5120x5120xbf16>, tensor<5120x5120xbf16>) -> tensor<5120x5120xbf16> + %2 = tensor.empty() : tensor<5120x4096xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<5120x4096xbf16, #[[LAYOUT_3]]> + %3 = "ttir.add"(%arg2, %arg3, %2) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<5120x4096xbf16>, tensor<5120x4096xbf16>, tensor<5120x4096xbf16>) -> tensor<5120x4096xbf16> + %4 = tensor.empty() : tensor<5120x4096xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<5120x4096xbf16, #[[LAYOUT_3]]> + %5 = "ttir.matmul"(%1, %3, %4) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<5120x5120xbf16>, tensor<5120x4096xbf16>, tensor<5120x4096xbf16>) -> tensor<5120x4096xbf16> + return %5 : tensor<5120x4096xbf16> + } +} diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_B_l1_AC.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_B_l1_AC.mlir new file mode 100644 index 0000000000..3d2538e245 --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_B_l1_AC.mlir @@ -0,0 +1,30 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s +// +// A B +// \ / +// C +// | +// D +// +// (A + B + C > L1) AND (B + C < A + C) AND (A + B < A + C) AND (A + C <= L1) +// => +// DRAM: B; L1: AC +// +#any_device = 
#tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<8192x2048xbf16>, %arg1: tensor<8192x2048xbf16>, %arg2: tensor<2048x2048xbf16>, %arg3: tensor<2048x2048xbf16>) -> tensor<8192x2048xbf16> { + // CHECK: #[[L1_:.*]] = #ttnn.buffer_type + // CHECK-DAG: #[[LAYOUT_3:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<256x256xbf16, #dram>, interleaved> + // CHECK-DAG: #[[LAYOUT_5:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<1024x256xbf16, #l1_>, interleaved> + %0 = tensor.empty() : tensor<8192x2048xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<8192x2048xbf16, #[[LAYOUT_5]]> + %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<8192x2048xbf16>, tensor<8192x2048xbf16>, tensor<8192x2048xbf16>) -> tensor<8192x2048xbf16> + %2 = tensor.empty() : tensor<2048x2048xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<2048x2048xbf16, #[[LAYOUT_3]]> + %3 = "ttir.add"(%arg2, %arg3, %2) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<2048x2048xbf16>, tensor<2048x2048xbf16>, tensor<2048x2048xbf16>) -> tensor<2048x2048xbf16> + %4 = tensor.empty() : tensor<8192x2048xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<8192x2048xbf16, #[[LAYOUT_5]]> + %5 = "ttir.matmul"(%1, %3, %4) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<8192x2048xbf16>, tensor<2048x2048xbf16>, tensor<8192x2048xbf16>) -> tensor<8192x2048xbf16> + return %5 : tensor<8192x2048xbf16> + } +} diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_C_l1_AB.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_C_l1_AB.mlir new file mode 100644 index 0000000000..320f00ce3c --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_C_l1_AB.mlir @@ -0,0 +1,31 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s +// +// A B +// \ / +// C +// | +// D +// +// (A + B + C > L1) AND (A + C < A + B) AND (B + C < A + B) AND (A + B <= L1) +// => +// DRAM: C; L1: AB +// +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<2048x8192xbf16>, %arg1: tensor<2048x8192xbf16>, %arg2: tensor<8192x2048xbf16>, %arg3: tensor<8192x2048xbf16>) -> tensor<2048x2048xbf16> { + // CHECK: #[[L1_:.*]] = #ttnn.buffer_type + // CHECK-DAG: #[[LAYOUT_4:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<256x1024xbf16, #l1_>, interleaved> + // CHECK-DAG: #[[LAYOUT_6:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<1024x256xbf16, #l1_>, interleaved> + // CHECK-DAG: #[[LAYOUT_7:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<256x256xbf16, #dram>, interleaved> + %0 = tensor.empty() : tensor<2048x8192xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<2048x8192xbf16, #[[LAYOUT_4]]> + %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<2048x8192xbf16>, tensor<2048x8192xbf16>, tensor<2048x8192xbf16>) -> tensor<2048x8192xbf16> + %2 = tensor.empty() : tensor<8192x2048xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<8192x2048xbf16, #[[LAYOUT_6]]> + %3 = "ttir.add"(%arg2, %arg3, %2) 
<{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<8192x2048xbf16>, tensor<8192x2048xbf16>, tensor<8192x2048xbf16>) -> tensor<8192x2048xbf16> + %4 = tensor.empty() : tensor<2048x2048xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<2048x2048xbf16, #[[LAYOUT_7]]> + %5 = "ttir.matmul"(%1, %3, %4) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<2048x8192xbf16>, tensor<8192x2048xbf16>, tensor<2048x2048xbf16>) -> tensor<2048x2048xbf16> + return %5 : tensor<2048x2048xbf16> + } +} diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_None_l1_ABC.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_None_l1_ABC.mlir new file mode 100644 index 0000000000..a21a11f879 --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_None_l1_ABC.mlir @@ -0,0 +1,29 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s +// +// A B +// \ / +// C +// | +// D +// +// (A + B + C <= L1) +// => +// DRAM: None; L1: ABC +// +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<32x32xbf16>, %arg1: tensor<32x32xbf16>, %arg2: tensor<32x32xbf16>, %arg3: tensor<32x32xbf16>) -> tensor<32x32xbf16> { + // CHECK: #[[L1_:.*]] = #ttnn.buffer_type + // CHECK-DAG: #[[LAYOUT_2:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<4x4xbf16, #l1_>, interleaved> + %0 = tensor.empty() : tensor<32x32xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<32x32xbf16, #[[LAYOUT_2]]> + %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<32x32xbf16>, tensor<32x32xbf16>, tensor<32x32xbf16>) -> tensor<32x32xbf16> + %2 = tensor.empty() : tensor<32x32xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<32x32xbf16, #[[LAYOUT_2]]> + %3 = "ttir.add"(%arg2, %arg3, %2) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<32x32xbf16>, tensor<32x32xbf16>, tensor<32x32xbf16>) -> tensor<32x32xbf16> + %4 = tensor.empty() : tensor<32x32xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<32x32xbf16, #[[LAYOUT_2]]> + %5 = "ttir.add"(%1, %3, %4) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<32x32xbf16>, tensor<32x32xbf16>, tensor<32x32xbf16>) -> tensor<32x32xbf16> + return %5 : tensor<32x32xbf16> + } +} diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/single_op.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/single_op.mlir new file mode 100644 index 0000000000..4820799936 --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/single_op.mlir @@ -0,0 +1,10 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s +// UNSUPPORTED: true +#any_device_tile = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<5120x5120xbf16>) -> tensor<5120x5120xbf16> { + %0 = tensor.empty() : tensor<5120x5120xbf16> + %1 = "ttir.relu"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device_tile, #any_device_tile]}> : 
(tensor<5120x5120xbf16>, tensor<5120x5120xbf16>) -> tensor<5120x5120xbf16> + return %1 : tensor<5120x5120xbf16> + } +} diff --git a/test/ttmlir/Silicon/TTNN/optimizer/large_tensors.mlir b/test/ttmlir/Silicon/TTNN/optimizer/large_tensors.mlir deleted file mode 100644 index fb71dae8d7..0000000000 --- a/test/ttmlir/Silicon/TTNN/optimizer/large_tensors.mlir +++ /dev/null @@ -1,19 +0,0 @@ -// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path% enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s > %t.mlir -// RUN: FileCheck %s --input-file=%t.mlir -// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn -#any_device = #tt.operand_constraint -module attributes {} { - func.func @forward(%arg0: tensor<8192x8192xbf16>, %arg1: tensor<8192x8192xbf16>, %arg2: tensor<8192x8192xbf16>) -> tensor<8192x8192xbf16> { - // CHECK: #[[LAYOUT_2:ttnn_layout2]] = #ttnn.ttnn_layout<{{.*}}, memref<{{.*}}, #dram>, {{.*}}> - %0 = tensor.empty() : tensor<8192x8192xbf16> - // CHECK: %{{.*}} = "ttnn.add"{{.*}} -> tensor<8192x8192xbf16, #[[LAYOUT_2]]> - %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<8192x8192xbf16>, tensor<8192x8192xbf16>, tensor<8192x8192xbf16>) -> tensor<8192x8192xbf16> - %2 = tensor.empty() : tensor<8192x8192xbf16> - // CHECK: %{{.*}} = "ttnn.add"{{.*}} -> tensor<8192x8192xbf16, #[[LAYOUT_2]]> - %3 = "ttir.add"(%1, %arg2, %2) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<8192x8192xbf16>, tensor<8192x8192xbf16>, tensor<8192x8192xbf16>) -> tensor<8192x8192xbf16> - %4 = tensor.empty() : tensor<8192x8192xbf16> - // CHECK: %{{.*}} = "ttnn.relu"{{.*}} -> tensor<8192x8192xbf16, #[[LAYOUT_2]]> - %7 = "ttir.relu"(%3, %4) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<8192x8192xbf16>, tensor<8192x8192xbf16>) -> tensor<8192x8192xbf16> - return %7 : tensor<8192x8192xbf16> - } -} diff --git a/test/unittests/Optimizer/CMakeLists.txt b/test/unittests/Optimizer/CMakeLists.txt index 4e6ee799a7..b05c8ae294 100644 --- a/test/unittests/Optimizer/CMakeLists.txt +++ b/test/unittests/Optimizer/CMakeLists.txt @@ -1,11 +1,13 @@ add_mlir_unittest(OptimizerTests TestShardSolver.cpp TestOptimizerOverrides.cpp + TestL1InterleavedPolicy.cpp ) target_link_libraries(OptimizerTests PRIVATE MLIR MLIRTTDialect + MLIRTTNNAnalysis MLIRTTNNPipelines ) diff --git a/test/unittests/Optimizer/TestL1InterleavedPolicy.cpp b/test/unittests/Optimizer/TestL1InterleavedPolicy.cpp new file mode 100644 index 0000000000..7d02cef56f --- /dev/null +++ b/test/unittests/Optimizer/TestL1InterleavedPolicy.cpp @@ -0,0 +1,193 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "mlir/IR/Value.h" +#include "mlir/IR/ValueRange.h" +#include "llvm/ADT/SmallVector.h" + +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/MLIRContext.h" + +#include "ttmlir/Dialect/TTNN/IR/TTNN.h" +#include "ttmlir/Dialect/TTNN/IR/TTNNOps.h" + +#include "ttmlir/Dialect/TTNN/Analysis/L1InterleavedPolicy.h" + +using namespace mlir::tt::ttnn; + +constexpr int TensorDimX = 128; +constexpr int TensorDimY = 128; + +class L1InterleavedPolicyBase : public ::testing::Test { +public: + mlir::MLIRContext context; + mlir::OwningOpRef module; + mlir::OpBuilder builder 
= mlir::OpBuilder(&context); + mlir::func::FuncOp func; + mlir::tt::DeviceAttr deviceAttr; + + using OpMemSpec = L1InterleavedPolicy::OpMemSpec; + using OpConfig = L1InterleavedPolicy::OpConfig; + using L1Usage = L1InterleavedPolicy::L1Usage; + + void SetUp() override { + context.loadDialect(); + module = mlir::ModuleOp::create(builder.getUnknownLoc()); + builder.setInsertionPointToStart(&module->getBodyRegion().front()); + createFuncOp(); + deviceAttr = mlir::tt::getCurrentScopeDevice(func); + } + + llvm::SmallVector getTensorShape() { + return {TensorDimX, TensorDimY}; + } + + mlir::RankedTensorType getTensorRankedType() { + return mlir::RankedTensorType::get(getTensorShape(), builder.getF32Type()); + } + + mlir::Value createEmptyTensor() { + ShapeAttr shapeAttr = ShapeAttr::get(&context, getTensorShape()); + return builder.create(builder.getUnknownLoc(), + getTensorRankedType(), nullptr, shapeAttr, + nullptr, nullptr, nullptr); + } + + mlir::func::FuncOp createFuncOp() { + mlir::SmallVector input; + input.push_back(getTensorRankedType()); + + mlir::SmallVector output; + output.push_back(getTensorRankedType()); + + auto funcType = builder.getType( + mlir::TypeRange(input), mlir::TypeRange(output)); + func = builder.create(builder.getUnknownLoc(), "test", + funcType); + + mlir::Block *block = func.addEntryBlock(); + block->addArgument(getTensorRankedType(), builder.getUnknownLoc()); + block->addArgument(getTensorRankedType(), builder.getUnknownLoc()); + + builder.setInsertionPointToStart(block); + + return func; + } + + void addLayoutForOp(mlir::Operation *op, + llvm::DenseMap> &legalLayouts, + BufferType memorySpace, + TensorMemoryLayout tensorMemoryLayout) { + if (legalLayouts.find(op) == legalLayouts.end()) { + legalLayouts[op] = std::vector{TTNNLayoutAttr::get( + &context, getTensorRankedType().getShape(), builder.getF32Type(), + memorySpace, mlir::tt::GridAttr::get(&context, {8, 8}), + tensorMemoryLayout)}; + } else { + legalLayouts[op].push_back(TTNNLayoutAttr::get( + &context, getTensorRankedType().getShape(), builder.getF32Type(), + memorySpace, mlir::tt::GridAttr::get(&context, {8, 8}), + tensorMemoryLayout)); + } + } + + void prepareOpForGreedyConfigPicker( + mlir::Operation *op, uint64_t outputL1Usage, uint64_t requiredL1Usage, + llvm::DenseMap> + &legalLayouts, + llvm::DenseMap &opsL1Usage) { + + // Add two legal layouts for the op with different buffer + // types: DRAM and L1. 
+ addLayoutForOp(op, legalLayouts, BufferType::DRAM, + TensorMemoryLayout::Interleaved); + addLayoutForOp(op, legalLayouts, BufferType::L1, + TensorMemoryLayout::Interleaved); + + L1Usage l1Usage; + l1Usage.outputL1Usage = outputL1Usage; + l1Usage.requiredL1Usage = requiredL1Usage; + opsL1Usage[op] = l1Usage; + } + + void TearDown() override {} +}; + +TEST_F(L1InterleavedPolicyBase, VerifyGreedyPolicy) { + std::vector l1ChainConfigs; + llvm::DenseMap> legalLayouts; + llvm::DenseMap> + schedule; + llvm::DenseMap opsL1Usage; + constexpr uint64_t usableL1CacheSize = 15; + + // Create operand A + mlir::Value dest = createEmptyTensor(); + mlir::Value lhs = func.getBody().getBlocks().front().getArgument(0); + mlir::Value rhs = func.getBody().getBlocks().front().getArgument(1); + mlir::Operation *opA = + builder.create(builder.getUnknownLoc(), lhs, rhs, dest); + uint64_t outputL1Usage = 2; + uint64_t requiredL1Usage = 8; + prepareOpForGreedyConfigPicker(opA, outputL1Usage, requiredL1Usage, + legalLayouts, opsL1Usage); + + // Create operand B + dest = createEmptyTensor(); + lhs = func.getBody().getBlocks().front().getArgument(0); + rhs = func.getBody().getBlocks().front().getArgument(1); + mlir::Operation *opB = + builder.create(builder.getUnknownLoc(), lhs, rhs, dest); + outputL1Usage = 3; + requiredL1Usage = 7; + prepareOpForGreedyConfigPicker(opB, outputL1Usage, requiredL1Usage, + legalLayouts, opsL1Usage); + + // Create operand C + dest = createEmptyTensor(); + lhs = func.getBody().getBlocks().front().getArgument(0); + rhs = func.getBody().getBlocks().front().getArgument(1); + mlir::Operation *opC = + builder.create(builder.getUnknownLoc(), lhs, rhs, dest); + outputL1Usage = 1; + requiredL1Usage = 9; + prepareOpForGreedyConfigPicker(opC, outputL1Usage, requiredL1Usage, + legalLayouts, opsL1Usage); + + // Create base op D + dest = createEmptyTensor(); + lhs = func.getBody().getBlocks().front().getArgument(0); + rhs = func.getBody().getBlocks().front().getArgument(1); + mlir::Operation *opD = + builder.create(builder.getUnknownLoc(), lhs, rhs, dest); + outputL1Usage = 4; + requiredL1Usage = 0; + prepareOpForGreedyConfigPicker(opD, outputL1Usage, requiredL1Usage, + legalLayouts, opsL1Usage); + + // Run greedy config picker policy + L1InterleavedPolicy l1InterleavedPolicy(nullptr, l1ChainConfigs, legalLayouts, + schedule, usableL1CacheSize); + OpConfig greedyConfig = l1InterleavedPolicy.getGreedyConfig(opD, opsL1Usage); + + // Sanity checks + ASSERT_TRUE(greedyConfig.baseOp == opD); + ASSERT_TRUE(greedyConfig.layouts.size() == 4); + ASSERT_TRUE(greedyConfig.precedence.size() == 3); + + // All layouts should be using L1 buffer type + for (const auto &[op, layout] : greedyConfig.layouts) { + ASSERT_TRUE(layout.hasL1BufferType()); + } + + // Precedence order for op D should be: C, A, B + ASSERT_EQ(greedyConfig.precedence[0], opC); + ASSERT_EQ(greedyConfig.precedence[1], opA); + ASSERT_EQ(greedyConfig.precedence[2], opB); +} From 06b7a45a599efe493d6fea8dd478af7f5313ca89 Mon Sep 17 00:00:00 2001 From: Stefan Djordjevic <157365107+sdjordjevicTT@users.noreply.github.com> Date: Wed, 4 Dec 2024 11:55:50 +0100 Subject: [PATCH 50/84] Adding initial workaround pass implementation (#1402) --- include/ttmlir/Dialect/TTNN/IR/CMakeLists.txt | 1 + include/ttmlir/Dialect/TTNN/IR/TTNNBase.td | 3 +- include/ttmlir/Dialect/TTNN/IR/TTNNOps.h | 1 + include/ttmlir/Dialect/TTNN/IR/TTNNOps.td | 23 +- .../ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.td | 3 + .../Dialect/TTNN/IR/TTNNWorkaroundInterface.h | 18 + 
.../TTNN/IR/TTNNWorkaroundInterface.td | 47 +++ .../ttmlir/Dialect/TTNN/IR/TTNNWorkarounds.h | 174 ++++++++ .../Dialect/TTNN/Pipelines/TTNNPipelines.h | 6 + .../ttmlir/Dialect/TTNN/Transforms/Passes.td | 8 + .../Dialect/TTNN/Utils/TransformUtils.h | 17 + include/ttmlir/Dialect/TTNN/Utils/Utils.h | 7 + include/ttmlir/Utils.h | 5 + lib/Conversion/TTIRToTTNN/CMakeLists.txt | 1 + lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp | 39 +- lib/Dialect/TTNN/IR/CMakeLists.txt | 3 + lib/Dialect/TTNN/IR/TTNNOpsAttrs.cpp | 31 ++ .../TTNN/IR/TTNNWorkaroundInterface.cpp | 90 ++++ lib/Dialect/TTNN/IR/TTNNWorkarounds.cpp | 70 +++ lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp | 9 + lib/Dialect/TTNN/Transforms/CMakeLists.txt | 5 +- .../TTNN/Transforms/TTNNWorkarounds.cpp | 399 ++++++++++++++++++ lib/Dialect/TTNN/Utils/CMakeLists.txt | 4 +- lib/Dialect/TTNN/Utils/TransformUtils.cpp | 30 ++ lib/Dialect/TTNN/Utils/Utils.cpp | 8 + .../Workarounds/simple_workaround.mlir | 31 ++ 26 files changed, 998 insertions(+), 35 deletions(-) create mode 100644 include/ttmlir/Dialect/TTNN/IR/TTNNWorkaroundInterface.h create mode 100644 include/ttmlir/Dialect/TTNN/IR/TTNNWorkaroundInterface.td create mode 100644 include/ttmlir/Dialect/TTNN/IR/TTNNWorkarounds.h create mode 100644 include/ttmlir/Dialect/TTNN/Utils/TransformUtils.h create mode 100644 lib/Dialect/TTNN/IR/TTNNWorkaroundInterface.cpp create mode 100644 lib/Dialect/TTNN/IR/TTNNWorkarounds.cpp create mode 100644 lib/Dialect/TTNN/Transforms/TTNNWorkarounds.cpp create mode 100644 lib/Dialect/TTNN/Utils/TransformUtils.cpp create mode 100644 test/ttmlir/Dialect/TTNN/Transforms/Workarounds/simple_workaround.mlir diff --git a/include/ttmlir/Dialect/TTNN/IR/CMakeLists.txt b/include/ttmlir/Dialect/TTNN/IR/CMakeLists.txt index cfd65fe8db..fbf68f69dd 100644 --- a/include/ttmlir/Dialect/TTNN/IR/CMakeLists.txt +++ b/include/ttmlir/Dialect/TTNN/IR/CMakeLists.txt @@ -3,6 +3,7 @@ add_mlir_doc(TTNNBase TTNNDialect src/autogen/md/Dialect/ -gen-dialect-doc) add_mlir_doc(TTNNOps TTNNOp src/autogen/md/Dialect/ -gen-op-doc) add_mlir_interface(TTNNOpModelInterface) +add_mlir_interface(TTNNWorkaroundInterface) set(LLVM_TARGET_DEFINITIONS TTNNOpsEnums.td) mlir_tablegen(TTNNOpsEnums.h.inc -gen-enum-decls) diff --git a/include/ttmlir/Dialect/TTNN/IR/TTNNBase.td b/include/ttmlir/Dialect/TTNN/IR/TTNNBase.td index b1821c8f1b..34d3daf9cc 100644 --- a/include/ttmlir/Dialect/TTNN/IR/TTNNBase.td +++ b/include/ttmlir/Dialect/TTNN/IR/TTNNBase.td @@ -7,6 +7,7 @@ include "mlir/IR/OpBase.td" include "ttmlir/Dialect/TTNN/IR/TTNNOpModelInterface.td" +include "ttmlir/Dialect/TTNN/IR/TTNNWorkaroundInterface.td" //===----------------------------------------------------------------------===// // TTNN dialect definition. 
@@ -44,6 +45,6 @@ def TTNN_Dialect : Dialect { //===----------------------------------------------------------------------===// class TTNN_Op traits = []> : - Op; + Op; #endif diff --git a/include/ttmlir/Dialect/TTNN/IR/TTNNOps.h b/include/ttmlir/Dialect/TTNN/IR/TTNNOps.h index e66fab65a3..457c7722bb 100644 --- a/include/ttmlir/Dialect/TTNN/IR/TTNNOps.h +++ b/include/ttmlir/Dialect/TTNN/IR/TTNNOps.h @@ -18,6 +18,7 @@ #include "ttmlir/Dialect/TTNN/IR/TTNNOpsTypes.h" #include "ttmlir/Dialect/TTNN/IR/TTNNOpModelInterface.h.inc" +#include "ttmlir/Dialect/TTNN/IR/TTNNWorkaroundInterface.h" #define GET_OP_CLASSES #include "ttmlir/Dialect/TTNN/IR/TTNNOps.h.inc" diff --git a/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td b/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td index 57383c007d..94ca8cee3a 100644 --- a/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td +++ b/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td @@ -174,6 +174,17 @@ def TTNN_AbsOp : TTNN_ElementwiseUnaryOp<"abs"> { let description = [{ Eltwise absolute operation. }]; + + let extraClassDeclaration = [{ + MutableOperandRange getDpsInitsMutable() { return getOutputsMutable(); } + wa::TTNNOperandsWorkarounds getOperandsWorkarounds() { + wa::TTNNOperandWorkarounds tileLayoutWorkaround = wa::TTNNOperandWorkarounds(Layout::Tile); + return wa::TTNNOperandsWorkarounds::createEmptyTTNNOperandsWorkarounds() + .addInputOperandWorkaround(tileLayoutWorkaround) + .addInputOperandWorkaround(tileLayoutWorkaround) + .addOutputOperandWorkaround(tileLayoutWorkaround); + } + }]; } def TTNN_CbrtOp : TTNN_ElementwiseUnaryOp<"cbrt"> { @@ -325,7 +336,7 @@ def TTNN_Expm1Op: TTNN_ElementwiseUnaryOp<"expm1"> { }]; } -class TTIR_ElementwiseUnaryWithFloatParameterOp traits = []> : +class TTNN_ElementwiseUnaryWithFloatParameterOp traits = []> : TTNN_ElementwiseUnaryOp { let summary = "Eltwise unary op with the float parameter."; let description = [{ @@ -345,7 +356,7 @@ class TTIR_ElementwiseUnaryWithFloatParameterOp tra ]; } -def TTIR_LeakyReluOp : TTIR_ElementwiseUnaryWithFloatParameterOp<"leaky_relu"> { +def TTNN_LeakyReluOp : TTNN_ElementwiseUnaryWithFloatParameterOp<"leaky_relu"> { let summary = "Eltwise leaky relu operation."; let description = [{ The Leaky ReLU (Rectified Linear Unit) operation computes an element-wise @@ -784,6 +795,14 @@ def TTNN_EmptyOp : TTNN_Op<"empty", [NoMemoryEffect]> { OptionalAttr:$memory_config); let results = (outs AnyRankedTensor:$result); + let extraClassDeclaration = [{ + wa::TTNNOperandsWorkarounds getOperandsWorkarounds() { + wa::TTNNOperandWorkarounds rowMajorLayoutWorkaround = wa::TTNNOperandWorkarounds(Layout::RowMajor); + return wa::TTNNOperandsWorkarounds::createEmptyTTNNOperandsWorkarounds() + .addOutputOperandWorkaround(rowMajorLayoutWorkaround); + } + }]; + let hasVerifier = 1; } diff --git a/include/ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.td b/include/ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.td index 7d5b10abb0..d8aea834fb 100644 --- a/include/ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.td +++ b/include/ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.td @@ -92,6 +92,9 @@ def TTNN_MemoryConfigAttr : TTNN_Attr<"MemoryConfig", "memory_config"> { { return this->getShardSpec().getShardShape().getShape(); } + + MemoryConfigAttr withBufferType(::mlir::MLIRContext *context, BufferType bufferType); + MemoryConfigAttr withMemoryLayout(::mlir::MLIRContext *context, TensorMemoryLayout memLayout); }]; } diff --git a/include/ttmlir/Dialect/TTNN/IR/TTNNWorkaroundInterface.h b/include/ttmlir/Dialect/TTNN/IR/TTNNWorkaroundInterface.h new file mode 100644 index 
0000000000..a6cdd5c1d7 --- /dev/null +++ b/include/ttmlir/Dialect/TTNN/IR/TTNNWorkaroundInterface.h @@ -0,0 +1,18 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 +#ifndef TTMLIR_DIALECT_TTNN_IR_TTNNWORKAROUNDINTERFACE_H +#define TTMLIR_DIALECT_TTNN_IR_TTNNWORKAROUNDINTERFACE_H + +#include "ttmlir/Dialect/TTNN/IR/TTNNWorkarounds.h" + +#include "mlir/IR/Operation.h" + +namespace mlir::tt::ttnn::wa { +// Verifies the TTNNWorkaroundInterface +mlir::LogicalResult verifyTTNNWorkaroundInterface(mlir::Operation *op); +} // namespace mlir::tt::ttnn::wa + +#include "ttmlir/Dialect/TTNN/IR/TTNNWorkaroundInterface.h.inc" + +#endif diff --git a/include/ttmlir/Dialect/TTNN/IR/TTNNWorkaroundInterface.td b/include/ttmlir/Dialect/TTNN/IR/TTNNWorkaroundInterface.td new file mode 100644 index 0000000000..c1ce55cd99 --- /dev/null +++ b/include/ttmlir/Dialect/TTNN/IR/TTNNWorkaroundInterface.td @@ -0,0 +1,47 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef TTMLIR_TTMLIR_DIALECT_TTNN_IR_TTNN_WORKAROUND_INTERFACE_TD +#define TTMLIR_TTMLIR_DIALECT_TTNN_IR_TTNN_WORKAROUND_INTERFACE_TD + +include "mlir/IR/OpBase.td" + +// This interface is used to specify workarounds for TTNN operations. +def TTNN_WorkaroundInterface : OpInterface<"TTNNWorkaroundInterface"> { + let cppNamespace = "::mlir::tt::ttnn::wa"; + let methods = [ + InterfaceMethod< + /*desc=*/[{ + Returns the workarounds associated with each operand and result of this operation. + If the operation is a Destination-Passing Style (DPS) operation, the same workarounds + must apply to both the DPS initial operands and the operation results. These constraints + are verified through the interface verifier. + + For example, consider the following ttnn operations: + %0 = "ttnn.empty"() : () -> tensor<1x1xf32> + %1 = "ttnn.abs"(%arg0, %0) : (tensor<1x1xf32>, tensor<1x1xf32>) -> tensor<1x1xf32> + + In this example: + - The ttnn.abs operation has two input operand workarounds. + - It has one output operand workaround. + - The output workaround must match the workaround for the second input operand, + ensuring consistency as required by the DPS pattern. + }], + /*retTy=*/"TTNNOperandsWorkarounds", + /*methodName=*/"getOperandsWorkarounds", + /*args=*/(ins), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + // Return default empty workarounds for all input and output operands + return TTNNOperandsWorkarounds::createEmptyTTNNOperandsWorkarounds(this->getOperation()); + }] + >, + ]; + + let verify = [{ + return verifyTTNNWorkaroundInterface($_op); + }]; +} + +#endif diff --git a/include/ttmlir/Dialect/TTNN/IR/TTNNWorkarounds.h b/include/ttmlir/Dialect/TTNN/IR/TTNNWorkarounds.h new file mode 100644 index 0000000000..7795623384 --- /dev/null +++ b/include/ttmlir/Dialect/TTNN/IR/TTNNWorkarounds.h @@ -0,0 +1,174 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef TTMLIR_DIALECT_TTNN_IR_TTNNWORKAROUNDS_H +#define TTMLIR_DIALECT_TTNN_IR_TTNNWORKAROUNDS_H + +#include "ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.h" + +#include "mlir/IR/Operation.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" + +#include + +namespace mlir::tt::ttnn::wa { +using TensorLayoutWorkaround = std::optional; +using TensorBufferTypeWorkaround = std::optional; +using TensorMemoryLayoutWorkaround = std::optional; + +// Struct that encapsulates operand workarounds. 
+// It contains tensor layout, tensor buffer type and tensor memory layout +// workarounds. +struct TTNNOperandWorkarounds { + // Tensor layout workaround. + TensorLayoutWorkaround tensorLayoutWorkaround; + + // Tensor buffer type workaround. + TensorBufferTypeWorkaround tensorBufferTypeWorkaround; + + // Tensor memory layout workaround. + TensorMemoryLayoutWorkaround tensorMemoryLayoutWorkaround; + + TTNNOperandWorkarounds() = default; + + // Constructor that takes tensor layout, tensor buffer type and tensor memory. + TTNNOperandWorkarounds( + TensorLayoutWorkaround tensorLayoutWorkaround, + TensorBufferTypeWorkaround tensorBufferTypeWorkaround, + TensorMemoryLayoutWorkaround tensorMemoryLayoutWorkaround) + : tensorLayoutWorkaround(tensorLayoutWorkaround), + tensorBufferTypeWorkaround(tensorBufferTypeWorkaround), + tensorMemoryLayoutWorkaround(tensorMemoryLayoutWorkaround) {} + + // Constructor that takes tensor layout workaround and sets the other + // workarounds to nullopt. + TTNNOperandWorkarounds(TensorLayoutWorkaround tensorLayoutWorkaround) + : TTNNOperandWorkarounds(tensorLayoutWorkaround, std::nullopt, + std::nullopt) {} + + // Constructor that takes tensor buffer type workaround and sets the other + // workarounds to nullopt. + TTNNOperandWorkarounds(TensorBufferTypeWorkaround tensorBufferTypeWorkaround) + : TTNNOperandWorkarounds(std::nullopt, tensorBufferTypeWorkaround, + std::nullopt) {} + + // Constructor that takes tensor memory layout workaround and sets the other + // workarounds to nullopt. + TTNNOperandWorkarounds( + TensorMemoryLayoutWorkaround tensorMemoryLayoutWorkaround) + : TTNNOperandWorkarounds(std::nullopt, std::nullopt, + tensorMemoryLayoutWorkaround) {} + + // Operand workarounds factory methods. + static TTNNOperandWorkarounds createEmptyTTNNOperandWorkarounds(); + + // Equality operator. + bool operator==(const TTNNOperandWorkarounds &rhs) const { + return tensorLayoutWorkaround == rhs.tensorLayoutWorkaround && + tensorBufferTypeWorkaround == rhs.tensorBufferTypeWorkaround && + tensorMemoryLayoutWorkaround == rhs.tensorMemoryLayoutWorkaround; + } + + // Inequality operator. + bool operator!=(const TTNNOperandWorkarounds &rhs) const { + return !(*this == rhs); + } + + // Returns true if any of the workarounds is set. + bool hasAnyWorkaround() const { + return tensorLayoutWorkaround || tensorBufferTypeWorkaround || + tensorMemoryLayoutWorkaround; + } +}; + +// Struct that encapsulates the result of applying the workarounds. +// It contains the target tensor layout, buffer type and tensor memory layout +// results and a flag indicating whether the workarounds were applied. +struct WorkaroundResult { + // Target tensor layout. + std::pair targetTensorLayoutResult; + + // Target tensor buffer type. + std::pair targetTensorBufferTypeResult; + + // Target tensor memory layout. + std::pair targetTensorMemoryLayoutResult; + + // Returns true if any of the workarounds were applied. + bool modified() const { + return targetTensorLayoutResult.second || + targetTensorBufferTypeResult.second || + targetTensorMemoryLayoutResult.second; + } +}; + +// Apply the operand workarounds to the layout attribute that contains +// tensor layout, buffer type and tensor memory layout arguments. +// Returns the result of applying the workarounds. +WorkaroundResult applyWorkarounds(const TTNNOperandWorkarounds &workaround, + const TTNNLayoutAttr &inputLayoutAttr); + +// Class that encapsulates operands workarounds. +// It contains input and output workarounds for operands. 
+class TTNNOperandsWorkarounds { +public: + // Returns input operand workarounds. + llvm::ArrayRef getInputOperandWorkarounds() const { + return inputOperandWorkarounds; + } + + // Returns output operand workarounds. + llvm::ArrayRef getOutputOperandWorkarounds() const { + return outputOperandWorkarounds; + } + + // Adds input operand workaround. + TTNNOperandsWorkarounds & + addInputOperandWorkaround(TTNNOperandWorkarounds inputOperandWorkaround) { + inputOperandWorkarounds.emplace_back(inputOperandWorkaround); + return *this; + } + + // Adds output operand workaround. + TTNNOperandsWorkarounds & + addOutputOperandWorkaround(TTNNOperandWorkarounds outputOperandWorkaround) { + outputOperandWorkarounds.emplace_back(outputOperandWorkaround); + return *this; + } + + // Operands workarounds factory method. + static TTNNOperandsWorkarounds + createEmptyTTNNOperandsWorkarounds(int inputSize, int outputSize); + + // Operands workarounds factory method. + static TTNNOperandsWorkarounds createEmptyTTNNOperandsWorkarounds() { + return createEmptyTTNNOperandsWorkarounds(0, 0); + } + + // Operands workarounds factory method. + static TTNNOperandsWorkarounds + createEmptyTTNNOperandsWorkarounds(Operation *op); + +private: + // Default constructor with no workarounds. + TTNNOperandsWorkarounds() {} + + // Constructor that takes input and output workarounds for operands. + TTNNOperandsWorkarounds( + llvm::SmallVector inputOperandWorkarounds, + llvm::SmallVector outputOperandWorkarounds) + : inputOperandWorkarounds(std::move(inputOperandWorkarounds)), + outputOperandWorkarounds(std::move(outputOperandWorkarounds)) {} + + // Workarounds for input operands. + llvm::SmallVector inputOperandWorkarounds; + + // Workarounds for output operands. + llvm::SmallVector outputOperandWorkarounds; +}; + +} // namespace mlir::tt::ttnn::wa + +#endif diff --git a/include/ttmlir/Dialect/TTNN/Pipelines/TTNNPipelines.h b/include/ttmlir/Dialect/TTNN/Pipelines/TTNNPipelines.h index 58206039bb..e9d78b4d3c 100644 --- a/include/ttmlir/Dialect/TTNN/Pipelines/TTNNPipelines.h +++ b/include/ttmlir/Dialect/TTNN/Pipelines/TTNNPipelines.h @@ -111,6 +111,12 @@ struct TTIRToTTNNBackendPipelineOptions ListOption meshShape{ *this, "mesh-shape", llvm::cl::desc("Set the multi-device mesh shape.")}; + + // Option to enable/disable the workaround pass. + // + Option workaroundPassEnabled{*this, "enable-workaround-pass", + llvm::cl::desc("Enable workaround pass."), + llvm::cl::init(false)}; }; // TTIR to EmitC pipeline options. diff --git a/include/ttmlir/Dialect/TTNN/Transforms/Passes.td b/include/ttmlir/Dialect/TTNN/Transforms/Passes.td index c29d01f7e4..444927e348 100644 --- a/include/ttmlir/Dialect/TTNN/Transforms/Passes.td +++ b/include/ttmlir/Dialect/TTNN/Transforms/Passes.td @@ -28,4 +28,12 @@ def TTNNLayout : Pass<"ttnn-layout", "::mlir::ModuleOp"> { }]; } +def TTNNWorkarounds : Pass<"ttnn-workaround", "::mlir::ModuleOp"> { + let summary = "Apply TTNN workarounds to the IR."; + let description = [{ + This pass applies necessary TTNN workarounds to the IR in order to create + a valid and functional IR that can be executed on the hardware. 
+ }]; +} + #endif diff --git a/include/ttmlir/Dialect/TTNN/Utils/TransformUtils.h b/include/ttmlir/Dialect/TTNN/Utils/TransformUtils.h new file mode 100644 index 0000000000..2dc83388d1 --- /dev/null +++ b/include/ttmlir/Dialect/TTNN/Utils/TransformUtils.h @@ -0,0 +1,17 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef TTMLIR_DIALECT_TTNN_UTILS_TRANSFORMUTILS_H +#define TTMLIR_DIALECT_TTNN_UTILS_TRANSFORMUTILS_H + +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/Value.h" + +namespace mlir::tt::ttnn::utils { +// Get or insert device for the given operation. +mlir::Value getOrInsertDevice(mlir::PatternRewriter &rewriter, + mlir::Operation *op); +} // namespace mlir::tt::ttnn::utils + +#endif diff --git a/include/ttmlir/Dialect/TTNN/Utils/Utils.h b/include/ttmlir/Dialect/TTNN/Utils/Utils.h index d7d8fbdd30..f214fa793d 100644 --- a/include/ttmlir/Dialect/TTNN/Utils/Utils.h +++ b/include/ttmlir/Dialect/TTNN/Utils/Utils.h @@ -11,6 +11,8 @@ #include "ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.h" #include "ttmlir/Dialect/TTNN/IR/TTNNOpsTypes.h" +#include "mlir/IR/BuiltinTypes.h" + namespace mlir::tt::ttnn::utils { // Map tt::MemorySpace to ttnn::BufferType @@ -36,6 +38,11 @@ toTTMemorySpace(const mlir::tt::ttnn::BufferType bufferType); mlir::Type createRowMajorTypeFromDtype(::mlir::MLIRContext *context, DataType dtype); +// Helper method to create a RankedTensorType with the given encoding +RankedTensorType +createRankedTensorTypeWithEncoding(RankedTensorType tensorType, + ttnn::TTNNLayoutAttr encoding); + } // namespace mlir::tt::ttnn::utils #endif // TTMLIR_DIALECT_TTNN_UTILS_UTILS_H diff --git a/include/ttmlir/Utils.h b/include/ttmlir/Utils.h index bcf836741a..49dad79e5e 100644 --- a/include/ttmlir/Utils.h +++ b/include/ttmlir/Utils.h @@ -127,6 +127,11 @@ inline MlirAttribute wrapArrayOfMlirAttributesAsAttribute( return wrap(mlir::ArrayAttr::get(unwrap(ctx), unwrappedAttributesArray)); } +// Checks if the type of the given `mlir::Value` is a ranked tensor type. +inline bool isRankedTensor(mlir::Value v) { + return mlir::isa(v.getType()); +} + } // namespace ttmlir::utils #endif diff --git a/lib/Conversion/TTIRToTTNN/CMakeLists.txt b/lib/Conversion/TTIRToTTNN/CMakeLists.txt index e31220d751..ff054f5bd7 100644 --- a/lib/Conversion/TTIRToTTNN/CMakeLists.txt +++ b/lib/Conversion/TTIRToTTNN/CMakeLists.txt @@ -11,4 +11,5 @@ add_mlir_library(TTMLIRTTIRToTTNN LINK_LIBS PUBLIC MLIRIR MLIRPass + TTMLIRTTNNUtils ) diff --git a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp index 789485eac3..3c03592fc1 100644 --- a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp +++ b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp @@ -9,6 +9,7 @@ #include "ttmlir/Dialect/TTNN/IR/TTNNOps.h" #include "ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.h" #include "ttmlir/Dialect/TTNN/Types/Types.h" +#include "ttmlir/Dialect/TTNN/Utils/TransformUtils.h" #include "ttmlir/Dialect/TTNN/Utils/Utils.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" @@ -28,27 +29,6 @@ using namespace mlir::tt; namespace { -// Gets or inserts a GetDeviceOp at the top of the current block of the given -// operation. 
-static Value getOrInsertDevice(ConversionPatternRewriter &rewriter, - Operation *op) { - Block *block = op->getBlock(); - for (auto &op : block->getOperations()) { - if (auto deviceOp = dyn_cast(op)) { - return deviceOp.getResult(); - } - } - - DeviceAttr deviceAttr = getCurrentScopeDevice(op); - auto currentInsertionPoint = rewriter.saveInsertionPoint(); - rewriter.setInsertionPoint(block, block->begin()); - auto deviceOp = rewriter.create( - op->getLoc(), rewriter.getType(deviceAttr), - ttnn::MeshShapeAttr::get(op->getContext(), 1, 1)); - rewriter.restoreInsertionPoint(currentInsertionPoint); - return deviceOp.getResult(); -} - class TensorEmptyConversionPattern : public OpConversionPattern { public: @@ -95,7 +75,7 @@ class TensorEmptyConversionPattern // Create MemoryConfigAttr // - auto device = getOrInsertDevice(rewriter, op); + auto device = ::ttnn::utils::getOrInsertDevice(rewriter, op); llvm::SmallVector shardShape = layoutAttr.getShardShape(); ttnn::MemoryConfigAttr memoryConfigAttr = ttnn::MemoryConfigAttr::get( op.getContext(), @@ -193,7 +173,8 @@ class ToLayoutOpConversionPattern rewriter.replaceOpWithNewOp( op, this->getTypeConverter()->convertType(result), adaptor.getInput(), outputLayout, outputDataType, outputMemConfigAttr, - isOutputOnHost ? nullptr : getOrInsertDevice(rewriter, op)); + isOutputOnHost ? nullptr + : ::ttnn::utils::getOrInsertDevice(rewriter, op)); return success(); } @@ -520,7 +501,7 @@ class ConstantOpConversionPattern } if (valueAttr.isSplat()) { - Value device = getOrInsertDevice(rewriter, op); + Value device = ::ttnn::utils::getOrInsertDevice(rewriter, op); float fillValue = valueAttr.getElementType().isInteger() ? getIntegerValue(valueAttr) @@ -634,7 +615,7 @@ class Conv2dOpConversionPattern : public OpConversionPattern { matchAndRewrite(ttir::Conv2dOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - auto device = getOrInsertDevice(rewriter, op); + auto device = ::ttnn::utils::getOrInsertDevice(rewriter, op); auto kernel_ty = mlir::cast(adaptor.getWeight().getType()); llvm::ArrayRef kernel_shape = kernel_ty.getShape(); @@ -732,7 +713,7 @@ class MaxPool2dOpConversionPattern "TTNN max_pool2d does not support padding top/bottom/left/right " "separately"); - auto device = getOrInsertDevice(rewriter, op); + auto device = mlir::tt::ttnn::utils::getOrInsertDevice(rewriter, op); auto input_ty = mlir::cast(adaptor.getInput().getType()); llvm::ArrayRef input_shape = input_ty.getShape(); @@ -836,7 +817,7 @@ class SubtractOpConversionPattern // addOp(lhs, negOp(rhs)) } else { - Value device = getOrInsertDevice(rewriter, srcOp); + Value device = ::ttnn::utils::getOrInsertDevice(rewriter, srcOp); tensor::EmptyOp negEmptyOp = rewriter.create( srcOp.getLoc(), this->getTypeConverter()->convertType(rhsType), device); @@ -862,7 +843,7 @@ class AllGatherOpConversionPattern ConversionPatternRewriter &rewriter) const override { RankedTensorType type = mlir::cast(adaptor.getInput().getType()); - Value device = getOrInsertDevice(rewriter, op); + Value device = ::ttnn::utils::getOrInsertDevice(rewriter, op); tensor::EmptyOp emptyOp = rewriter.create( op.getLoc(), this->getTypeConverter()->convertType(type), device); @@ -895,7 +876,7 @@ class ArangeOpConversionPattern : public OpConversionPattern { DataTypeAttr dtypeAttr = rewriter.getAttr( elementTypeToDataType(outputType.getElementType())); - Value device = getOrInsertDevice(rewriter, op); + Value device = mlir::tt::ttnn::utils::getOrInsertDevice(rewriter, op); ttnn::MemoryConfigAttr 
memConfigAttr = rewriter.getAttr( diff --git a/lib/Dialect/TTNN/IR/CMakeLists.txt b/lib/Dialect/TTNN/IR/CMakeLists.txt index 4b7804a5fd..2fb004e0f3 100644 --- a/lib/Dialect/TTNN/IR/CMakeLists.txt +++ b/lib/Dialect/TTNN/IR/CMakeLists.txt @@ -4,6 +4,8 @@ add_mlir_dialect_library(MLIRTTNNDialect TTNNOps.cpp TTNNOpModelInterface.cpp TTNNOpsTypes.cpp + TTNNWorkaroundInterface.cpp + TTNNWorkarounds.cpp ADDITIONAL_HEADER_DIRS ${PROJECT_SOURCE_DIR}/include/ttmlir @@ -11,6 +13,7 @@ add_mlir_dialect_library(MLIRTTNNDialect DEPENDS MLIRTTNNOpsIncGen MLIRTTOpsIncGen + MLIRTTNNWorkaroundInterfaceIncGen TTNNOpModelLib LINK_LIBS PUBLIC diff --git a/lib/Dialect/TTNN/IR/TTNNOpsAttrs.cpp b/lib/Dialect/TTNN/IR/TTNNOpsAttrs.cpp index 10b54f418b..fc692b0f1d 100644 --- a/lib/Dialect/TTNN/IR/TTNNOpsAttrs.cpp +++ b/lib/Dialect/TTNN/IR/TTNNOpsAttrs.cpp @@ -452,3 +452,34 @@ TTNNLayoutAttr TTNNLayoutAttr::get( context, shardShape, elementType, bufferType); return get(context, linear, grid, memRefType, memLayout); } + +// Construct a new MemoryConfig +// +// This function creates a deep copy of the current MemoryConfigAttr and +// replaces the buffer type with the given one. +// +// param context The MLIR context. +// param buffer type The new buffer type. +// return The new MemoryConfigAttr with the given buffer type. +MemoryConfigAttr MemoryConfigAttr::withBufferType(::mlir::MLIRContext *context, + BufferType bufferType) { + return MemoryConfigAttr::get(context, getTensorMemoryLayout(), + BufferTypeAttr::get(context, bufferType), + getShardSpec()); +} + +// Construct a new MemoryConfig +// +// This function creates a deep copy of the current MemoryConfig and +// replaces the memory layout with the given one. +// +// param context The MLIR context. +// param memLayout The new memory layout. +// return The new MemoryConfig with the given memory layout. +MemoryConfigAttr +MemoryConfigAttr::withMemoryLayout(::mlir::MLIRContext *context, + TensorMemoryLayout memLayout) { + return MemoryConfigAttr::get(context, + TensorMemoryLayoutAttr::get(context, memLayout), + getBufferType(), getShardSpec()); +} diff --git a/lib/Dialect/TTNN/IR/TTNNWorkaroundInterface.cpp b/lib/Dialect/TTNN/IR/TTNNWorkaroundInterface.cpp new file mode 100644 index 0000000000..88d49a4545 --- /dev/null +++ b/lib/Dialect/TTNN/IR/TTNNWorkaroundInterface.cpp @@ -0,0 +1,90 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 +#include "ttmlir/Dialect/TTNN/IR/TTNNWorkaroundInterface.h" + +#include "ttmlir/Dialect/TTNN/IR/TTNNOps.h" +#include "ttmlir/Dialect/TTNN/IR/TTNNWorkarounds.h" +#include "ttmlir/Utils.h" + +#include "mlir/Interfaces/DestinationStyleOpInterface.h" +#include + +namespace mlir::tt::ttnn::wa { +#include "ttmlir/Dialect/TTNN/IR/TTNNWorkaroundInterface.cpp.inc" + +// Verifier function for TTNN Workaround Interface. +mlir::LogicalResult verifyTTNNWorkaroundInterface(mlir::Operation *op) { + + // Verify that the number of input and output operand workarounds is the same + // as the number of tensor operands and tensor results. 
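// Only ranked-tensor operands and results are counted below (via
// ttmlir::utils::isRankedTensor); non-tensor operands such as the !tt.device
// value that many TTNN ops take do not get a workaround entry.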
+ size_t cntTensorInputs = + llvm::count_if(op->getOperands(), ttmlir::utils::isRankedTensor); + size_t cntTensorResults = + llvm::count_if(op->getResults(), ttmlir::utils::isRankedTensor); + + TTNNWorkaroundInterface workaroundOp = + mlir::cast(op); + + TTNNOperandsWorkarounds workarounds = workaroundOp.getOperandsWorkarounds(); + + if (workarounds.getInputOperandWorkarounds().size() != cntTensorInputs) { + return op->emitOpError() + << "Number of input operand workarounds " + << workarounds.getInputOperandWorkarounds().size() + << " does not match the number of tensor inputs " << cntTensorInputs; + } + + if (workarounds.getOutputOperandWorkarounds().size() != cntTensorResults) { + return op->emitOpError() << "Number of output operand workarounds " + << " does not match the number of tensor results " + << cntTensorResults; + } + + // For DPS ops, verify that the output workaround is the same as the input + // init workaround. + if (mlir::isa(op)) { + DestinationStyleOpInterface dpsOp = + mlir::cast(op); + + // Go through all the operands and for each DPS init operand, check if the + // output workaround is the same. + int dpsDestinationIndex = 0; + for (int64_t i = 0; i < op->getNumOperands(); i++) { + OpOperand &operand = op->getOpOperand(i); + + // Skip if the output result isn't a tensor. + if (!ttmlir::utils::isRankedTensor(operand.get())) { + dpsDestinationIndex++; + continue; + } + + // Skip if the operand is not a DPS init. + if (!dpsOp.isDpsInit(&operand)) { + dpsDestinationIndex++; + continue; + } + + // Get the tied output result for the DPS destination operand. + OpResult tiedOutputResult = dpsOp.getTiedOpResult(&operand); + + // Check if the output workaround is the same as the input DPS destination + // workaround. + if (workarounds.getOutputOperandWorkarounds()[tiedOutputResult + .getResultNumber()] != + workarounds.getInputOperandWorkarounds()[dpsDestinationIndex]) { + return op->emitOpError() + << "DPS output workaround does not match " + "the input DPS destination operand workaround " + << tiedOutputResult.getResultNumber() << " and " + << dpsDestinationIndex; + } + + dpsDestinationIndex++; + } + } + + // All checks passed, return success. + return mlir::success(); +} +} // namespace mlir::tt::ttnn::wa diff --git a/lib/Dialect/TTNN/IR/TTNNWorkarounds.cpp b/lib/Dialect/TTNN/IR/TTNNWorkarounds.cpp new file mode 100644 index 0000000000..c1977747fb --- /dev/null +++ b/lib/Dialect/TTNN/IR/TTNNWorkarounds.cpp @@ -0,0 +1,70 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "ttmlir/Dialect/TTNN/IR/TTNNWorkarounds.h" + +#include "ttmlir/Utils.h" + +#include "llvm/ADT/SmallVector.h" + +namespace mlir::tt::ttnn::wa { + +// Operand workarounds factory method +TTNNOperandWorkarounds +TTNNOperandWorkarounds::createEmptyTTNNOperandWorkarounds() { + return TTNNOperandWorkarounds(); +} + +// Operands workarounds factory method +TTNNOperandsWorkarounds +TTNNOperandsWorkarounds::createEmptyTTNNOperandsWorkarounds(int inputSize, + int outputSize) { + llvm::SmallVector inputOperandWorkarounds( + inputSize, TTNNOperandWorkarounds::createEmptyTTNNOperandWorkarounds()); + llvm::SmallVector outputOperandWorkarounds( + outputSize, TTNNOperandWorkarounds::createEmptyTTNNOperandWorkarounds()); + return TTNNOperandsWorkarounds(inputOperandWorkarounds, + outputOperandWorkarounds); +} + +// Method to apply tensor workarounds. 
If the workaround is present, it +// applies the workaround, and returns both the target workaround argument and +// a flag indicating whether the workaround was applied. +WorkaroundResult applyWorkarounds(const TTNNOperandWorkarounds &workaround, + const TTNNLayoutAttr &inputLayoutAttr) { + WorkaroundResult result; + result.targetTensorLayoutResult.first = + workaround.tensorLayoutWorkaround.value_or(inputLayoutAttr.getLayout()); + result.targetTensorLayoutResult.second = + result.targetTensorLayoutResult.first != inputLayoutAttr.getLayout(); + + result.targetTensorBufferTypeResult.first = + workaround.tensorBufferTypeWorkaround.value_or( + inputLayoutAttr.getBufferType()); + result.targetTensorBufferTypeResult.second = + result.targetTensorBufferTypeResult.first != + inputLayoutAttr.getBufferType(); + + result.targetTensorMemoryLayoutResult.first = + workaround.tensorMemoryLayoutWorkaround.value_or( + inputLayoutAttr.getMemLayout()); + result.targetTensorMemoryLayoutResult.second = + result.targetTensorMemoryLayoutResult.first != + inputLayoutAttr.getMemLayout(); + + return result; +} + +// Operands workarounds factory method. +TTNNOperandsWorkarounds +TTNNOperandsWorkarounds::createEmptyTTNNOperandsWorkarounds(Operation *op) { + size_t tensorInputs = + llvm::count_if(op->getOperands(), ttmlir::utils::isRankedTensor); + size_t tensorResults = + llvm::count_if(op->getResults(), ttmlir::utils::isRankedTensor); + + return TTNNOperandsWorkarounds::createEmptyTTNNOperandsWorkarounds( + tensorInputs, tensorResults); +} +} // namespace mlir::tt::ttnn::wa diff --git a/lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp b/lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp index 3ade96bf82..4304b3481e 100644 --- a/lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp +++ b/lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp @@ -62,6 +62,14 @@ void createTTNNPipelineLoweringPasses( pm.addPass(mlir::createRemoveDeadValuesPass()); } +// Create a pass to workaround issues in the TTNN dialect. 
+void createTTNNPipelineWorkaroundPass( + OpPassManager &pm, const TTIRToTTNNBackendPipelineOptions &options) { + if (options.workaroundPassEnabled) { + pm.addPass(createTTNNWorkarounds()); + } +} + void createTTNNPipelineLayoutDecompositionPass( OpPassManager &pm, const TTIRToTTNNBackendPipelineOptions &options) { pm.addPass(createTTNNDecomposeLayouts()); @@ -124,6 +132,7 @@ void createTTIRToTTNNBackendPipeline( createTTNNPipelineTTIRPasses(pm, options); createTTNNPipelineTTIRBroadcastFoldPass(pm, options); createTTNNPipelineLoweringPasses(pm, options); + createTTNNPipelineWorkaroundPass(pm, options); createTTNNPipelineAnalysisPasses(pm, options); createTTNNPipelineLayoutDecompositionPass(pm, options); createTTNNPipelineDeallocPass(pm, options); diff --git a/lib/Dialect/TTNN/Transforms/CMakeLists.txt b/lib/Dialect/TTNN/Transforms/CMakeLists.txt index 3f075148b0..fd21e03d0c 100644 --- a/lib/Dialect/TTNN/Transforms/CMakeLists.txt +++ b/lib/Dialect/TTNN/Transforms/CMakeLists.txt @@ -1,8 +1,9 @@ add_mlir_dialect_library(MLIRTTNNTransforms - TTNNLayout.cpp - Passes.cpp Optimizer.cpp + Passes.cpp + TTNNLayout.cpp TTNNToCpp.cpp + TTNNWorkarounds.cpp ADDITIONAL_HEADER_DIRS ${PROJECT_SOURCE_DIR}/include/ttmlir diff --git a/lib/Dialect/TTNN/Transforms/TTNNWorkarounds.cpp b/lib/Dialect/TTNN/Transforms/TTNNWorkarounds.cpp new file mode 100644 index 0000000000..d3e40277b5 --- /dev/null +++ b/lib/Dialect/TTNN/Transforms/TTNNWorkarounds.cpp @@ -0,0 +1,399 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "ttmlir/Dialect/TTNN/Transforms/Passes.h" + +#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h" +#include "ttmlir/Dialect/TTNN/IR/TTNNOps.h" +#include "ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.h" +#include "ttmlir/Dialect/TTNN/IR/TTNNWorkarounds.h" +#include "ttmlir/Dialect/TTNN/Types/Types.h" +#include "ttmlir/Dialect/TTNN/Utils/TransformUtils.h" +#include "ttmlir/Dialect/TTNN/Utils/Utils.h" + +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/Value.h" +#include "mlir/IR/ValueRange.h" +#include "mlir/Interfaces/DestinationStyleOpInterface.h" +#include "mlir/Rewrite/FrozenRewritePatternSet.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "ttmlir/Utils.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/raw_ostream.h" + +#include +#include +#include + +namespace mlir::tt::ttnn { +#define GEN_PASS_DEF_TTNNWORKAROUNDS +#include "ttmlir/Dialect/TTNN/Transforms/Passes.h.inc" + +// Helper method to get the tensor layout attribute from the op operand. +static TTNNLayoutAttr getLayoutAttrFromOpOperand(OpOperand &opOperand) { + auto tensorType = mlir::cast(opOperand.get().getType()); + return mlir::cast(tensorType.getEncoding()); +} + +// Helper method to get the tensor layout attribute from the op result. +static TTNNLayoutAttr getLayoutAttrFromOpResult(OpResult &opResult) { + auto tensorType = mlir::cast(opResult.getType()); + return mlir::cast(tensorType.getEncoding()); +} + +// Helper method to get the element type for the given tensor layout and data. +static Type getElementType(MLIRContext *context, Layout tensorLayout, + DataType dataType) { + return tensorLayout == Layout::Tile + ? 
TileType::get(context, {ttnn::TILE_HEIGHT, ttnn::TILE_WIDTH}, + dataType) + : ttnn::utils::createRowMajorTypeFromDtype(context, dataType); +} + +// Helper method to insert a ToLayoutOp to convert the input operand to the +// desired tensor layout, buffer type and memory layout. +static mlir::Value +createToLayoutOp(wa::TTNNWorkaroundInterface &op, OpOperand &inputOperand, + PatternRewriter &rewriter, Layout targetTensorLayout, + BufferType targetTensorBufferType, + TensorMemoryLayout targetTensorMemoryLayout) { + TTNNLayoutAttr inputLayoutAttr = getLayoutAttrFromOpOperand(inputOperand); + + // Create element type based on tensor layout. + Type elementType = getElementType(rewriter.getContext(), targetTensorLayout, + inputLayoutAttr.getDataType()); + + // Create the output memory config attribute. + ttnn::MemoryConfigAttr outputMemConfigAttr = ttnn::MemoryConfigAttr::get( + rewriter.getContext(), + ttnn::TensorMemoryLayoutAttr::get(rewriter.getContext(), + targetTensorMemoryLayout), + ttnn::BufferTypeAttr::get(rewriter.getContext(), targetTensorBufferType), + ttnn::ShardSpecAttr::get( + op.getContext(), + ttnn::ShapeAttr::get(rewriter.getContext(), + inputLayoutAttr.getMemref().getShape()))); + + // Get the input operand type. + RankedTensorType inputOperandType = + mlir::cast(inputOperand.get().getType()); + + // Create a ToLayoutOp to convert the input operand to the desired + // tensor layout, buffer type and memory layout. + return rewriter + .create( + op.getLoc(), + ttnn::utils::createRankedTensorTypeWithEncoding( + inputOperandType, + inputLayoutAttr + .withElementType(rewriter.getContext(), elementType) + .withBufferType(rewriter.getContext(), targetTensorBufferType) + .withMemoryLayout(rewriter.getContext(), + targetTensorMemoryLayout)), + inputOperand.get(), + LayoutAttr::get(rewriter.getContext(), targetTensorLayout), + DataTypeAttr::get(rewriter.getContext(), + inputLayoutAttr.getDataType()), + outputMemConfigAttr, + (targetTensorBufferType == ttnn::BufferType::SystemMemory) + ? nullptr + : utils::getOrInsertDevice(rewriter, op)) + ->getResult(0); +} + +// Helper method to apply workarounds to an input operand. This method inserts a +// ToLayoutOp with the specified tensor layout, buffer type, and memory layout. +// It returns true if the workarounds were successfully applied. +static bool workaroundInputOperand( + OpOperand &inputOperand, const wa::TTNNOperandWorkarounds &inputWorkaround, + PatternRewriter &rewriter, wa::TTNNWorkaroundInterface op) { + // Get the current input tensor layout, buffer type and memory layout from the + // input operand. + TTNNLayoutAttr inputLayoutAttr = getLayoutAttrFromOpOperand(inputOperand); + + // Apply the workarounds on the input operand workaround arguments + wa::WorkaroundResult inputWorkaroundResult = + applyWorkarounds(inputWorkaround, inputLayoutAttr); + + // If there were no modifications by workarounds, return false. + if (!inputWorkaroundResult.modified()) { + return false; + } + + // Apply the workarounds on the input operand by inserting the ToLayoutOp with + // the desired tensor layout, buffer type and memory layout. + mlir::Value insertedToLayoutOpValue = createToLayoutOp( + op, inputOperand, rewriter, + inputWorkaroundResult.targetTensorLayoutResult.first, + inputWorkaroundResult.targetTensorBufferTypeResult.first, + inputWorkaroundResult.targetTensorMemoryLayoutResult.first); + + // Insert to layout op between the current op and the input operand + // to convert the input operand to the desired tensor layout, buffer type. 
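// Note: the operand is swapped through rewriter.modifyOpInPlace (rather than a
// bare setOperand) so the pattern driver is notified of the in-place update.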
+ rewriter.modifyOpInPlace(op, [&]() { + // Update the input operand with the new toLayout op operand. + op->setOperand(inputOperand.getOperandNumber(), insertedToLayoutOpValue); + }); + + return true; +} + +// Helper method to apply workarounds to output results. +// - For DPS results, this method only verifies that the output result matches +// the +// corresponding DPS destination operand. At this stage, DPS results should +// already be propagated. +// - For non-DPS operations, this method applies the necessary workarounds to +// the +// output result and returns true if the workarounds were successfully +// applied. +static bool workaroundOutputOperand( + OpResult &opResult, const wa::TTNNOperandWorkarounds &outputWorkaround, + PatternRewriter &rewriter, wa::TTNNWorkaroundInterface op) { + // Get the current output tensor layout, buffer type and memory layout from + // the input operand. + TTNNLayoutAttr opResultLayoutAttr = getLayoutAttrFromOpResult(opResult); + + // Apply the workarounds on the output result workaround arguments + wa::WorkaroundResult outputWorkaroundResult = + wa::applyWorkarounds(outputWorkaround, opResultLayoutAttr); + + // At this point, the DPS result should already be propagated, hence we only + // need to verify that the output workaround is in sync with the current DPS + // result. + assert(!(outputWorkaroundResult.modified() && + mlir::isa(op.getOperation())) && + "Output operand workarounds not supported for DPS ops"); + + // If there were no modifications by workarounds, return false. + if (!outputWorkaroundResult.modified()) { + return false; + } + + // Create the data type attribute. + Type elementType = + getElementType(rewriter.getContext(), + outputWorkaroundResult.targetTensorLayoutResult.first, + opResultLayoutAttr.getDataType()); + + // Get the input operand type. + RankedTensorType opResultType = + mlir::cast(opResult.getType()); + + // Create the new output result type with the updated tensor layout, buffer + // type and memory layout. + RankedTensorType newOutputResultType = + ttnn::utils::createRankedTensorTypeWithEncoding( + opResultType, + opResultLayoutAttr.withElementType(rewriter.getContext(), elementType) + .withBufferType( + rewriter.getContext(), + outputWorkaroundResult.targetTensorBufferTypeResult.first) + .withMemoryLayout( + rewriter.getContext(), + outputWorkaroundResult.targetTensorMemoryLayoutResult.first)); + + // Update the type of result with applied workarounds. + rewriter.modifyOpInPlace(op, [&]() { + opResult.setType(newOutputResultType); + + // Some ops defines attributes with tensor layout, buffer type and memory + // layout, hence we need to update the attributes as well. For example, + // the empty op defines layout and memory_config attributes. + if (outputWorkaroundResult.targetTensorLayoutResult.second && + op->getAttrDictionary().get("layout")) { + LayoutAttr updatedLayoutAttr = rewriter.getAttr( + outputWorkaroundResult.targetTensorLayoutResult.first); + op->setAttr("layout", updatedLayoutAttr); + } + + if ((outputWorkaroundResult.targetTensorBufferTypeResult.second || + outputWorkaroundResult.targetTensorMemoryLayoutResult.second) && + op->getAttrDictionary().get("memory_config")) { + + MemoryConfigAttr currentMemoryConfig = + mlir::cast(op->getAttr("memory_config")); + + // Create the output memory config attribute. + // Check if the buffer type got updated. 
+ if (outputWorkaroundResult.targetTensorBufferTypeResult.second) { + currentMemoryConfig = currentMemoryConfig.withBufferType( + rewriter.getContext(), + outputWorkaroundResult.targetTensorBufferTypeResult.first); + } + + // Check if the memory layout got updated. + if (outputWorkaroundResult.targetTensorMemoryLayoutResult.second) { + currentMemoryConfig = currentMemoryConfig.withMemoryLayout( + rewriter.getContext(), + outputWorkaroundResult.targetTensorMemoryLayoutResult.first); + } + + // Update the changed memory config attribute. + op->setAttr("memory_config", currentMemoryConfig); + } + }); + + return true; +} + +// Propagate the workaround changes for DPS input operands if they are applied +// in above graph transforms, either in a pattern for a current op, or in a +// pattern matched for a previous ops. +static bool propagateDpsInitChangesToDpsResults(wa::TTNNWorkaroundInterface &op, + PatternRewriter &rewriter) { + // Check if the op is a DPS op. + if (!mlir::isa(op.getOperation())) { + return false; + } + + bool modified = false; + + auto dpsOp = mlir::cast(op.getOperation()); + mlir::OperandRange dpsInits = dpsOp.getDpsInits(); + + // Iterate through all dps destination operands and propagate the changes if + // any. + for (size_t dpsInitIndex = 0; dpsInitIndex < dpsInits.size(); + dpsInitIndex++) { + OpOperand *dpsInit = dpsOp.getDpsInitOperand(dpsInitIndex); + OpResult tiedDpsResult = dpsOp.getTiedOpResult(dpsInit); + + // If the DPS destination is changed, update the DPS result as well. + if (tiedDpsResult.getType() != dpsInit->get().getType()) { + modified = true; + rewriter.modifyOpInPlace( + op, [&]() { tiedDpsResult.setType(dpsInit->get().getType()); }); + } + } + + return modified; +} + +// TTNNWorkaroundInterface rewriter applies workarounds to the operands of TTNN +// operations. TTNNWorkaroundInterface is an interface on TTNN_Op, so this +// pattern should match each op in the IR. +// +// The rewriter processes both input and output operands of TTNN operations: +// 1. **Input Operands**: The rewriter iterates through all input tensor +// operands and applies the necessary workarounds. +// - Workarounds are applied by inserting ToLayoutOp with the desired tensor +// layout, buffer type, and memory layout. +// 2. **DPS result propagation**: The rewriter propagates changes to tied DPS +// destination operands to ensure consistency with previous graph +// transformations, either in the current op match or previous op matches. +// 3. **Output Operands**: Output workarounds are applied only if the operation +// is not a DPS op. +// - At this stage, all DPS result changes should be propagated. An assertion +// ensures that the output result workaround matches +// the corresponding DPS output result. +// - Workarounds are applied by updating the output result type with the new +// tensor layout, buffer type, and memory layout. +// - For operations that define attributes with tensor layout, buffer type, +// and memory layout, these attributes are also updated. +// For example, the empty op defines layout and memory_config attributes. +class TTNNOperandsWorkaroundsRewriter + : public OpInterfaceRewritePattern { +public: + TTNNOperandsWorkaroundsRewriter(MLIRContext *ctx) + : OpInterfaceRewritePattern(ctx) {} + + LogicalResult matchAndRewrite(wa::TTNNWorkaroundInterface op, + PatternRewriter &rewriter) const final { + + // To layout op is a special case, we don't want to rewrite it. 
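// (The workaround rewrites themselves create ToLayoutOps via createToLayoutOp,
// so that op is excluded from matching here.)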
+ if (mlir::isa(op.getOperation())) { + return failure(); + } + + bool modified = false; + // Get the operands workarounds for the current operation. + wa::TTNNOperandsWorkarounds operandsWorkarounds = + op.getOperandsWorkarounds(); + + // Filter out all the input tensor operands. + auto inputTensorsOperands = + llvm::make_filter_range(op->getOpOperands(), [](OpOperand &v) { + return ttmlir::utils::isRankedTensor(v.get()); + }); + + // Apply workarounds to all input tensor operands. + llvm::for_each( + llvm::zip_equal(inputTensorsOperands, + operandsWorkarounds.getInputOperandWorkarounds()), + [&](std::tuple + pair) { + modified = std::get<1>(pair).hasAnyWorkaround() && + workaroundInputOperand(std::get<0>(pair), + std::get<1>(pair), rewriter, op); + }); + + // Propagate the workaround changes for DPS input operands to DPS results if + // they are applied in above graph transforms, either in a pattern for a + // current op, or in a pattern matched for a previous ops. + modified |= propagateDpsInitChangesToDpsResults(op, rewriter); + + // Filter out all the output tensor results. + auto outputTensorResults = + llvm::make_filter_range(op->getOpResults(), [](OpResult v) { + return ttmlir::utils::isRankedTensor(v); + }); + + // Apply workarounds to all output tensor results. + llvm::for_each( + llvm::zip_equal(outputTensorResults, + operandsWorkarounds.getOutputOperandWorkarounds()), + [&](std::tuple + pair) { + modified |= std::get<1>(pair).hasAnyWorkaround() && + workaroundOutputOperand(std::get<0>(pair), + std::get<1>(pair), rewriter, op); + }); + + // Return success if the transformations were applied. + return modified ? success() : failure(); + } +}; + +// Pass to apply workarounds to the operands of TTNN operations. +class TTNNWorkarounds : public impl::TTNNWorkaroundsBase { +public: + using impl::TTNNWorkaroundsBase::TTNNWorkaroundsBase; + + void runOnOperation() final { + { + // Placeholder for workaround decomposition patterns. + } + { + RewritePatternSet patterns(&getContext()); + patterns.add(&getContext()); + + FrozenRewritePatternSet patternSet(std::move(patterns)); + GreedyRewriteConfig config = GreedyRewriteConfig(); + // This configuration specifies that the rewriter should traverse the IR + // in a top-down order. + config.useTopDownTraversal = true; + // This configuration specifies the maximum number of iterations the + // rewriter will perform on the IR. The rewriter will iterate through the + // IR until a fixpoint is reached. All workarounds should be applied + // during the first iteration. If the workarounds are not applied in the + // first iteration, it indicates a bug in the workarounds implementation. + // Although the workarounds are applied in the first iteration, the + // rewriter must iterate through the IR once more to confirm that the + // fixpoint is reached. If the fixpoint is not reached in the second + // iteration, it indicates a bug in the workarounds implementation. 
+ config.maxIterations = 2; + if (failed(applyPatternsAndFoldGreedily(getOperation(), patternSet, + config))) { + signalPassFailure(); + return; + } + } + } +}; +} // namespace mlir::tt::ttnn diff --git a/lib/Dialect/TTNN/Utils/CMakeLists.txt b/lib/Dialect/TTNN/Utils/CMakeLists.txt index f78f418642..cad244c0b5 100644 --- a/lib/Dialect/TTNN/Utils/CMakeLists.txt +++ b/lib/Dialect/TTNN/Utils/CMakeLists.txt @@ -1,7 +1,9 @@ add_mlir_dialect_library(TTMLIRTTNNUtils - Utils.cpp OptimizerOverrides.cpp PassOverrides.cpp + TransformUtils.cpp + Utils.cpp + ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/TTNN diff --git a/lib/Dialect/TTNN/Utils/TransformUtils.cpp b/lib/Dialect/TTNN/Utils/TransformUtils.cpp new file mode 100644 index 0000000000..44b01e91b3 --- /dev/null +++ b/lib/Dialect/TTNN/Utils/TransformUtils.cpp @@ -0,0 +1,30 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "ttmlir/Dialect/TTNN/Utils/TransformUtils.h" + +#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h" +#include "ttmlir/Dialect/TTNN/IR/TTNNOps.h" + +namespace mlir::tt::ttnn::utils { +// Gets or inserts a GetDeviceOp at the top of the current block of the given +// operation. +Value getOrInsertDevice(PatternRewriter &rewriter, Operation *op) { + Block *block = op->getBlock(); + for (auto &op : block->getOperations()) { + if (auto deviceOp = dyn_cast(op)) { + return deviceOp.getResult(); + } + } + + DeviceAttr deviceAttr = getCurrentScopeDevice(op); + auto currentInsertionPoint = rewriter.saveInsertionPoint(); + rewriter.setInsertionPoint(block, block->begin()); + auto deviceOp = rewriter.create( + op->getLoc(), rewriter.getType(deviceAttr), + ttnn::MeshShapeAttr::get(op->getContext(), 1, 1)); + rewriter.restoreInsertionPoint(currentInsertionPoint); + return deviceOp.getResult(); +} +} // namespace mlir::tt::ttnn::utils diff --git a/lib/Dialect/TTNN/Utils/Utils.cpp b/lib/Dialect/TTNN/Utils/Utils.cpp index a0736219e6..0aa7b9272c 100644 --- a/lib/Dialect/TTNN/Utils/Utils.cpp +++ b/lib/Dialect/TTNN/Utils/Utils.cpp @@ -134,4 +134,12 @@ Type createRowMajorTypeFromDtype(::mlir::MLIRContext *context, DataType dtype) { } } +// Helper method to create a RankedTensorType with the given encoding +RankedTensorType +createRankedTensorTypeWithEncoding(RankedTensorType tensorType, + ttnn::TTNNLayoutAttr encoding) { + return RankedTensorType::get(tensorType.getShape(), + tensorType.getElementType(), encoding); +} + } // namespace mlir::tt::ttnn::utils diff --git a/test/ttmlir/Dialect/TTNN/Transforms/Workarounds/simple_workaround.mlir b/test/ttmlir/Dialect/TTNN/Transforms/Workarounds/simple_workaround.mlir new file mode 100644 index 0000000000..9eed399840 --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/Transforms/Workarounds/simple_workaround.mlir @@ -0,0 +1,31 @@ +// RUN: ttmlir-opt --ttnn-workaround %s | FileCheck %s +#device = #tt.device (0, d0, d1)>, l1Map = (d0, d1)[s0, s1] -> (0, d0 floordiv s0, d1 floordiv s1, (d0 mod s0) * s1 + d1 mod s1), dramMap = (d0, d1)[s0, s1] -> (0, 0, ((((d0 floordiv s0) * 8 + d1 floordiv s1) * (s1 * s0) + (d0 mod s0) * s1 + d1 mod s1) floordiv 8192) mod 12, (((d0 floordiv s0) * 8 + d1 floordiv s1) * (s1 * s0) + (d0 mod s0) * s1 + d1 mod s1) floordiv 98304 + (((d0 floordiv s0) * 8 + d1 floordiv s1) * (s1 * s0) + (d0 mod s0) * s1 + d1 mod s1) mod 8192), meshShape = , chipIds = [0]> +#dram = #ttnn.buffer_type +#system_memory = #ttnn.buffer_type +#ttnn_layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xf32, #system_memory>> 
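// Layout aliases used below: #ttnn_layout is host (system-memory) row-major,
// #ttnn_layout1 is device DRAM row-major, #ttnn_layout2 is device DRAM tiled.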
+#ttnn_layout1 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xf32, #dram>, interleaved> +#ttnn_layout2 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, f32>, #dram>, interleaved> +module attributes {tt.device = #device} { + func.func @forward(%arg0: tensor<64x128xf32, #ttnn_layout>) -> tensor<64x128xf32, #ttnn_layout> { + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + // CHECK: %[[DEVICE_OP:.*]] = "ttnn.get_device"[[C:.*]] + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, #dram, <<2x4>>>}> : (tensor<64x128xf32, #ttnn_layout>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> + // CHECK-NEXT: %[[RM_DEVICE_LAYOUT_OP:.*]] = "ttnn.to_layout"(%arg0, %[[DEVICE_OP]]) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-SAME: -> tensor<64x128xf32, #ttnn_layout1> + %2 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, #dram, <<64x128>>>, shape = #ttnn.shape<64x128>}> : (!tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout2> + // CHECK-NEXT: %[[EMPTY_OP:.*]] = "ttnn.empty"(%[[DEVICE_OP]]) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-SAME: memory_config = #ttnn.memory_config<, #dram, <<64x128>>> + // CHECK-SAME: -> tensor<64x128xf32, #ttnn_layout1> + %3 = "ttnn.abs"(%1, %2) <{operandSegmentSizes = array}> : (tensor<64x128xf32, #ttnn_layout1>, tensor<64x128xf32, #ttnn_layout2>) -> tensor<64x128xf32, #ttnn_layout2> + // CHECK-NEXT: %[[TO_LAYOUT_LEFT:.*]] = "ttnn.to_layout"(%[[RM_DEVICE_LAYOUT_OP]], %[[DEVICE_OP]]) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-SAME: -> tensor<64x128xf32, #ttnn_layout2> + // CHECK-NEXT: %[[TO_LAYOUT_RIGHT:.*]] = "ttnn.to_layout"(%[[EMPTY_OP]], %[[DEVICE_OP]]) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-SAME: -> tensor<64x128xf32, #ttnn_layout2> + %4 = "ttnn.to_layout"(%3) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, #system_memory, <<64x128>>>}> : (tensor<64x128xf32, #ttnn_layout2>) -> tensor<64x128xf32, #ttnn_layout> + return %4 : tensor<64x128xf32, #ttnn_layout> + } +} From 54ed7f8fab322a54eeec8e83a03254b1be590eb3 Mon Sep 17 00:00:00 2001 From: Usman Aziz Date: Wed, 4 Dec 2024 19:28:40 +0500 Subject: [PATCH 51/84] Implement conversions for stablehlo logistic, tan, tanh and log ops (#1122) Implement conversions for stablehlo logistic, tan, tanh and log ops. Add tests. 
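Each op follows the same wiring: TTIR and TTNN op definitions in TableGen, a
StableHLO-to-TTIR conversion pattern, a TTIR-to-TTNN conversion pattern, a new
EltwiseOpType value in the flatbuffer schema, and a runtime dispatch to the
matching ttnn callable. As a rough sketch of that recipe for some future unary
op (illustrative only: `Log1pOp`, `::ttnn::log1p`, and the exact template
arguments are assumptions, not part of this change), the two C++ hook points
look roughly like:

// In populateTTIRToTTNNPatterns (lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp):
// register the elementwise lowering for the hypothetical op.
patterns.add<ElementwiseOpConversionPattern<ttir::Log1pOp, ttnn::Log1pOp>>(
    typeConverter, ctx);

// In runtime/lib/ttnn/operations/eltwise/unary/unary.cpp: map the new
// flatbuffer enum value onto the ttnn callable inside the existing switch.
case ::tt::target::ttnn::EltwiseOpType::Log1p: {
  runEltwiseUnaryOp(op, tensorPool, ::ttnn::log1p);
  break;
}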
--- include/ttmlir/Dialect/TTIR/IR/TTIROps.td | 14 +++++++++++ include/ttmlir/Dialect/TTNN/IR/TTNNOps.td | 14 +++++++++++ include/ttmlir/Target/TTNN/program.fbs | 4 ++- .../StableHLOToTTIRPatterns.cpp | 9 +++++++ lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp | 4 ++- lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp | 2 ++ lib/Target/TTNN/TTNNToFlatbuffer.cpp | 12 +++++++++ .../ttnn/operations/eltwise/unary/unary.cpp | 8 ++++++ .../StableHLOToTTIR/unary/log_op.mlir | 10 ++++++++ .../StableHLOToTTIR/unary/logit_op.mlir | 10 ++++++++ .../StableHLOToTTIR/unary/tan_op.mlir | 10 ++++++++ .../StableHLOToTTIR/unary/tanh_op.mlir | 10 ++++++++ .../TTNN/eltwise/unary/tan/simple_tan.mlir | 12 +++++++++ .../TTNN/eltwise/unary/tanh/simple_tanh.mlir | 12 +++++++++ .../TTNN/perf_unit/test_perf_ceil.mlir | 4 +-- .../TTNN/perf_unit/test_perf_cosine.mlir | 4 +-- .../Silicon/TTNN/perf_unit/test_perf_log.mlir | 6 ++--- .../TTNN/perf_unit/test_perf_sine.mlir | 4 +-- .../Silicon/TTNN/perf_unit/test_perf_tan.mlir | 13 ++++++++++ .../TTNN/perf_unit/test_perf_tanh.mlir | 13 ++++++++++ test/ttmlir/Silicon/TTNN/simple_eltwise.mlir | 25 ++++++++++++++++--- 21 files changed, 186 insertions(+), 14 deletions(-) create mode 100644 test/ttmlir/Conversion/StableHLOToTTIR/unary/log_op.mlir create mode 100644 test/ttmlir/Conversion/StableHLOToTTIR/unary/logit_op.mlir create mode 100644 test/ttmlir/Conversion/StableHLOToTTIR/unary/tan_op.mlir create mode 100644 test/ttmlir/Conversion/StableHLOToTTIR/unary/tanh_op.mlir create mode 100644 test/ttmlir/Dialect/TTNN/eltwise/unary/tan/simple_tan.mlir create mode 100644 test/ttmlir/Dialect/TTNN/eltwise/unary/tanh/simple_tanh.mlir create mode 100644 test/ttmlir/Silicon/TTNN/perf_unit/test_perf_tan.mlir create mode 100644 test/ttmlir/Silicon/TTNN/perf_unit/test_perf_tanh.mlir diff --git a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td index f55b3acbde..53976e56a5 100644 --- a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td +++ b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td @@ -292,6 +292,20 @@ def TTIR_NegOp: TTIR_ElementwiseUnaryOp<"neg"> { }]; } +def TTIR_TanOp: TTIR_ElementwiseUnaryOp<"tan"> { + let summary = "Eltwise tan op."; + let description = [{ + Eltwise tan operation. + }]; +} + +def TTIR_TanhOp: TTIR_ElementwiseUnaryOp<"tanh"> { + let summary = "Eltwise tanh op."; + let description = [{ + Eltwise tanh operation. + }]; +} + def TTIR_ReciprocalOp : TTIR_ElementwiseUnaryOp<"reciprocal"> { let summary = "Eltwise reciprocal."; let description = [{ diff --git a/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td b/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td index 94ca8cee3a..d918691611 100644 --- a/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td +++ b/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td @@ -262,6 +262,20 @@ def TTNN_NegOp : TTNN_ElementwiseUnaryOp<"neg"> { }]; } +def TTNN_TanOp: TTNN_ElementwiseUnaryOp<"tan"> { + let summary = "Eltwise tan op."; + let description = [{ + Eltwise tan operation. + }]; +} + +def TTNN_TanhOp: TTNN_ElementwiseUnaryOp<"tanh"> { + let summary = "Eltwise tanh op."; + let description = [{ + Eltwise tanh operation. 
+ }]; +} + def TTNN_ReciprocalOp : TTNN_ElementwiseUnaryOp<"reciprocal"> { let summary = "Eltwise reciprocal."; let description = [{ diff --git a/include/ttmlir/Target/TTNN/program.fbs b/include/ttmlir/Target/TTNN/program.fbs index 19b1dbc92a..e8d349a495 100644 --- a/include/ttmlir/Target/TTNN/program.fbs +++ b/include/ttmlir/Target/TTNN/program.fbs @@ -112,7 +112,9 @@ enum EltwiseOpType: uint32 { LogicalXor, Clamp, LeakyRelu, - Scatter + Scatter, + Tan, + Tanh } table ClampOpParams { diff --git a/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp b/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp index d81b6e2149..4b08f7b6e1 100644 --- a/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp +++ b/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp @@ -1833,6 +1833,15 @@ void addElementwiseUnaryOpsConversionPatterns(MLIRContext *ctx, mlir::stablehlo::Expm1Op, mlir::tt::ttir::Expm1Op>>(typeConverter, ctx); patterns.add>(typeConverter, ctx); + patterns.add>(typeConverter, + ctx); + patterns.add>(typeConverter, ctx); + patterns.add>(typeConverter, ctx); + patterns.add>(typeConverter, ctx); } void addElementwiseBinaryOpsConversionPatterns(MLIRContext *ctx, diff --git a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp index 3c03592fc1..4fff567c1a 100644 --- a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp +++ b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp @@ -956,10 +956,12 @@ void populateTTIRToTTNNPatterns(MLIRContext *ctx, RewritePatternSet &patterns, ElementwiseOpConversionPattern, ElementwiseOpConversionPattern, ElementwiseOpConversionPattern, - ElementwiseUnaryWithFloatParameterOpConversionPattern, + ElementwiseOpConversionPattern, + ElementwiseOpConversionPattern, ReductionOpConversionPattern, ReductionOpConversionPattern, ReductionOpConversionPattern, + ElementwiseUnaryWithFloatParameterOpConversionPattern, EmbeddingOpConversionPattern, SoftmaxOpConversionPattern, TransposeOpConversionPattern, diff --git a/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp b/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp index f04d5566b9..ff16ed0b17 100644 --- a/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp +++ b/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp @@ -694,6 +694,8 @@ void populateTTNNToEmitCPatterns(mlir::MLIRContext *ctx, DefaultOpConversionPattern, DefaultOpConversionPattern, DefaultOpConversionPattern, + DefaultOpConversionPattern, + DefaultOpConversionPattern, DefaultOpConversionPattern>(typeConverter, ctx); // Eltwise binary ops diff --git a/lib/Target/TTNN/TTNNToFlatbuffer.cpp b/lib/Target/TTNN/TTNNToFlatbuffer.cpp index 9706880e38..65c5b1d5ce 100644 --- a/lib/Target/TTNN/TTNNToFlatbuffer.cpp +++ b/lib/Target/TTNN/TTNNToFlatbuffer.cpp @@ -556,6 +556,10 @@ createEltwiseOp(FlatbufferObjectCache &cache, EltwiseOp op) { ::tt::target::ttnn::EltwiseOpWithFloatParams>( cache, op) .Union(); + } else if constexpr (std::is_same_v) { + type = ::tt::target::ttnn::EltwiseOpType::Tan; + } else if constexpr (std::is_same_v) { + type = ::tt::target::ttnn::EltwiseOpType::Tanh; } else { llvm_unreachable("unhandled EltwiseOp"); } @@ -959,6 +963,14 @@ emitTTNNOperation(FlatbufferObjectCache &cache, Operation *op, return createOperation(cache, createOp(cache, arangeOp), debugString, locInfo); } + if (auto tanOp = dyn_cast(op); tanOp) { + return createOperation(cache, createEltwiseOp(cache, tanOp), debugString, + locInfo); + } + if (auto tanhOp = dyn_cast(op); tanhOp) { + return createOperation(cache, createEltwiseOp(cache, tanhOp), debugString, + locInfo); + } 
llvm_unreachable("unhandled op in emitTTNNOperation"); } diff --git a/runtime/lib/ttnn/operations/eltwise/unary/unary.cpp b/runtime/lib/ttnn/operations/eltwise/unary/unary.cpp index 50c53f8dbe..d24dc24f8d 100644 --- a/runtime/lib/ttnn/operations/eltwise/unary/unary.cpp +++ b/runtime/lib/ttnn/operations/eltwise/unary/unary.cpp @@ -127,6 +127,14 @@ void run(const ::tt::target::ttnn::EltwiseOp *op, ProgramContext &context) { runEltwiseUnaryOp(op, tensorPool, ::ttnn::sign); break; } + case ::tt::target::ttnn::EltwiseOpType::Tan: { + runEltwiseUnaryOp(op, tensorPool, ::ttnn::tan); + break; + } + case ::tt::target::ttnn::EltwiseOpType::Tanh: { + runEltwiseUnaryOp(op, tensorPool, ::ttnn::tanh); + break; + } case ::tt::target::ttnn::EltwiseOpType::Exp: { runEltwiseUnaryWithFastAndApproximateModeOp(op, tensorPool, ::ttnn::exp); break; diff --git a/test/ttmlir/Conversion/StableHLOToTTIR/unary/log_op.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/unary/log_op.mlir new file mode 100644 index 0000000000..702bc155da --- /dev/null +++ b/test/ttmlir/Conversion/StableHLOToTTIR/unary/log_op.mlir @@ -0,0 +1,10 @@ +// REQUIRES: stablehlo +// RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | FileCheck %s +module @jit_eltwise_log attributes {} { + func.func public @test_log(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + %0 = stablehlo.log %arg0 : tensor<13x21x3xf32> + // CHECK: [[VAL0:%[0-9]+]] = tensor.empty() : [[TENSOR_SIZE:tensor<[0-9]+x[0-9]+x[0-9]+xf[0-9]+>]] + // CHECK: [[VAL1:%[0-9]+]] = "ttir.log"(%arg0, [[VAL0]]) <{operandSegmentSizes = array, operand_constraints = [#any_device_tile, #any_device_tile]}> : ([[TENSOR_SIZE]], [[TENSOR_SIZE]]) -> [[TENSOR_SIZE]] + return %0 : tensor<13x21x3xf32> + } +} diff --git a/test/ttmlir/Conversion/StableHLOToTTIR/unary/logit_op.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/unary/logit_op.mlir new file mode 100644 index 0000000000..48c64d12d4 --- /dev/null +++ b/test/ttmlir/Conversion/StableHLOToTTIR/unary/logit_op.mlir @@ -0,0 +1,10 @@ +// REQUIRES: stablehlo +// RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | FileCheck %s +module @jit_eltwise_logit attributes {} { + func.func public @test_logit(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + %0 = stablehlo.logistic %arg0 : tensor<13x21x3xf32> + // CHECK: [[VAL0:%[0-9]+]] = tensor.empty() : [[TENSOR_SIZE:tensor<[0-9]+x[0-9]+x[0-9]+xf[0-9]+>]] + // CHECK: [[VAL1:%[0-9]+]] = "ttir.sigmoid"(%arg0, [[VAL0]]) <{operandSegmentSizes = array, operand_constraints = [#any_device_tile, #any_device_tile]}> : ([[TENSOR_SIZE]], [[TENSOR_SIZE]]) -> [[TENSOR_SIZE]] + return %0 : tensor<13x21x3xf32> + } +} diff --git a/test/ttmlir/Conversion/StableHLOToTTIR/unary/tan_op.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/unary/tan_op.mlir new file mode 100644 index 0000000000..77b8f3b8bc --- /dev/null +++ b/test/ttmlir/Conversion/StableHLOToTTIR/unary/tan_op.mlir @@ -0,0 +1,10 @@ +// REQUIRES: stablehlo +// RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | FileCheck %s +module @jit_eltwise_tan attributes {} { + func.func public @test_tan(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + %0 = stablehlo.tan %arg0 : tensor<13x21x3xf32> + // CHECK: [[VAL0:%[0-9]+]] = tensor.empty() : [[TENSOR_SIZE:tensor<[0-9]+x[0-9]+x[0-9]+xf[0-9]+>]] + // CHECK: [[VAL1:%[0-9]+]] = "ttir.tan"(%arg0, [[VAL0]]) <{operandSegmentSizes = array, operand_constraints = [#any_device_tile, #any_device_tile]}> : ([[TENSOR_SIZE]], [[TENSOR_SIZE]]) -> [[TENSOR_SIZE]] + return %0 : tensor<13x21x3xf32> + } +} diff --git 
a/test/ttmlir/Conversion/StableHLOToTTIR/unary/tanh_op.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/unary/tanh_op.mlir new file mode 100644 index 0000000000..5d420c43c5 --- /dev/null +++ b/test/ttmlir/Conversion/StableHLOToTTIR/unary/tanh_op.mlir @@ -0,0 +1,10 @@ +// REQUIRES: stablehlo +// RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | FileCheck %s +module @jit_eltwise_tanh attributes {} { + func.func public @test_tanh(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + %0 = stablehlo.tanh %arg0 : tensor<13x21x3xf32> + // CHECK: [[VAL0:%[0-9]+]] = tensor.empty() : [[TENSOR_SIZE:tensor<[0-9]+x[0-9]+x[0-9]+xf[0-9]+>]] + // CHECK: [[VAL1:%[0-9]+]] = "ttir.tanh"(%arg0, [[VAL0]]) <{operandSegmentSizes = array, operand_constraints = [#any_device_tile, #any_device_tile]}> : ([[TENSOR_SIZE]], [[TENSOR_SIZE]]) -> [[TENSOR_SIZE]] + return %0 : tensor<13x21x3xf32> + } +} diff --git a/test/ttmlir/Dialect/TTNN/eltwise/unary/tan/simple_tan.mlir b/test/ttmlir/Dialect/TTNN/eltwise/unary/tan/simple_tan.mlir new file mode 100644 index 0000000000..8ae9f0bec1 --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/eltwise/unary/tan/simple_tan.mlir @@ -0,0 +1,12 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline %s | FileCheck %s +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<64x128xf32>) -> tensor<64x128xf32> { + %0 = tensor.empty() : tensor<64x128xf32> + // CHECK: [[VAL0:%[0-9]+]] = "ttnn.empty"(%{{[0-9]+}}) <{dtype = {{.*}}, layout = {{.*}}, memory_config = {{.*}}, <{{.*}}>>, shape = #ttnn.shape<[[TENSOR_SHAPE:[0-9]+x[0-9]+]]>}> + %1 = "ttir.tan"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> + // CHECK: %{{[0-9]+}} = "ttnn.tan"(%{{[0-9]+}}, [[VAL0]]) <{operandSegmentSizes = array}> : (tensor<[[TENSOR_SHAPE]]x{{.*}}, {{.*}}>, tensor<[[TENSOR_SHAPE]]x{{.*}}, {{.*}}) -> tensor<[[TENSOR_SHAPE]]x{{.*}}, {{.*}}> + return %1 : tensor<64x128xf32> + // CHECK: return %{{[0-9]+}} : tensor<[[TENSOR_SHAPE]]xf32, {{.*}}> + } +} diff --git a/test/ttmlir/Dialect/TTNN/eltwise/unary/tanh/simple_tanh.mlir b/test/ttmlir/Dialect/TTNN/eltwise/unary/tanh/simple_tanh.mlir new file mode 100644 index 0000000000..351476448a --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/eltwise/unary/tanh/simple_tanh.mlir @@ -0,0 +1,12 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline %s | FileCheck %s +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<64x128xf32>) -> tensor<64x128xf32> { + %0 = tensor.empty() : tensor<64x128xf32> + // CHECK: [[VAL0:%[0-9]+]] = "ttnn.empty"(%{{[0-9]+}}) <{dtype = {{.*}}, layout = {{.*}}, memory_config = {{.*}}, <{{.*}}>>, shape = #ttnn.shape<[[TENSOR_SHAPE:[0-9]+x[0-9]+]]>}> + %1 = "ttir.tanh"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> + // CHECK: %{{[0-9]+}} = "ttnn.tanh"(%{{[0-9]+}}, [[VAL0]]) <{operandSegmentSizes = array}> : (tensor<[[TENSOR_SHAPE]]x{{.*}}, {{.*}}>, tensor<[[TENSOR_SHAPE]]x{{.*}}, {{.*}}) -> tensor<[[TENSOR_SHAPE]]x{{.*}}, {{.*}}> + return %1 : tensor<64x128xf32> + // CHECK: return %{{[0-9]+}} : tensor<[[TENSOR_SHAPE]]xf32, {{.*}}> + } +} diff --git a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_ceil.mlir b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_ceil.mlir index c31c789f44..2e7f55428c 100644 --- a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_ceil.mlir +++ 
b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_ceil.mlir @@ -5,9 +5,9 @@ #any_device_tile = #tt.operand_constraint func.func @ceil(%arg0: tensor<64x128xf32>) -> tensor<64x128xf32> { - // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] %0 = tensor.empty() : tensor<64x128xf32> - // CHECK: %[[C:.*]] = "ttnn.ceil"[[C:.*]] + // CHECK: [[VAL0:%[0-9]+]] = "ttnn.empty"(%{{[0-9]+}}) + // CHECK: %{{[0-9]+}} = "ttnn.ceil"(%{{[0-9]+}}, [[VAL0]]) %1 = "ttir.ceil"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> return %1 : tensor<64x128xf32> } diff --git a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_cosine.mlir b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_cosine.mlir index 91a7fea47d..ede823439e 100644 --- a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_cosine.mlir +++ b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_cosine.mlir @@ -5,9 +5,9 @@ #any_device_tile = #tt.operand_constraint func.func @cosine(%arg0: tensor<64x128xf32>) -> tensor<64x128xf32> { - // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] %0 = tensor.empty() : tensor<64x128xf32> - // CHECK: %[[C:.*]] = "ttnn.cos"[[C:.*]] + // CHECK: [[VAL0:%[0-9]+]] = "ttnn.empty"(%{{[0-9]+}}) + // CHECK: %{{[0-9]+}} = "ttnn.cos"(%{{[0-9]+}}, [[VAL0]]) %1 = "ttir.cos"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> return %1 : tensor<64x128xf32> } diff --git a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_log.mlir b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_log.mlir index b1ca157c61..b3de1bba4d 100644 --- a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_log.mlir +++ b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_log.mlir @@ -4,10 +4,10 @@ #any_device = #tt.operand_constraint #any_device_tile = #tt.operand_constraint -func.func @sqrt(%arg0: tensor<64x128xf32>) -> tensor<64x128xf32> { - // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] +func.func @log(%arg0: tensor<64x128xf32>) -> tensor<64x128xf32> { %0 = tensor.empty() : tensor<64x128xf32> - // CHECK: %[[C:.*]] = "ttnn.log"[[C:.*]] + // CHECK: [[VAL0:%[0-9]+]] = "ttnn.empty"(%{{[0-9]+}}) + // CHECK: %{{[0-9]+}} = "ttnn.log"(%{{[0-9]+}}, [[VAL0]]) %1 = "ttir.log"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> return %1 : tensor<64x128xf32> } diff --git a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_sine.mlir b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_sine.mlir index e72d57ffa9..36f71d8e6a 100644 --- a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_sine.mlir +++ b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_sine.mlir @@ -5,9 +5,9 @@ #any_device_tile = #tt.operand_constraint func.func @sine(%arg0: tensor<64x128xf32>) -> tensor<64x128xf32> { - // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] %0 = tensor.empty() : tensor<64x128xf32> - // CHECK: %[[C:.*]] = "ttnn.sin"[[C:.*]] + // CHECK: [[VAL0:%[0-9]+]] = "ttnn.empty"(%{{[0-9]+}}) + // CHECK: %{{[0-9]+}} = "ttnn.sin"(%{{[0-9]+}}, [[VAL0]]) %1 = "ttir.sin"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> return %1 : tensor<64x128xf32> } diff --git a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_tan.mlir b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_tan.mlir new file mode 100644 index 0000000000..aa7b972983 --- /dev/null +++ 
b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_tan.mlir @@ -0,0 +1,13 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir +// RUN: FileCheck %s --input-file=%t.mlir +// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +#any_device = #tt.operand_constraint +#any_device_tile = #tt.operand_constraint + +func.func @tan(%arg0: tensor<64x128xf32>) -> tensor<64x128xf32> { + %0 = tensor.empty() : tensor<64x128xf32> + // CHECK: [[VAL0:%[0-9]+]] = "ttnn.empty"(%{{[0-9]+}}) + // CHECK: %{{[0-9]+}} = "ttnn.tan"(%{{[0-9]+}}, [[VAL0]]) + %1 = "ttir.tan"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> + return %1 : tensor<64x128xf32> +} diff --git a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_tanh.mlir b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_tanh.mlir new file mode 100644 index 0000000000..ecb7266c96 --- /dev/null +++ b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_tanh.mlir @@ -0,0 +1,13 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir +// RUN: FileCheck %s --input-file=%t.mlir +// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +#any_device = #tt.operand_constraint +#any_device_tile = #tt.operand_constraint + +func.func @tanh(%arg0: tensor<64x128xf32>) -> tensor<64x128xf32> { + %0 = tensor.empty() : tensor<64x128xf32> + // CHECK: [[VAL0:%[0-9]+]] = "ttnn.empty"(%{{[0-9]+}}) + // CHECK: %{{[0-9]+}} = "ttnn.tanh"(%{{[0-9]+}}, [[VAL0]]) + %1 = "ttir.tanh"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> + return %1 : tensor<64x128xf32> +} diff --git a/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir b/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir index 2674a66fdf..b0fb94cc6d 100644 --- a/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir +++ b/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir @@ -14,7 +14,8 @@ func.func @add(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<6 func.func @ceil(%arg0: tensor<32x32xf32>) -> tensor<32x32xf32> { %0 = tensor.empty() : tensor<32x32xf32> - // CHECK: %[[C:.*]] = "ttnn.ceil"[[C:.*]] + // CHECK: [[VAL0:%[0-9]+]] = "ttnn.empty"(%{{[0-9]+}}) + // CHECK: %{{[0-9]+}} = "ttnn.ceil"(%{{[0-9]+}}, [[VAL0]]) %1 = "ttir.ceil"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<32x32xf32>, tensor<32x32xf32>) -> tensor<32x32xf32> return %1 : tensor<32x32xf32> } @@ -40,7 +41,8 @@ func.func @concat(%arg0: tensor<32x32xf32>, %arg1: tensor<32x64xf32>) -> tensor< func.func @cosine(%arg0: tensor<32x32xf32>) -> tensor<32x32xf32> { %0 = tensor.empty() : tensor<32x32xf32> - // CHECK: %[[C:.*]] = "ttnn.cos"[[C:.*]] + // CHECK: [[VAL0:%[0-9]+]] = "ttnn.empty"(%{{[0-9]+}}) + // CHECK: %{{[0-9]+}} = "ttnn.cos"(%{{[0-9]+}}, [[VAL0]]) %1 = "ttir.cos"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<32x32xf32>, tensor<32x32xf32>) -> tensor<32x32xf32> return %1 : tensor<32x32xf32> } @@ -193,7 +195,8 @@ func.func @sqrt(%arg0: tensor<64x128xf32>) -> tensor<64x128xf32> { func.func @sine(%arg0: tensor<32x32xf32>) -> tensor<32x32xf32> { %0 = tensor.empty() : tensor<32x32xf32> - // CHECK: %[[C:.*]] = "ttnn.sin"[[C:.*]] + // CHECK: [[VAL0:%[0-9]+]] = "ttnn.empty"(%{{[0-9]+}}) + // CHECK: %{{[0-9]+}} = "ttnn.sin"(%{{[0-9]+}}, [[VAL0]]) %1 = 
"ttir.sin"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<32x32xf32>, tensor<32x32xf32>) -> tensor<32x32xf32> return %1 : tensor<32x32xf32> } @@ -301,6 +304,22 @@ func.func @gelu(%arg0: tensor<64x128xf32>) -> tensor<64x128xf32> { return %1 : tensor<64x128xf32> } +func.func @tan(%arg0: tensor<64x128xbf16>) -> tensor<64x128xbf16> { + %0 = tensor.empty() : tensor<64x128xbf16> + // CHECK: [[VAL0:%[0-9]+]] = "ttnn.empty"(%{{[0-9]+}}) + // CHECK: %{{[0-9]+}} = "ttnn.tan"(%{{[0-9]+}}, [[VAL0]]) + %1 = "ttir.tan"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xbf16>, tensor<64x128xbf16>) -> tensor<64x128xbf16> + return %1 : tensor<64x128xbf16> +} + +func.func @tanh(%arg0: tensor<64x128xbf16>) -> tensor<64x128xbf16> { + %0 = tensor.empty() : tensor<64x128xbf16> + // CHECK: [[VAL0:%[0-9]+]] = "ttnn.empty"(%{{[0-9]+}}) + // CHECK: %{{[0-9]+}} = "ttnn.tanh"(%{{[0-9]+}}, [[VAL0]]) + %1 = "ttir.tanh"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xbf16>, tensor<64x128xbf16>) -> tensor<64x128xbf16> + return %1 : tensor<64x128xbf16> +} + func.func @addint32(%arg0: tensor<64x128xi32>, %arg1: tensor<64x128xi32>) -> tensor<64x128xi32> { %0 = tensor.empty() : tensor<64x128xi32> %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x128xi32>, tensor<64x128xi32>, tensor<64x128xi32>) -> tensor<64x128xi32> From 0aab97b249ff9d3a54d142394a2a237d8fb35749 Mon Sep 17 00:00:00 2001 From: Vraj Prajapati Date: Wed, 4 Dec 2024 08:55:26 -0600 Subject: [PATCH 52/84] Integrate Perf Data into TT-Explorer (#1407) * Non-Runtime build fix * Interim Update: Node Data Generation * Removed redundant print messages * Reviewed fixes --- .../tt_adapter/src/tt_adapter/main.py | 4 +-- .../tt_adapter/src/tt_adapter/mlir.py | 30 ++++++++++++++-- .../tt_adapter/src/tt_adapter/runner.py | 34 +++++++++++++++++-- .../tt_adapter/src/tt_adapter/utils.py | 9 +++++ 4 files changed, 70 insertions(+), 7 deletions(-) diff --git a/tools/explorer/tt_adapter/src/tt_adapter/main.py b/tools/explorer/tt_adapter/src/tt_adapter/main.py index d0c49b7af2..3876a09112 100644 --- a/tools/explorer/tt_adapter/src/tt_adapter/main.py +++ b/tools/explorer/tt_adapter/src/tt_adapter/main.py @@ -70,9 +70,9 @@ def execute( memory_layout_analysis_enabled = False memory_layout_analysis_policy = None - ttnn_ir = self.model_runner.run( + perf_data = self.model_runner.run( model_path, memory_layout_analysis_enabled, memory_layout_analysis_policy ) # TODO(odjuricic, #933) Parse TTNN IR and return the post optimized graph. - return {"graphs": []} + return utils.to_adapter_format({"perf_data": perf_data}) diff --git a/tools/explorer/tt_adapter/src/tt_adapter/mlir.py b/tools/explorer/tt_adapter/src/tt_adapter/mlir.py index b9ae471ca5..6b064b1558 100644 --- a/tools/explorer/tt_adapter/src/tt_adapter/mlir.py +++ b/tools/explorer/tt_adapter/src/tt_adapter/mlir.py @@ -12,7 +12,12 @@ def get_loc_str(loc): try: - res = str(loc).split('"')[1] + # Constant loc( at the start of the location and ) at the end. Can just strip these characters + loc = str(loc) + if loc.startswith("loc(") and loc.endswith(")"): + res = str(loc)[4:-1] + else: + res = loc # This is a fallback to just visualize / see what the loc is if not processable. 
except: res = "unknown" return res @@ -471,6 +476,21 @@ def make_constant_node(self, name_dict, constant_name): ] +def get_locs(module): + name_dict = defaultdict(int) + + for op in module.body.operations: + for region in op.regions: + for block in region.blocks: + for op in block.operations: + op = OpHandler(op) + _id = op.get_id(name_dict) + # This will now populate name_dict with all of the locations that are relevant + + # The keys will be all the unique locations, and the values will be the number of times that location appears + return name_dict + + def build_graph(module): name_dict = defaultdict(int) output_connections = defaultdict(int) @@ -479,7 +499,11 @@ def build_graph(module): op_to_graph_node = {} module_op = OpHandler(module.operation) - graph.nodes.append(module_op.make_graph_node(name_dict)) + module_attrs = module_op.get_attributes() + module_attrs = dict((attr.key, attr.value) for attr in module_attrs) + # Add module attributes to the graph as "namespace attributes" + group_node_attrs = {} + group_node_attrs[module_op.get_namespace()] = module_attrs for op in module.body.operations: append_later = [] @@ -567,5 +591,5 @@ def build_graph(module): ) ) output_connections[source_node.id] += 1 - + graph.groupNodeAttributes = group_node_attrs return graph diff --git a/tools/explorer/tt_adapter/src/tt_adapter/runner.py b/tools/explorer/tt_adapter/src/tt_adapter/runner.py index 65da2ec2b7..205944acd4 100644 --- a/tools/explorer/tt_adapter/src/tt_adapter/runner.py +++ b/tools/explorer/tt_adapter/src/tt_adapter/runner.py @@ -9,8 +9,9 @@ # os.environ["TTRT_LOGGER_LEVEL"] = "ERROR" from ttrt import API as ttrt import ttmlir.passes -from . import utils +from . import utils, mlir import pandas as pd +from model_explorer import node_data_builder class ModelRunner: @@ -69,6 +70,9 @@ def run( module = utils.parse_mlir_file(model_path) + # Collect unique locations + name_dict = mlir.get_locs(module) + try: print("Running MLIR compile: TTIR to TTNN Backend Pipeline") print("With options: ", options_string) @@ -131,8 +135,34 @@ def run( "DEVICE FW DURATION [ns]", "CORE COUNT", "OUTPUT_0_MEMORY", + "LOC", ] perf = perf[columns] print(perf) - print("Total device duration: ", perf["DEVICE FW DURATION [ns]"].sum(), "ns") + print(f"Total device duration: {perf['DEVICE FW DURATION [ns]'].sum()}ns") + + # Create the node_data type here + timing_data = list(zip(perf["LOC"], perf["DEVICE FW DURATION [ns]"])) + results = {} + for loc, duration in timing_data: + loc = mlir.get_loc_str(loc).replace("'", '"') + if loc in name_dict: + for i in range(name_dict[loc]): + results[f"{loc}__{i}"] = node_data_builder.NodeDataResult( + value=duration + ) + else: + print( + f"Location {loc} not found in graph, ops data for this op was not reported." 
+ ) + + gradient = [ + node_data_builder.GradientItem(stop=0, bgColor="yellow"), + node_data_builder.GradientItem(stop=1, bgColor="red"), + ] + + data = node_data_builder.GraphNodeData(results=results, gradient=gradient) + + res = node_data_builder.ModelNodeData(graphsData={"tt-graph": data}) + return res diff --git a/tools/explorer/tt_adapter/src/tt_adapter/utils.py b/tools/explorer/tt_adapter/src/tt_adapter/utils.py index bca7e640b4..4b404a204b 100644 --- a/tools/explorer/tt_adapter/src/tt_adapter/utils.py +++ b/tools/explorer/tt_adapter/src/tt_adapter/utils.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 import ttmlir +from dataclasses import make_dataclass def parse_mlir_file(model_path): @@ -11,3 +12,11 @@ def parse_mlir_file(model_path): ttmlir.dialects.ttnn.register_dialect(ctx) module = ttmlir.ir.Module.parse(model_file.read(), ctx) return module + + +def to_dataclass(obj: dict, dc_name: str = "tempClass"): + return make_dataclass(dc_name, ((k, type(v)) for k, v in obj.items()))(**obj) + + +def to_adapter_format(obj: dict): + return {"graphs": [to_dataclass(obj)]} From 824b25611fce3faa5973a483743175b18d723743 Mon Sep 17 00:00:00 2001 From: Andrej Jakovljevic Date: Wed, 4 Dec 2024 16:26:10 +0100 Subject: [PATCH 53/84] Fixing reshape op so it supports reshaping of scalars (#1322) --- .github/workflows/build-and-test.yml | 2 +- include/ttmlir/Dialect/TTIR/IR/TTIROps.td | 2 ++ .../StableHLOToTTIR/StableHLOToTTIRPatterns.cpp | 4 +++- lib/Dialect/TTIR/IR/TTIROps.cpp | 9 +++++++++ .../Dialect/TTNN/reshape/reshape_folding_test.mlir | 12 ++++++++++++ 5 files changed, 27 insertions(+), 2 deletions(-) create mode 100644 test/ttmlir/Dialect/TTNN/reshape/reshape_folding_test.mlir diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index c54d734b23..62cfdd9455 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -215,7 +215,7 @@ jobs: run-tests: - timeout-minutes: 30 + timeout-minutes: 45 needs: - build-image - build-ttmlir diff --git a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td index 53976e56a5..3c53a156a4 100644 --- a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td +++ b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td @@ -884,6 +884,8 @@ def TTIR_ReshapeOp: TTIR_DPSOp<"reshape"> { }]; let hasVerifier = 1; + + let hasFolder = 1; } def TTIR_SliceOp: TTIR_DPSOp<"slice"> { diff --git a/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp b/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp index 4b08f7b6e1..96ef7ca017 100644 --- a/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp +++ b/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp @@ -121,7 +121,9 @@ class StableHLOToTTIRReduceOpConversionPattern srcOp.getLoc(), outputType.getShape(), outputType.getElementType()); mlir::ArrayAttr dimArg = rewriter.getArrayAttr(SmallVector( - 1, rewriter.getI32IntegerAttr(adaptor.getDimensionsAttr()[0]))); + 1, rewriter.getI32IntegerAttr(adaptor.getDimensionsAttr().size() > 0 + ? 
adaptor.getDimensionsAttr()[0] + : 1))); // If someone changes definition of TTIR_ReductionOp this constant will // become outdated, but I currently see no way to get this info (without diff --git a/lib/Dialect/TTIR/IR/TTIROps.cpp b/lib/Dialect/TTIR/IR/TTIROps.cpp index a3a6dd586c..bc1f02868a 100644 --- a/lib/Dialect/TTIR/IR/TTIROps.cpp +++ b/lib/Dialect/TTIR/IR/TTIROps.cpp @@ -389,6 +389,15 @@ ::mlir::LogicalResult mlir::tt::ttir::ReshapeOp::verify() { return success(); } +// ReshapeOp folder +::mlir::OpFoldResult mlir::tt::ttir::ReshapeOp::fold(FoldAdaptor adaptor) { + + if (getType() == getOperand(0).getType()) { + return getOperand(0); + } + return nullptr; +} + //===----------------------------------------------------------------------===// // SliceOp //===----------------------------------------------------------------------===// diff --git a/test/ttmlir/Dialect/TTNN/reshape/reshape_folding_test.mlir b/test/ttmlir/Dialect/TTNN/reshape/reshape_folding_test.mlir new file mode 100644 index 0000000000..c7f4442f0b --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/reshape/reshape_folding_test.mlir @@ -0,0 +1,12 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline %s| FileCheck %s +#any_device_tile = #tt.operand_constraint +// Tests if we fold when translating from "ttir.reshape" which is called on the two same shapes. +module @reshape_test { + func.func @main(%arg0: tensor<1xi32>) -> (tensor<1xi32> {jax.result_info = ""}) { + %0 = tensor.empty() : tensor<1xi32> + %1 = "ttir.reshape"(%arg0, %0) <{operand_constraints = [#any_device_tile, #any_device_tile], shape = [1 : i32]}> : (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> + // CHECK-NOT: %[[C:.*]] = "ttnn.reshape"[C:.*]] + // CHECK: return %arg0 : tensor<1xi32, #{{.*}}> + return %1 : tensor<1xi32> + } +} From 7f6046e21dd5f77b1c1bf7355ac76390056e881c Mon Sep 17 00:00:00 2001 From: Lewis Panos Date: Wed, 4 Dec 2024 13:36:35 -0500 Subject: [PATCH 54/84] Mark all TTIR and TTNN ops as pure. Create tests to ensure dead code is erased (#1481) --- include/ttmlir/Dialect/TTIR/IR/TTIRBase.td | 3 +- include/ttmlir/Dialect/TTNN/IR/TTNNBase.td | 3 +- include/ttmlir/Dialect/TTNN/IR/TTNNOps.td | 4 +- .../TTIR/test_remove_dead_values_pass.mlir | 22 ++++++ .../TTNN/test_remove_dead_values_pass.mlir | 77 +++++++++++++++++++ 5 files changed, 104 insertions(+), 5 deletions(-) create mode 100644 test/ttmlir/Dialect/TTIR/test_remove_dead_values_pass.mlir create mode 100644 test/ttmlir/Dialect/TTNN/test_remove_dead_values_pass.mlir diff --git a/include/ttmlir/Dialect/TTIR/IR/TTIRBase.td b/include/ttmlir/Dialect/TTIR/IR/TTIRBase.td index 57f3dc37d3..b71541b8dc 100644 --- a/include/ttmlir/Dialect/TTIR/IR/TTIRBase.td +++ b/include/ttmlir/Dialect/TTIR/IR/TTIRBase.td @@ -6,6 +6,7 @@ #define TTMLIR_TTMLIR_DIALECT_TTIR_TTIRDIALECT_TD include "mlir/IR/OpBase.td" +include "mlir/Interfaces/SideEffectInterfaces.td" //===----------------------------------------------------------------------===// // TTIR dialect definition. 
@@ -38,6 +39,6 @@ def TTIR_Dialect : Dialect { //===----------------------------------------------------------------------===// class TTIR_Op traits = []> : - Op; + Op; #endif diff --git a/include/ttmlir/Dialect/TTNN/IR/TTNNBase.td b/include/ttmlir/Dialect/TTNN/IR/TTNNBase.td index 34d3daf9cc..f6f764d01a 100644 --- a/include/ttmlir/Dialect/TTNN/IR/TTNNBase.td +++ b/include/ttmlir/Dialect/TTNN/IR/TTNNBase.td @@ -6,6 +6,7 @@ #define TTMLIR_TTMLIR_DIALECT_TTNN_TTNNDIALECT_TD include "mlir/IR/OpBase.td" +include "mlir/Interfaces/SideEffectInterfaces.td" include "ttmlir/Dialect/TTNN/IR/TTNNOpModelInterface.td" include "ttmlir/Dialect/TTNN/IR/TTNNWorkaroundInterface.td" @@ -45,6 +46,6 @@ def TTNN_Dialect : Dialect { //===----------------------------------------------------------------------===// class TTNN_Op traits = []> : - Op; + Op; #endif diff --git a/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td b/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td index d918691611..41eb9dff75 100644 --- a/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td +++ b/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td @@ -794,9 +794,7 @@ def TTNN_ClampOp : TTNN_Op<"clamp"> { let hasVerifier = 1; } -// Note: NoMemoryEffect is used to indicate that operation can be removed if it is not used. -// Removal of this operation is done by the dead code elimination pass (RemoveDeadValuesPass). -def TTNN_EmptyOp : TTNN_Op<"empty", [NoMemoryEffect]> { +def TTNN_EmptyOp : TTNN_Op<"empty"> { let summary = "Empty op."; let description = [{ Tensor empty operation diff --git a/test/ttmlir/Dialect/TTIR/test_remove_dead_values_pass.mlir b/test/ttmlir/Dialect/TTIR/test_remove_dead_values_pass.mlir new file mode 100644 index 0000000000..8b6df4d0f2 --- /dev/null +++ b/test/ttmlir/Dialect/TTIR/test_remove_dead_values_pass.mlir @@ -0,0 +1,22 @@ +// RUN: ttmlir-opt --remove-dead-values %s | FileCheck %s +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> { + %0 = tensor.empty() : tensor<64x128xf32> + // CHECK: %[[C:.*]] = "ttir.multiply"[[C:.*]] + %1 = "ttir.multiply"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> + %2 = tensor.empty() : tensor<64x128xf32> + // CHECK-NOT: %[[C:.*]] = "ttir.add"[[C:.*]] + %3 = "ttir.add"(%arg0, %arg1, %2) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> + %4 = tensor.empty() : tensor<64x128xf32> + // CHECK-NOT: %[[C:.*]] = "ttir.subtract"[[C:.*]] + %5 = "ttir.subtract"(%arg0, %arg1, %4) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> + %6 = tensor.empty() : tensor<64x128xf32> + // CHECK-NOT: %[[C:.*]] = "ttir.div"[[C:.*]] + %7 = "ttir.div"(%arg0, %arg1, %6) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> + %8 = tensor.empty() : tensor<64x128xf32> + // CHECK-NOT: %[[C:.*]] = "ttir.eq"[[C:.*]] + %9 = "ttir.eq"(%arg0, %arg1, %8) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>, tensor<64x128xf32>) -> 
tensor<64x128xf32> + return %1 : tensor<64x128xf32> + } +} diff --git a/test/ttmlir/Dialect/TTNN/test_remove_dead_values_pass.mlir b/test/ttmlir/Dialect/TTNN/test_remove_dead_values_pass.mlir new file mode 100644 index 0000000000..f3231730f5 --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/test_remove_dead_values_pass.mlir @@ -0,0 +1,77 @@ +// RUN: ttmlir-opt --remove-dead-values %s | FileCheck %s +#device = #tt.device (0, d0, d1)>, l1Map = (d0, d1)[s0, s1] -> (0, d0 floordiv s0, d1 floordiv s1, (d0 mod s0) * s1 + d1 mod s1), dramMap = (d0, d1)[s0, s1] -> (0, 0, ((((d0 floordiv s0) * 8 + d1 floordiv s1) * (s1 * s0) + (d0 mod s0) * s1 + d1 mod s1) floordiv 8192) mod 12, (((d0 floordiv s0) * 8 + d1 floordiv s1) * (s1 * s0) + (d0 mod s0) * s1 + d1 mod s1) floordiv 98304 + (((d0 floordiv s0) * 8 + d1 floordiv s1) * (s1 * s0) + (d0 mod s0) * s1 + d1 mod s1) mod 8192), meshShape = , chipIds = [0]> +#dram = #ttnn.buffer_type +#system_desc = #tt.system_desc<[{role = host, target_triple = "x86_64-pc-linux-gnu"}], [{arch = , grid = 8x8, l1_size = 1499136, num_dram_channels = 12, dram_channel_size = 1073741824, noc_l1_address_align_bytes = 16, pcie_address_align_bytes = 32, noc_dram_address_align_bytes = 32, l1_unreserved_base = 1024, erisc_l1_unreserved_base = 1024, dram_unreserved_base = 1024, dram_unreserved_end = 1073741824, physical_cores = {worker = [ 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 1x0, 1x1, 1x2, 1x3, 1x4, 1x5, 1x6, 1x7, 2x0, 2x1, 2x2, 2x3, 2x4, 2x5, 2x6, 2x7, 3x0, 3x1, 3x2, 3x3, 3x4, 3x5, 3x6, 3x7, 4x0, 4x1, 4x2, 4x3, 4x4, 4x5, 4x6, 4x7, 5x0, 5x1, 5x2, 5x3, 5x4, 5x5, 5x6, 5x7, 6x0, 6x1, 6x2, 6x3, 6x4, 6x5, 6x6, 6x7, 7x0, 7x1, 7x2, 7x3, 7x4, 7x5, 7x6, 7x7] dram = [ 8x0, 9x0, 10x0, 8x1, 9x1, 10x1, 8x2, 9x2, 10x2, 8x3, 9x3, 10x3]}, supported_data_types = [, , , , , , , , , , , ], supported_tile_sizes = [ 4x16, 16x16, 32x16, 4x32, 16x32, 32x32], num_cbs = 32}], [0], [3 : i32], [ 0x0x0x0]> +#system_memory = #ttnn.buffer_type +#ttnn_layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xf32, #system_memory>> +#ttnn_layout1 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, f32>, #dram>, interleaved> +#ttnn_layout2 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xf32, #dram>, interleaved> +module attributes {tt.device = #device, tt.system_desc = #system_desc} { + func.func @forward(%arg0: tensor<64x128xf32, #ttnn_layout>, %arg1: tensor<64x128xf32, #ttnn_layout>) -> tensor<64x128xf32, #ttnn_layout> { + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0) <{layout = #ttnn.layout}> : (tensor<64x128xf32, #ttnn_layout>) -> tensor<64x128xf32, #ttnn_layout1> + %2 = "ttnn.to_device"(%1, %0) <{memory_config = #ttnn.memory_config<, #dram, <<2x4>>>}> : (tensor<64x128xf32, #ttnn_layout1>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> + "ttnn.deallocate"(%1) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () + %3 = "ttnn.to_layout"(%arg1) <{layout = #ttnn.layout}> : (tensor<64x128xf32, #ttnn_layout>) -> tensor<64x128xf32, #ttnn_layout1> + %4 = "ttnn.to_device"(%3, %0) <{memory_config = #ttnn.memory_config<, #dram, <<2x4>>>}> : (tensor<64x128xf32, #ttnn_layout1>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> + "ttnn.deallocate"(%3) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () + %5 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, #dram, <<64x128>>>, shape = #ttnn.shape<64x128>}> : 
(!tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout2> + // CHECK: %[[C:.*]] = "ttnn.multiply"[[C:.*]] + %6 = "ttnn.multiply"(%2, %4, %5) <{operandSegmentSizes = array}> : (tensor<64x128xf32, #ttnn_layout1>, tensor<64x128xf32, #ttnn_layout1>, tensor<64x128xf32, #ttnn_layout2>) -> tensor<64x128xf32, #ttnn_layout2> + "ttnn.deallocate"(%4) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () + "ttnn.deallocate"(%2) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () + %7 = "ttnn.to_layout"(%arg0) <{layout = #ttnn.layout}> : (tensor<64x128xf32, #ttnn_layout>) -> tensor<64x128xf32, #ttnn_layout1> + %8 = "ttnn.to_device"(%7, %0) <{memory_config = #ttnn.memory_config<, #dram, <<2x4>>>}> : (tensor<64x128xf32, #ttnn_layout1>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> + "ttnn.deallocate"(%7) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () + %9 = "ttnn.to_layout"(%arg1) <{layout = #ttnn.layout}> : (tensor<64x128xf32, #ttnn_layout>) -> tensor<64x128xf32, #ttnn_layout1> + %10 = "ttnn.to_device"(%9, %0) <{memory_config = #ttnn.memory_config<, #dram, <<2x4>>>}> : (tensor<64x128xf32, #ttnn_layout1>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> + "ttnn.deallocate"(%9) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () + %11 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, #dram, <<64x128>>>, shape = #ttnn.shape<64x128>}> : (!tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout2> + // CHECK-NOT: %[[C:.*]] = "ttnn.add"[[C:.*]] + %12 = "ttnn.add"(%8, %10, %11) <{operandSegmentSizes = array}> : (tensor<64x128xf32, #ttnn_layout1>, tensor<64x128xf32, #ttnn_layout1>, tensor<64x128xf32, #ttnn_layout2>) -> tensor<64x128xf32, #ttnn_layout2> + "ttnn.deallocate"(%11) <{force = false}> : (tensor<64x128xf32, #ttnn_layout2>) -> () + "ttnn.deallocate"(%10) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () + "ttnn.deallocate"(%8) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () + %13 = "ttnn.to_layout"(%arg0) <{layout = #ttnn.layout}> : (tensor<64x128xf32, #ttnn_layout>) -> tensor<64x128xf32, #ttnn_layout1> + %14 = "ttnn.to_device"(%13, %0) <{memory_config = #ttnn.memory_config<, #dram, <<2x4>>>}> : (tensor<64x128xf32, #ttnn_layout1>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> + "ttnn.deallocate"(%13) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () + %15 = "ttnn.to_layout"(%arg1) <{layout = #ttnn.layout}> : (tensor<64x128xf32, #ttnn_layout>) -> tensor<64x128xf32, #ttnn_layout1> + %16 = "ttnn.to_device"(%15, %0) <{memory_config = #ttnn.memory_config<, #dram, <<2x4>>>}> : (tensor<64x128xf32, #ttnn_layout1>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> + "ttnn.deallocate"(%15) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () + %17 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, #dram, <<64x128>>>, shape = #ttnn.shape<64x128>}> : (!tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout2> + // CHECK-NOT: %[[C:.*]] = "ttnn.subtract"[[C:.*]] + %18 = "ttnn.subtract"(%14, %16, %17) <{operandSegmentSizes = array}> : (tensor<64x128xf32, #ttnn_layout1>, tensor<64x128xf32, #ttnn_layout1>, tensor<64x128xf32, #ttnn_layout2>) -> tensor<64x128xf32, #ttnn_layout2> + "ttnn.deallocate"(%17) <{force = false}> : (tensor<64x128xf32, #ttnn_layout2>) -> () + "ttnn.deallocate"(%16) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () + 
"ttnn.deallocate"(%14) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () + %19 = "ttnn.to_layout"(%arg0) <{layout = #ttnn.layout}> : (tensor<64x128xf32, #ttnn_layout>) -> tensor<64x128xf32, #ttnn_layout1> + %20 = "ttnn.to_device"(%19, %0) <{memory_config = #ttnn.memory_config<, #dram, <<2x4>>>}> : (tensor<64x128xf32, #ttnn_layout1>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> + "ttnn.deallocate"(%19) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () + %21 = "ttnn.to_layout"(%arg1) <{layout = #ttnn.layout}> : (tensor<64x128xf32, #ttnn_layout>) -> tensor<64x128xf32, #ttnn_layout1> + %22 = "ttnn.to_device"(%21, %0) <{memory_config = #ttnn.memory_config<, #dram, <<2x4>>>}> : (tensor<64x128xf32, #ttnn_layout1>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> + "ttnn.deallocate"(%21) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () + %23 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, #dram, <<64x128>>>, shape = #ttnn.shape<64x128>}> : (!tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout2> + // CHECK-NOT: %[[C:.*]] = "ttnn.div"[[C:.*]] + %24 = "ttnn.div"(%20, %22, %23) <{operandSegmentSizes = array}> : (tensor<64x128xf32, #ttnn_layout1>, tensor<64x128xf32, #ttnn_layout1>, tensor<64x128xf32, #ttnn_layout2>) -> tensor<64x128xf32, #ttnn_layout2> + "ttnn.deallocate"(%23) <{force = false}> : (tensor<64x128xf32, #ttnn_layout2>) -> () + "ttnn.deallocate"(%22) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () + "ttnn.deallocate"(%20) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () + %25 = "ttnn.to_layout"(%arg0) <{layout = #ttnn.layout}> : (tensor<64x128xf32, #ttnn_layout>) -> tensor<64x128xf32, #ttnn_layout1> + %26 = "ttnn.to_device"(%25, %0) <{memory_config = #ttnn.memory_config<, #dram, <<2x4>>>}> : (tensor<64x128xf32, #ttnn_layout1>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> + "ttnn.deallocate"(%25) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () + %27 = "ttnn.to_layout"(%arg1) <{layout = #ttnn.layout}> : (tensor<64x128xf32, #ttnn_layout>) -> tensor<64x128xf32, #ttnn_layout1> + %28 = "ttnn.to_device"(%27, %0) <{memory_config = #ttnn.memory_config<, #dram, <<2x4>>>}> : (tensor<64x128xf32, #ttnn_layout1>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> + "ttnn.deallocate"(%27) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () + %29 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, #dram, <<64x128>>>, shape = #ttnn.shape<64x128>}> : (!tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout2> + // CHECK-NOT: %[[C:.*]] = "ttnn.eq"[[C:.*]] + %30 = "ttnn.eq"(%26, %28, %29) <{operandSegmentSizes = array}> : (tensor<64x128xf32, #ttnn_layout1>, tensor<64x128xf32, #ttnn_layout1>, tensor<64x128xf32, #ttnn_layout2>) -> tensor<64x128xf32, #ttnn_layout2> + "ttnn.deallocate"(%29) <{force = false}> : (tensor<64x128xf32, #ttnn_layout2>) -> () + "ttnn.deallocate"(%28) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () + "ttnn.deallocate"(%26) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () + %31 = "ttnn.from_device"(%6) : (tensor<64x128xf32, #ttnn_layout2>) -> tensor<64x128xf32, #ttnn_layout> + "ttnn.deallocate"(%5) <{force = false}> : (tensor<64x128xf32, #ttnn_layout2>) -> () + %32 = "ttnn.to_layout"(%31) <{layout = #ttnn.layout}> : (tensor<64x128xf32, #ttnn_layout>) -> tensor<64x128xf32, #ttnn_layout> + 
"ttnn.deallocate"(%31) <{force = false}> : (tensor<64x128xf32, #ttnn_layout>) -> () + return %32 : tensor<64x128xf32, #ttnn_layout> + } +} From d837ac74cfab079beddda9eacb0935614d1cabfc Mon Sep 17 00:00:00 2001 From: Vlad Roubtsov Date: Wed, 4 Dec 2024 14:29:46 -0600 Subject: [PATCH 55/84] add copy_tile_init() needed before eltwise max operand copies (#1495) --- lib/Conversion/TTIRToTTMetal/TTIRToTTMetal.cpp | 3 ++- test/ttmlir/Silicon/TTMetal/simple_max.mlir | 13 +++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) create mode 100644 test/ttmlir/Silicon/TTMetal/simple_max.mlir diff --git a/lib/Conversion/TTIRToTTMetal/TTIRToTTMetal.cpp b/lib/Conversion/TTIRToTTMetal/TTIRToTTMetal.cpp index 09727e2034..00220d7e44 100644 --- a/lib/Conversion/TTIRToTTMetal/TTIRToTTMetal.cpp +++ b/lib/Conversion/TTIRToTTMetal/TTIRToTTMetal.cpp @@ -1007,11 +1007,12 @@ class TTIRToTTMetalDispatchRewriter : public OpRewritePattern { builder.create(location); { // copy inCB0[inCB0TileIndex] and inCB1[inCB1TileIndex] to DST: + builder.create(location); builder.create(location, inCB0, inCB0TileIndex, dstLhsTileIndex); builder.create(location, inCB1, inCB1TileIndex, dstRhsTileIndex); - // SFPU ooperates on DST tiles: + // SFPU operates on DST tiles: builder.create(location, dstLhsTileIndex, dstRhsTileIndex); } diff --git a/test/ttmlir/Silicon/TTMetal/simple_max.mlir b/test/ttmlir/Silicon/TTMetal/simple_max.mlir new file mode 100644 index 0000000000..92bdbe72c7 --- /dev/null +++ b/test/ttmlir/Silicon/TTMetal/simple_max.mlir @@ -0,0 +1,13 @@ +// RUN: ttmlir-opt --ttir-to-ttmetal-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir +// RUN: FileCheck %s --input-file=%t.mlir +// RUN: ttmlir-translate --ttmetal-to-flatbuffer %t.mlir > %t.ttm + +#any_device = #tt.operand_constraint + +func.func @maximum(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> { + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] + %0 = tensor.empty() : tensor<64x128xf32> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] + %1 = "ttir.maximum"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> + return %1 : tensor<64x128xf32> +} From 94054730da4927bf4276a9e57c231354d7faed97 Mon Sep 17 00:00:00 2001 From: Jackson Nie Date: Wed, 4 Dec 2024 16:22:34 -0500 Subject: [PATCH 56/84] Remove workarounds for empty and full op (#1485) --- .../include/tt/runtime/detail/workarounds.h | 34 ++----------- runtime/lib/common/workarounds.cpp | 8 +-- .../lib/ttnn/operations/creation/empty.cpp | 5 -- runtime/lib/ttnn/operations/creation/full.cpp | 13 ----- runtime/test/python/ttnn/test_runtime_api.py | 2 +- runtime/tools/python/test/test_run.py | 51 ------------------- runtime/tools/python/ttrt/common/run.py | 24 --------- 7 files changed, 7 insertions(+), 130 deletions(-) diff --git a/runtime/include/tt/runtime/detail/workarounds.h b/runtime/include/tt/runtime/detail/workarounds.h index 38d8c08cf3..c0d3b62a1c 100644 --- a/runtime/include/tt/runtime/detail/workarounds.h +++ b/runtime/include/tt/runtime/detail/workarounds.h @@ -15,29 +15,14 @@ struct Env { #else constexpr static Env #endif - get(bool ignoreTileShape = true, bool emptyOpForceRowMajor = true, - bool fullOpForceRowMajor = true, bool maxpool2dPreshard = true, - bool swapBinaryOperands = true) + get(bool maxpool2dPreshard = true, bool swapBinaryOperands = true) #if defined(TT_RUNTIME_WORKAROUNDS) && 
TT_RUNTIME_WORKAROUNDS == 1 ; #else { - return Env(true, true, true, true, true); + return Env(true, true); } #endif - // TODO(bug #272), determine correct layout by tile shape in the future - // currently tile shape is not set correctly, so as a workaround, hardcode - // layout - bool ignoreTileShape; - - // TODO(bug #582): ttnn::empty doesn't work properly with tile layout, - // using ROW_MAJOR until we fix it - bool emptyOpForceRowMajor; - - // TODO(bug #582): ttnn::full doesn't work properly with tile layout, - // using ROW_MAJOR until we fix it - bool fullOpForceRowMajor; - // TODO(bug #855): Ideally we should have an op that preshards for maxpool2d // instead of adding a method in runtime bool maxpool2dPreshard; @@ -48,24 +33,13 @@ struct Env { bool swapBinaryOperands; private: - constexpr Env(bool ignoreTileShape, bool emptyOpForceRowMajor, - bool fullOpForceRowMajor, bool maxpool2dPreshard, - bool swapBinaryOperands) - : ignoreTileShape(ignoreTileShape), - emptyOpForceRowMajor(emptyOpForceRowMajor), - fullOpForceRowMajor(fullOpForceRowMajor), - maxpool2dPreshard(maxpool2dPreshard), + constexpr Env(bool maxpool2dPreshard, bool swapBinaryOperands) + : maxpool2dPreshard(maxpool2dPreshard), swapBinaryOperands(swapBinaryOperands) {} }; inline std::ostream &operator<<(std::ostream &os, const Env &env) { os << "workaround::Env{\n"; - os << "\t" - << "ignoreTileShape: " << env.ignoreTileShape << ",\n"; - os << "\t" - << "emptyOpForceRowMajor: " << env.emptyOpForceRowMajor << ",\n"; - os << "\t" - << "fullOpForceRowMajor: " << env.fullOpForceRowMajor << ",\n"; os << "\t" << "maxpool2dPreshard: " << env.maxpool2dPreshard << ",\n"; os << "\t" diff --git a/runtime/lib/common/workarounds.cpp b/runtime/lib/common/workarounds.cpp index cd2795d023..aeeb16c651 100644 --- a/runtime/lib/common/workarounds.cpp +++ b/runtime/lib/common/workarounds.cpp @@ -6,12 +6,8 @@ namespace tt::runtime::workaround { #if defined(TT_RUNTIME_WORKAROUNDS) && TT_RUNTIME_WORKAROUNDS == 1 -const Env &Env::get(bool ignoreTileShape, bool emptyOpForceRowMajor, - bool fullOpForceRowMajor, bool maxpool2dPreshard, - bool swapBinaryOperands) { - static const Env config(ignoreTileShape, emptyOpForceRowMajor, - fullOpForceRowMajor, maxpool2dPreshard, - swapBinaryOperands); +const Env &Env::get(bool maxpool2dPreshard, bool swapBinaryOperands) { + static const Env config(maxpool2dPreshard, swapBinaryOperands); return config; } #endif diff --git a/runtime/lib/ttnn/operations/creation/empty.cpp b/runtime/lib/ttnn/operations/creation/empty.cpp index d504a798b2..85eacef23d 100644 --- a/runtime/lib/ttnn/operations/creation/empty.cpp +++ b/runtime/lib/ttnn/operations/creation/empty.cpp @@ -24,11 +24,6 @@ struct EmptyTensorConfig { dtype(::tt::runtime::ttnn::operations::utils::getDataType(op->out())), numShards(op->num_shards()), strategy(op->strategy()) { layout = ::tt::runtime::ttnn::utils::toTTNNLayout(op->layout()); - // TODO(bug #582): ttnn::empty doesn't work properly with tile layout, - // using ROW_MAJOR until we fix it - if (workaround::Env::get().emptyOpForceRowMajor) { - layout = ::ttnn::Layout::ROW_MAJOR; - } if (op->device()) { LOG_ASSERT(op->memcfg(), "Memory config must be provided when device is provided"); diff --git a/runtime/lib/ttnn/operations/creation/full.cpp b/runtime/lib/ttnn/operations/creation/full.cpp index 7f6a6c0b6f..b8536e0a86 100644 --- a/runtime/lib/ttnn/operations/creation/full.cpp +++ b/runtime/lib/ttnn/operations/creation/full.cpp @@ -28,19 +28,6 @@ struct FullTensorConfig { layout = 
::tt::runtime::ttnn::utils::inferLayoutFromTileShape(op->out()); - // TODO(bug #272), determine correct layout by tile shape in the future - // currently tile shape is not set correctly, so as a workaround, hardcode - // layout - if (workaround::Env::get().ignoreTileShape) { - layout = ::ttnn::Layout::TILE; - } - - // TODO(bug #582): ttnn::empty doesn't work properly with tile layout, - // using ROW_MAJOR until we fix it - if (workaround::Env::get().fullOpForceRowMajor) { - layout = ::ttnn::Layout::ROW_MAJOR; - } - if (!utils::inSystemMemory(op->out())) { memoryConfig = ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); } diff --git a/runtime/test/python/ttnn/test_runtime_api.py b/runtime/test/python/ttnn/test_runtime_api.py index fe914d0c9a..d88232fa29 100644 --- a/runtime/test/python/ttnn/test_runtime_api.py +++ b/runtime/test/python/ttnn/test_runtime_api.py @@ -156,5 +156,5 @@ def test_runtime_stitching_eltwise_binary_op_chain(helper: Helper, request): golden = ( (inputs_torch[0] + inputs_torch[1]).mul(inputs_torch[1]).sub(inputs_torch[1]) ) - assert_pcc(golden, torch_result_tensor, threshold=0.999), program_index + assert_pcc(golden, torch_result_tensor, threshold=0.99) helper.teardown() diff --git a/runtime/tools/python/test/test_run.py b/runtime/tools/python/test/test_run.py index 69d5683aaf..37167a6e93 100644 --- a/runtime/tools/python/test/test_run.py +++ b/runtime/tools/python/test/test_run.py @@ -311,57 +311,6 @@ def test_enable_async_ttnn_cmd_run(): sub_process_command(command) -def test_disable_ignore_tile_shape_run(): - API.initialize_apis() - custom_args = {} - custom_args[ - "--result-file" - ] = f"ttrt-results/{inspect.currentframe().f_code.co_name}.json" - custom_args["binary"] = BINARY_FILE_PATH - custom_args["--disable-ignore-tile-shape"] = True - run_instance = API.Run(args=custom_args) - run_instance() - - -def test_disable_ignore_tile_shape_cmd_run(): - command = f"ttrt run {BINARY_FILE_PATH} --disable-ignore-tile-shape --log-file ttrt-results/{inspect.currentframe().f_code.co_name}.log --result-file ttrt-results/{inspect.currentframe().f_code.co_name}.json" - sub_process_command(command) - - -def test_disable_empty_op_row_major_run(): - API.initialize_apis() - custom_args = {} - custom_args[ - "--result-file" - ] = f"ttrt-results/{inspect.currentframe().f_code.co_name}.json" - custom_args["binary"] = BINARY_FILE_PATH - custom_args["--disable-empty-op-row-major"] = True - run_instance = API.Run(args=custom_args) - run_instance() - - -def test_disable_empty_op_row_major_cmd_run(): - command = f"ttrt run {BINARY_FILE_PATH} --disable-empty-op-row-major --log-file ttrt-results/{inspect.currentframe().f_code.co_name}.log --result-file ttrt-results/{inspect.currentframe().f_code.co_name}.json" - sub_process_command(command) - - -def test_disable_full_op_row_major_run(): - API.initialize_apis() - custom_args = {} - custom_args[ - "--result-file" - ] = f"ttrt-results/{inspect.currentframe().f_code.co_name}.json" - custom_args["binary"] = BINARY_FILE_PATH - custom_args["--disable-full-op-row-major"] = True - run_instance = API.Run(args=custom_args) - run_instance() - - -def test_disable_full_op_row_major_cmd_run(): - command = f"ttrt run {BINARY_FILE_PATH} --disable-full-op-row-major --log-file ttrt-results/{inspect.currentframe().f_code.co_name}.log --result-file ttrt-results/{inspect.currentframe().f_code.co_name}.json" - sub_process_command(command) - - def test_disable_maxpool2d_preshard_run(): API.initialize_apis() custom_args = {} diff --git 
a/runtime/tools/python/ttrt/common/run.py b/runtime/tools/python/ttrt/common/run.py index be9711587f..5dc93a50d9 100644 --- a/runtime/tools/python/ttrt/common/run.py +++ b/runtime/tools/python/ttrt/common/run.py @@ -124,27 +124,6 @@ def initialize_api(): choices=[True, False], help="enable async mode device execution for TTNN runtime", ) - Run.register_arg( - name="--disable-ignore-tile-shape", - type=bool, - default=False, - choices=[True, False], - help="disable ignore tile shape workaround", - ) - Run.register_arg( - name="--disable-empty-op-row-major", - type=bool, - default=False, - choices=[True, False], - help="disable empty op force row major workaround", - ) - Run.register_arg( - name="--disable-full-op-row-major", - type=bool, - default=False, - choices=[True, False], - help="disable full op force row major workaround", - ) Run.register_arg( name="--disable-maxpool2d-preshard", type=bool, @@ -370,9 +349,6 @@ def _execute(binaries): ) self.logging.debug(f"setting tt runtime debug env={debug_env}") workaround_env = ttrt.runtime.WorkaroundEnv.get( - not self["--disable-ignore-tile-shape"], - not self["--disable-empty-op-row-major"], - not self["--disable-full-op-row-major"], not self["--disable-maxpool2d-preshard"], not self["--disable-swap-binary-operands"], ) From b5bc1034f8de0bcd860cb344d8eca56c36b0e7a4 Mon Sep 17 00:00:00 2001 From: Vladimir Milosevic <157983820+vmilosevic@users.noreply.github.com> Date: Thu, 5 Dec 2024 06:08:23 +0100 Subject: [PATCH 57/84] Uplift third_party/tt-metal to dde5614e816b20110b0bb8c96aeefe5a0a7bada1 2024-12-04 (#1493) * Uplift third_party/tt-metal to dde5614e816b20110b0bb8c96aeefe5a0a7bada1 2024-12-04 * Add back includes for system_desc that were removed in tt-metal to access definitions: DRAM_ALIGNMENT, eth_l1_mem, L1_ALIGNMENT, PCIE_ALIGNMENT * Cherry pick tt-metal fix to solve hangs in tt-explorer job --------- Co-authored-by: kmitrovicTT <169657397+kmitrovicTT@users.noreply.github.com> Co-authored-by: Brata Choudhury Co-authored-by: Kyle Mabee --- runtime/lib/common/system_desc.cpp | 2 ++ third_party/CMakeLists.txt | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/runtime/lib/common/system_desc.cpp b/runtime/lib/common/system_desc.cpp index 3b4685901a..cf0c6196d7 100644 --- a/runtime/lib/common/system_desc.cpp +++ b/runtime/lib/common/system_desc.cpp @@ -12,8 +12,10 @@ #define FMT_HEADER_ONLY #include "distributed/mesh_device.hpp" +#include "eth_l1_address_map.h" #include "host_api.hpp" #include "hostdevcommon/common_values.hpp" +#include "noc/noc_parameters.h" namespace tt::runtime::system_desc { static ::tt::target::Dim2d toFlatbuffer(const CoreCoord &coreCoord) { diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt index 4c814fdc8d..dac58a37fe 100644 --- a/third_party/CMakeLists.txt +++ b/third_party/CMakeLists.txt @@ -1,6 +1,6 @@ include(ExternalProject) -set(TT_METAL_VERSION "6a524ab817aeb09c273e37254f39ad8124ddf2f8") +set(TT_METAL_VERSION "28e140381c515c053308c97cb952eba2176abf20") if ("$ENV{ARCH_NAME}" STREQUAL "grayskull") set(ARCH_NAME "grayskull") From 051009b948004024c875cf0d9b94522acbf3c210 Mon Sep 17 00:00:00 2001 From: Sterling Taylor <166402033+staylorTT@users.noreply.github.com> Date: Thu, 5 Dec 2024 07:25:19 -0600 Subject: [PATCH 58/84] Adding contributing guidelines for the project (#1459) * Adding contributing guidelines for the project * Updates based on review * Fixing whitespace * fix whitespace * Update CONTRIBUTING.md removing more whitespace --- CONTRIBUTING.md | 22 ++++++++++++++++++++++ 
1 file changed, 22 insertions(+) create mode 100644 CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000000..e39ed6a281 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,22 @@ +# Contributing guidelines for TT-Forge + +## PR Guidelines +### Community contributions +Thank you for your interest in the TT-Forge project we appreciate your support. +For all PRs we have an internal policy listed below which your PR will go through after an initial review has been done. + +The initial review will encompase the following: +* Review the PR for CI / CD Readiness. Includes making sure that the code and PR at a high level makes sense for the project +* Once approved for CI / CD readiness a Tenstorrent developer will kick off our CI/CD pipeline on your behalf. + +### Internal contributions +For internal contributions we have the following guidelines: + +* A 24 hour merge rule exists. The rule is to wait at least 24 hours since the PR was initially opened for review. This gives members of our teams that span the globe opportunity to provide feedback to PRs. + +In addition to the 24 hour rule the following prerequisites for landing PR exist: +* At least 1 reviewer signs off on the change +* Component owner sign offs (github will tell you if this hasn't been met) +* Green CI +* Wait at least 24 hours since opening the PR to give all tagged reviewers a chance to take a look. Or at least comment on the issue that they need more time to review. + * *Rebasing or further changes to the PR do not reset the 24 hour counter.* From 0a172a2dd91bfde72af9171ce4892bdb0053a21a Mon Sep 17 00:00:00 2001 From: Usman Aziz Date: Thu, 5 Dec 2024 19:47:47 +0500 Subject: [PATCH 59/84] Add DEVICE_LIBRARY dependency to SharedLib. (#1494) --- lib/SharedLib/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/SharedLib/CMakeLists.txt b/lib/SharedLib/CMakeLists.txt index e743cb0b06..7f32c8aa66 100644 --- a/lib/SharedLib/CMakeLists.txt +++ b/lib/SharedLib/CMakeLists.txt @@ -2,7 +2,7 @@ set(TTNN_RUNTIME_LIBS TTRuntime TTRuntimeTTNN TTBinary) # Dependency libs from tt-metal/ttnn project for ttnn runtime -set(TTNN_LIBS TTMETAL_LIBRARY TTNN_LIBRARY) +set(TTNN_LIBS TTMETAL_LIBRARY DEVICE_LIBRARY TTNN_LIBRARY) if (TT_RUNTIME_ENABLE_PERF_TRACE) list(APPEND TTNN_LIBS TRACY_LIBRARY) endif() From 0a3366795b02a58dd0901a2613fba3a8ee94af39 Mon Sep 17 00:00:00 2001 From: Lewis Panos Date: Thu, 5 Dec 2024 12:30:42 -0500 Subject: [PATCH 60/84] Bringup cache updates on ttir level, add silicon test (#1437) Model memory effects of cache fill/update ops Create TTNN_InplaceOp with MemWrite trait --- include/ttmlir/Dialect/TTIR/IR/TTIROps.td | 41 ++++++ include/ttmlir/Dialect/TTNN/IR/TTNNBase.td | 3 + include/ttmlir/Dialect/TTNN/IR/TTNNOps.td | 27 ++++ include/ttmlir/Target/TTNN/program.fbs | 15 +++ lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp | 90 +++++++++++-- lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp | 7 + lib/Dialect/TTIR/IR/TTIROps.cpp | 121 ++++++++++++++++++ lib/Dialect/TTNN/IR/TTNNOps.cpp | 121 ++++++++++++++++++ lib/Target/TTNN/TTNNToFlatbuffer.cpp | 32 +++++ runtime/include/tt/runtime/detail/ttnn.h | 4 + .../include/tt/runtime/detail/workarounds.h | 23 +++- runtime/lib/common/workarounds.cpp | 6 +- runtime/lib/ttnn/operations/CMakeLists.txt | 2 + .../lib/ttnn/operations/creation/arange.cpp | 3 +- .../ttnn/operations/kv_cache/fill_cache.cpp | 16 +++ .../lib/ttnn/operations/kv_cache/fill_cache.h | 15 +++ .../ttnn/operations/kv_cache/update_cache.cpp | 35 +++++ 
.../ttnn/operations/kv_cache/update_cache.h | 15 +++ runtime/lib/ttnn/program.cpp | 8 ++ runtime/tools/python/ttrt/common/run.py | 8 ++ test/ttmlir/Dialect/TTNN/simple_constant.mlir | 10 +- .../StableHLO/Constant/constant_bf16.mlir | 4 +- .../StableHLO/Constant/constant_bool.mlir | 4 +- .../StableHLO/Constant/constant_f32.mlir | 4 +- .../StableHLO/Constant/constant_f64.mlir | 4 +- .../StableHLO/Constant/constant_i16.mlir | 4 +- .../StableHLO/Constant/constant_i32.mlir | 4 +- .../StableHLO/Constant/constant_i64.mlir | 4 +- .../Silicon/TTNN/kv_cache/fill_cache.mlir | 14 ++ .../Silicon/TTNN/kv_cache/update_cache.mlir | 15 +++ test/ttmlir/Silicon/TTNN/simple_constant.mlir | 6 +- 31 files changed, 626 insertions(+), 39 deletions(-) create mode 100644 runtime/lib/ttnn/operations/kv_cache/fill_cache.cpp create mode 100644 runtime/lib/ttnn/operations/kv_cache/fill_cache.h create mode 100644 runtime/lib/ttnn/operations/kv_cache/update_cache.cpp create mode 100644 runtime/lib/ttnn/operations/kv_cache/update_cache.h create mode 100644 test/ttmlir/Silicon/TTNN/kv_cache/fill_cache.mlir create mode 100644 test/ttmlir/Silicon/TTNN/kv_cache/update_cache.mlir diff --git a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td index 3c53a156a4..69510f93a4 100644 --- a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td +++ b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td @@ -707,6 +707,47 @@ def TTIR_ConcatOp : TTIR_DPSOp<"concat"> { let hasVerifier = 1; } +def TTIR_UpdateCacheOp : TTIR_DPSOp<"update_cache"> { + let summary = "Update static cache tensor."; + let description = [{ + Updates the `cache` tensor in-place with values from `input` at `update_index` and `batch_offset`. + }]; + + let arguments = (ins AnyRankedTensor:$cache, + AnyRankedTensor:$input, + AnyRankedTensor:$update_index, + I32Attr:$batch_offset, + TT_OperandConstraintArrayAttr:$operand_constraints); + + let results = (outs AnyRankedTensor:$result); + + let extraClassDeclaration = [{ + MutableOperandRange getDpsInitsMutable() { return getCacheMutable(); } + }]; + + let hasVerifier = 1; +} + +def TTIR_FillCacheOp : TTIR_DPSOp<"fill_cache"> { + let summary = "Fill static cache tensor."; + let description = [{ + Fills the `cache` tensor in-place with values from `input` at `batch_offset`. 
+ }]; + + let arguments = (ins AnyRankedTensor:$cache, + AnyRankedTensor:$input, + I32Attr:$batch_offset, + TT_OperandConstraintArrayAttr:$operand_constraints); + + let results = (outs AnyRankedTensor:$result); + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + MutableOperandRange getDpsInitsMutable() { return getCacheMutable(); } + }]; +} + def TTIR_BroadcastOp : TTIR_DPSOp<"broadcast"> { let summary = "Broadcast operation."; let description = [{ diff --git a/include/ttmlir/Dialect/TTNN/IR/TTNNBase.td b/include/ttmlir/Dialect/TTNN/IR/TTNNBase.td index f6f764d01a..ea77d6795b 100644 --- a/include/ttmlir/Dialect/TTNN/IR/TTNNBase.td +++ b/include/ttmlir/Dialect/TTNN/IR/TTNNBase.td @@ -48,4 +48,7 @@ def TTNN_Dialect : Dialect { class TTNN_Op traits = []> : Op; +class TTNN_InplaceOp traits = []> : + Op, TTNN_OpModelInterface, TTNN_WorkaroundInterface])>; + #endif diff --git a/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td b/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td index 41eb9dff75..ed914cb555 100644 --- a/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td +++ b/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td @@ -577,6 +577,33 @@ def TTNN_EmbeddingOp : TTNN_NamedDPSOp<"embedding"> { let hasVerifier = 1; } +def TTNN_UpdateCacheOp : TTNN_InplaceOp<"update_cache"> { + let summary = "Update static cache tensor."; + let description = [{ + Updates the `cache` tensor in-place with values from `input` at `update_index` and `batch_offset`. + }]; + + let arguments = (ins Arg:$cache, + AnyRankedTensor:$input, + AnyRankedTensor:$update_index, + I32Attr:$batch_offset); + + let hasVerifier = 1; +} + +def TTNN_FillCacheOp : TTNN_InplaceOp<"fill_cache"> { + let summary = "Fill static cache tensor."; + let description = [{ + Fills the `cache` tensor in-place with values from `input` at `batch_offset`. 
+ }]; + + let arguments = (ins Arg:$cache, + AnyRankedTensor:$input, + I32Attr:$batch_offset); + + let hasVerifier = 1; +} + def TTNN_SoftmaxOp : TTNN_Op<"softmax"> { let summary = "Softmax op."; let description = [{ diff --git a/include/ttmlir/Target/TTNN/program.fbs b/include/ttmlir/Target/TTNN/program.fbs index e8d349a495..5644c970d8 100644 --- a/include/ttmlir/Target/TTNN/program.fbs +++ b/include/ttmlir/Target/TTNN/program.fbs @@ -37,6 +37,19 @@ table ToDeviceOp { out: tt.target.TensorRef; } +table UpdateCacheOp { + cache: tt.target.TensorRef; + input: tt.target.TensorRef; + update_index: tt.target.TensorRef; + batch_offset: uint32; +} + +table FillCacheOp { + cache: tt.target.TensorRef; + input: tt.target.TensorRef; + batch_offset: uint32; +} + table FromDeviceOp { in: tt.target.TensorRef; out: tt.target.TensorRef; @@ -283,6 +296,8 @@ union OpType { DeallocateOp, AllGatherOp, ArangeOp, + UpdateCacheOp, + FillCacheOp, } table Operation { diff --git a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp index 4fff567c1a..b7404ec4d1 100644 --- a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp +++ b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp @@ -23,6 +23,7 @@ #include "mlir/Transforms/DialectConversion.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" +#include using namespace mlir; using namespace mlir::tt; @@ -334,6 +335,78 @@ class ClampOpConversionPattern : public OpConversionPattern { } }; +class UpdateCacheOpConversionPattern + : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(ttir::UpdateCacheOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + // The TTIR version of this op is pure. In TTNN this op is in-place. + // We need to replace uses of the result ot the TTIR op with uses + // of the cache argument. + // + // The presence of the MemWrite trait of this op should preserve + // the order of this op relative to the cache arguments uses, preserving + // program correctness. + + // This op can only work if it is the final use of the cache tensor in the + // order of execution. For now, checking that there is only one user (this + // op) of the cache tensor will suffice. + std::vector users(op.getCache().getUsers().begin(), + op.getCache().getUsers().end()); + if (users.size() != 1) { + return rewriter.notifyMatchFailure( + op, "UpdateCacheOp must have exactly one user"); + } + + rewriter.create( + op.getLoc(), adaptor.getCache(), adaptor.getInput(), + adaptor.getUpdateIndex(), adaptor.getBatchOffset()); + + rewriter.replaceOp(op, adaptor.getCache()); + return success(); + } +}; + +class FillCacheOpConversionPattern + : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(ttir::FillCacheOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + // The TTIR version of this op is pure. In TTNN this op is in-place. + // We need to replace uses of the result ot the TTIR op with uses + // of the cache argument. + // + // The presence of the MemWrite trait of this op should preserve + // the order of this op relative to the cache arguments uses, preserving + // program correctness. + + // This op can only work if it is the final use of the cache tensor in the + // order of execution. For now, checking that there is only one user (this + // op) of the cache tensor will suffice. 
+ std::vector users(op.getCache().getUsers().begin(), + op.getCache().getUsers().end()); + if (users.size() != 1) { + return rewriter.notifyMatchFailure( + op, "FillCacheOp must have exactly one user"); + } + + rewriter.create(op.getLoc(), adaptor.getCache(), + adaptor.getInput(), + adaptor.getBatchOffset()); + + rewriter.replaceOp(op, adaptor.getCache()); + return success(); + } +}; + template class ElementwiseUnaryWithFloatParameterOpConversionPattern @@ -506,15 +579,12 @@ class ConstantOpConversionPattern valueAttr.getElementType().isInteger() ? getIntegerValue(valueAttr) : valueAttr.getSplatValue().convertToFloat(); - if (fillValue == 0) { - rewriter.replaceOpWithNewOp( - op, this->getTypeConverter()->convertType(op.getType()), device); - } else { - ::mlir::FloatAttr fillValueAttr = rewriter.getF32FloatAttr(fillValue); - rewriter.replaceOpWithNewOp( - op, this->getTypeConverter()->convertType(op.getType()), device, - fillValueAttr); - } + + ::mlir::FloatAttr fillValueAttr = rewriter.getF32FloatAttr(fillValue); + rewriter.replaceOpWithNewOp( + op, this->getTypeConverter()->convertType(op.getType()), device, + fillValueAttr); + } else { return rewriter.notifyMatchFailure( op, "TTNN doesn't currently support tensor creation from multiple " @@ -980,6 +1050,8 @@ void populateTTIRToTTNNPatterns(MLIRContext *ctx, RewritePatternSet &patterns, SubtractOpConversionPattern, AllGatherOpConversionPattern, ArangeOpConversionPattern, + UpdateCacheOpConversionPattern, + FillCacheOpConversionPattern, ScatterOpConversionPattern >(typeConverter, ctx); // ANCHOR_END: op_rewriter_pattern_set diff --git a/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp b/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp index ff16ed0b17..dc3fc1cde8 100644 --- a/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp +++ b/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp @@ -757,6 +757,13 @@ void populateTTNNToEmitCPatterns(mlir::MLIRContext *ctx, // Module op // patterns.add(typeConverter, ctx); + + // KV Cache ops + // + patterns.add>(typeConverter, + ctx); + patterns.add>(typeConverter, + ctx); } } // namespace mlir::tt diff --git a/lib/Dialect/TTIR/IR/TTIROps.cpp b/lib/Dialect/TTIR/IR/TTIROps.cpp index bc1f02868a..1e9ae04afc 100644 --- a/lib/Dialect/TTIR/IR/TTIROps.cpp +++ b/lib/Dialect/TTIR/IR/TTIROps.cpp @@ -1394,6 +1394,127 @@ ::mlir::LogicalResult mlir::tt::ttir::ScatterOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// UpdateCacheOp +//===----------------------------------------------------------------------===// + +::mlir::LogicalResult mlir::tt::ttir::UpdateCacheOp::verify() { + if (getBatchOffset() != 0) { + return emitOpError( + "Only single-batch is supported. Batch offset must be 0"); + } + + const ::mlir::RankedTensorType cacheType = getCache().getType(); + const ::mlir::RankedTensorType inputType = getInput().getType(); + + const DataType cacheDataType = + elementTypeToDataType(cacheType.getElementType()); + const DataType inputDataType = + elementTypeToDataType(inputType.getElementType()); + + if (cacheDataType != inputDataType) { + return emitOpError( + "Cache and input tensors must have the same dtype. 
" + "Got cache dtype = " + + DataTypeEnumToString(cacheDataType) + + ", input dtype = " + DataTypeEnumToString(inputDataType)); + } + + if (cacheType.getRank() != 4) { + return emitOpError("Cache tensor must be a 4D tensor"); + } + + if (inputType.getRank() != 4) { + return emitOpError("Input tensor must be a 4D tensor"); + } + + if (inputType.getShape()[2] != 1) { + return emitOpError("Input tensor requires that dim 2 have size 1, got " + "input dim 2 size = " + + std::to_string(inputType.getShape()[2])); + } + + if (cacheType.getShape()[0] != inputType.getShape()[0] || + cacheType.getShape()[1] != inputType.getShape()[1] || + cacheType.getShape()[3] != inputType.getShape()[3]) { + return emitOpError("Cache tensor shape must match input tensor shape on " + "all dimensions except dim 2. Got cache shape (" + + std::to_string(cacheType.getShape()[0]) + ", " + + std::to_string(cacheType.getShape()[1]) + ", " + + std::to_string(cacheType.getShape()[2]) + ", " + + std::to_string(cacheType.getShape()[3]) + + "), input shape ()" + + std::to_string(inputType.getShape()[0]) + "x" + + std::to_string(inputType.getShape()[1]) + "x" + + std::to_string(inputType.getShape()[2]) + "x" + + std::to_string(inputType.getShape()[3]) + ")"); + } + + return success(); +} + +//===----------------------------------------------------------------------===// +// FillCacheOp +//===----------------------------------------------------------------------===// + +::mlir::LogicalResult mlir::tt::ttir::FillCacheOp::verify() { + if (getBatchOffset() != 0) { + return emitOpError( + "Only single-batch is supported. Batch offset must be 0"); + } + + const ::mlir::RankedTensorType cacheType = getCache().getType(); + const ::mlir::RankedTensorType inputType = getInput().getType(); + + const DataType cacheDataType = + elementTypeToDataType(cacheType.getElementType()); + const DataType inputDataType = + elementTypeToDataType(inputType.getElementType()); + + if (cacheDataType != inputDataType) { + return emitOpError( + "Cache and input tensors must have the same dtype. " + "Got cache dtype = " + + DataTypeEnumToString(cacheDataType) + + ", input dtype = " + DataTypeEnumToString(inputDataType)); + } + + if (cacheType.getRank() != 4) { + return emitOpError("Cache tensor must be a 4D tensor"); + } + + if (inputType.getRank() != 4) { + return emitOpError("Input tensor must be a 4D tensor"); + } + + if (inputType.getShape()[2] > cacheType.getShape()[2]) { + return emitOpError( + "Input tensor requires that dim 2 have a size which is less than or " + "equal to the size of dim 2 of the cache tensor. Got cache dim 2 size " + "= " + + std::to_string(cacheType.getShape()[2]) + + ", input dim 2 size = " + std::to_string(inputType.getShape()[2])); + } + + if (cacheType.getShape()[0] != inputType.getShape()[0] || + cacheType.getShape()[1] != inputType.getShape()[1] || + cacheType.getShape()[3] != inputType.getShape()[3]) { + return emitOpError("Cache tensor shape must match input tensor shape on " + "all dimensions except dim 2. 
Got cache shape (" + + std::to_string(cacheType.getShape()[0]) + ", " + + std::to_string(cacheType.getShape()[1]) + ", " + + std::to_string(cacheType.getShape()[2]) + ", " + + std::to_string(cacheType.getShape()[3]) + + "), input shape (" + + std::to_string(inputType.getShape()[0]) + ", " + + std::to_string(inputType.getShape()[1]) + ", " + + std::to_string(inputType.getShape()[2]) + ", " + + std::to_string(inputType.getShape()[3]) + ")"); + } + + return success(); +} + //===----------------------------------------------------------------------===// // GenericOp //===----------------------------------------------------------------------===// diff --git a/lib/Dialect/TTNN/IR/TTNNOps.cpp b/lib/Dialect/TTNN/IR/TTNNOps.cpp index 8e41368cbb..94f2507dc9 100644 --- a/lib/Dialect/TTNN/IR/TTNNOps.cpp +++ b/lib/Dialect/TTNN/IR/TTNNOps.cpp @@ -974,4 +974,125 @@ ::mlir::LogicalResult ReduceScatterOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// UpdateCacheOp +//===----------------------------------------------------------------------===// + +::mlir::LogicalResult UpdateCacheOp::verify() { + if (getBatchOffset() != 0) { + return emitOpError( + "Only single-batch is supported. Batch offset must be 0"); + } + + const ::mlir::RankedTensorType cacheType = getCache().getType(); + const ::mlir::RankedTensorType inputType = getInput().getType(); + + const DataType cacheDataType = + elementTypeToDataType(cacheType.getElementType()); + const DataType inputDataType = + elementTypeToDataType(inputType.getElementType()); + + if (cacheDataType != inputDataType) { + return emitOpError( + "Cache and input tensors must have the same dtype. " + "Got cache dtype = " + + DataTypeEnumToString(cacheDataType) + + ", input dtype = " + DataTypeEnumToString(inputDataType)); + } + + if (cacheType.getRank() != 4) { + return emitOpError("Cache tensor must be a 4D tensor"); + } + + if (inputType.getRank() != 4) { + return emitOpError("Input tensor must be a 4D tensor"); + } + + if (inputType.getShape()[2] != 1) { + return emitOpError("Input tensor requires that dim 2 have size 1, got " + "input dim 2 size = " + + std::to_string(inputType.getShape()[2])); + } + + if (cacheType.getShape()[0] != inputType.getShape()[0] || + cacheType.getShape()[1] != inputType.getShape()[1] || + cacheType.getShape()[3] != inputType.getShape()[3]) { + return emitOpError("Cache tensor shape must match input tensor shape on " + "all dimensions except dim 2. Got cache shape (" + + std::to_string(cacheType.getShape()[0]) + ", " + + std::to_string(cacheType.getShape()[1]) + ", " + + std::to_string(cacheType.getShape()[2]) + ", " + + std::to_string(cacheType.getShape()[3]) + + "), input shape ()" + + std::to_string(inputType.getShape()[0]) + "x" + + std::to_string(inputType.getShape()[1]) + "x" + + std::to_string(inputType.getShape()[2]) + "x" + + std::to_string(inputType.getShape()[3]) + ")"); + } + + return success(); +} + +//===----------------------------------------------------------------------===// +// FillCacheOp +//===----------------------------------------------------------------------===// + +::mlir::LogicalResult FillCacheOp::verify() { + if (getBatchOffset() != 0) { + return emitOpError( + "Only single-batch is supported. 
Batch offset must be 0"); + } + + const ::mlir::RankedTensorType cacheType = getCache().getType(); + const ::mlir::RankedTensorType inputType = getInput().getType(); + + const DataType cacheDataType = + elementTypeToDataType(cacheType.getElementType()); + const DataType inputDataType = + elementTypeToDataType(inputType.getElementType()); + + if (cacheDataType != inputDataType) { + return emitOpError( + "Cache and input tensors must have the same dtype. " + "Got cache dtype = " + + DataTypeEnumToString(cacheDataType) + + ", input dtype = " + DataTypeEnumToString(inputDataType)); + } + + if (cacheType.getRank() != 4) { + return emitOpError("Cache tensor must be a 4D tensor"); + } + + if (inputType.getRank() != 4) { + return emitOpError("Input tensor must be a 4D tensor"); + } + + if (inputType.getShape()[2] > cacheType.getShape()[2]) { + return emitOpError( + "Input tensor requires that dim 2 have a size which is less than or " + "equal to the size of dim 2 of the cache tensor. Got cache dim 2 size " + "= " + + std::to_string(cacheType.getShape()[2]) + + ", input dim 2 size = " + std::to_string(inputType.getShape()[2])); + } + + if (cacheType.getShape()[0] != inputType.getShape()[0] || + cacheType.getShape()[1] != inputType.getShape()[1] || + cacheType.getShape()[3] != inputType.getShape()[3]) { + return emitOpError("Cache tensor shape must match input tensor shape on " + "all dimensions except dim 2. Got cache shape (" + + std::to_string(cacheType.getShape()[0]) + ", " + + std::to_string(cacheType.getShape()[1]) + ", " + + std::to_string(cacheType.getShape()[2]) + ", " + + std::to_string(cacheType.getShape()[3]) + + "), input shape (" + + std::to_string(inputType.getShape()[0]) + ", " + + std::to_string(inputType.getShape()[1]) + ", " + + std::to_string(inputType.getShape()[2]) + ", " + + std::to_string(inputType.getShape()[3]) + ")"); + } + + return success(); +} + } // namespace mlir::tt::ttnn diff --git a/lib/Target/TTNN/TTNNToFlatbuffer.cpp b/lib/Target/TTNN/TTNNToFlatbuffer.cpp index 65c5b1d5ce..827901516d 100644 --- a/lib/Target/TTNN/TTNNToFlatbuffer.cpp +++ b/lib/Target/TTNN/TTNNToFlatbuffer.cpp @@ -436,6 +436,30 @@ createEltwiseOpParams(FlatbufferObjectCache &cache, EltwiseOp op) { } } +::flatbuffers::Offset<::tt::target::ttnn::UpdateCacheOp> +createOp(FlatbufferObjectCache &cache, UpdateCacheOp op) { + auto cacheOperand = + cache.at<::tt::target::TensorRef>(getOperandThroughDPSOps(op.getCache())); + auto input = + cache.at<::tt::target::TensorRef>(getOperandThroughDPSOps(op.getInput())); + auto updateIndex = cache.at<::tt::target::TensorRef>( + getOperandThroughDPSOps(op.getUpdateIndex())); + + return ::tt::target::ttnn::CreateUpdateCacheOp( + *cache.fbb, cacheOperand, input, updateIndex, op.getBatchOffset()); +} + +::flatbuffers::Offset<::tt::target::ttnn::FillCacheOp> +createOp(FlatbufferObjectCache &cache, FillCacheOp op) { + auto cacheOperand = + cache.at<::tt::target::TensorRef>(getOperandThroughDPSOps(op.getCache())); + auto input = + cache.at<::tt::target::TensorRef>(getOperandThroughDPSOps(op.getInput())); + + return ::tt::target::ttnn::CreateFillCacheOp(*cache.fbb, cacheOperand, input, + op.getBatchOffset()); +} + template ::flatbuffers::Offset<::tt::target::ttnn::EltwiseOp> createNonDPSEltwiseOp(FlatbufferObjectCache &cache, EltwiseOp op) { @@ -971,6 +995,14 @@ emitTTNNOperation(FlatbufferObjectCache &cache, Operation *op, return createOperation(cache, createEltwiseOp(cache, tanhOp), debugString, locInfo); } + if (auto updateCacheOp = dyn_cast(op); updateCacheOp) { + 
return createOperation(cache, createOp(cache, updateCacheOp), debugString, + locInfo); + } + if (auto fillCacheOp = dyn_cast(op); fillCacheOp) { + return createOperation(cache, createOp(cache, fillCacheOp), debugString, + locInfo); + } llvm_unreachable("unhandled op in emitTTNNOperation"); } diff --git a/runtime/include/tt/runtime/detail/ttnn.h b/runtime/include/tt/runtime/detail/ttnn.h index 31b979e139..e57300162a 100644 --- a/runtime/include/tt/runtime/detail/ttnn.h +++ b/runtime/include/tt/runtime/detail/ttnn.h @@ -15,6 +15,7 @@ #include "ttnn/operations/copy.hpp" #include "ttnn/operations/core/core.hpp" #include "ttnn/operations/creation.hpp" +#include "ttnn/operations/data_movement/clone/clone.hpp" #include "ttnn/operations/data_movement/concat/concat.hpp" #include "ttnn/operations/data_movement/permute/permute.hpp" #include "ttnn/operations/data_movement/transpose/transpose.hpp" @@ -23,11 +24,14 @@ #include "ttnn/operations/eltwise/ternary/where.hpp" #include "ttnn/operations/eltwise/unary/unary.hpp" #include "ttnn/operations/embedding/embedding.hpp" +#include "ttnn/operations/kv_cache/kv_cache.hpp" #include "ttnn/operations/matmul/matmul.hpp" #include "ttnn/operations/normalization/softmax/softmax.hpp" #include "ttnn/operations/pool/generic/generic_pools.hpp" #include "ttnn/operations/reduction/generic/generic_reductions.hpp" #include "ttnn/tensor/host_buffer/functions.hpp" +#include "ttnn/tensor/host_buffer/owned_buffer.hpp" +#include "ttnn/tensor/shape/shape.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/tensor/types.hpp" diff --git a/runtime/include/tt/runtime/detail/workarounds.h b/runtime/include/tt/runtime/detail/workarounds.h index c0d3b62a1c..a586757522 100644 --- a/runtime/include/tt/runtime/detail/workarounds.h +++ b/runtime/include/tt/runtime/detail/workarounds.h @@ -15,12 +15,13 @@ struct Env { #else constexpr static Env #endif - get(bool maxpool2dPreshard = true, bool swapBinaryOperands = true) + get(bool maxpool2dPreshard = true, bool swapBinaryOperands = true, + bool readUpdateIndexFromDeviceForKVCache = true) #if defined(TT_RUNTIME_WORKAROUNDS) && TT_RUNTIME_WORKAROUNDS == 1 ; #else { - return Env(true, true); + return Env(true, true, true); } #endif // TODO(bug #855): Ideally we should have an op that preshards for maxpool2d @@ -32,10 +33,19 @@ struct Env { // rhs operand). We should add this check in the compiler. bool swapBinaryOperands; + // TODO(bug #1510) ttnn::update_cache takes a single update index as a uint32 + // as a function argument. The tt-torch frontend and likely others model this + // as a tensor with integer elements. For now, to get this op to work we need + // to be able to pluck this update index from a runtime tensor. 
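[Editor's note, not part of the patch: the comment above explains why the runtime reads the KV-cache update index back from a device tensor. The new workaround is opt-out; a minimal sketch of how a runtime embedder might turn it off through the runtime API, assuming a TT_RUNTIME_WORKAROUNDS=1 build in which the first call to Env::get fixes the configuration (the variable name below is illustrative):

    #include "tt/runtime/detail/workarounds.h"

    // Keep the existing workarounds but opt out of reading the kv-cache
    // update index from the device. Note that with this set to false the
    // runtime currently aborts in update_cache (see the LOG_FATAL added
    // later in this patch), so the default of true is the practical choice.
    const auto &env = tt::runtime::workaround::Env::get(
        /*maxpool2dPreshard=*/true,
        /*swapBinaryOperands=*/true,
        /*readUpdateIndexFromDeviceForKVCache=*/false);

ttrt exposes the same switch as the --disable-read-update-index-for-kv-cache flag added to run.py further down in this patch.]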
+ bool readUpdateIndexFromDeviceForKVCache; + private: - constexpr Env(bool maxpool2dPreshard, bool swapBinaryOperands) + constexpr Env(bool maxpool2dPreshard, bool swapBinaryOperands, + bool readUpdateIndexFromDeviceForKVCache) : maxpool2dPreshard(maxpool2dPreshard), - swapBinaryOperands(swapBinaryOperands) {} + swapBinaryOperands(swapBinaryOperands), + readUpdateIndexFromDeviceForKVCache( + readUpdateIndexFromDeviceForKVCache) {} }; inline std::ostream &operator<<(std::ostream &os, const Env &env) { @@ -43,7 +53,10 @@ inline std::ostream &operator<<(std::ostream &os, const Env &env) { os << "\t" << "maxpool2dPreshard: " << env.maxpool2dPreshard << ",\n"; os << "\t" - << "swapBinaryOperands: " << env.swapBinaryOperands << "\n"; + << "swapBinaryOperands: " << env.swapBinaryOperands << ",\n"; + os << "\t" + << "readUpdateIndexFromDeviceForKVCache: " + << env.readUpdateIndexFromDeviceForKVCache << "\n"; os << "}"; return os; } diff --git a/runtime/lib/common/workarounds.cpp b/runtime/lib/common/workarounds.cpp index aeeb16c651..a9dbf7564a 100644 --- a/runtime/lib/common/workarounds.cpp +++ b/runtime/lib/common/workarounds.cpp @@ -6,8 +6,10 @@ namespace tt::runtime::workaround { #if defined(TT_RUNTIME_WORKAROUNDS) && TT_RUNTIME_WORKAROUNDS == 1 -const Env &Env::get(bool maxpool2dPreshard, bool swapBinaryOperands) { - static const Env config(maxpool2dPreshard, swapBinaryOperands); +const Env &Env::get(bool maxpool2dPreshard, bool swapBinaryOperands, + bool readUpdateIndexFromDeviceForKVCache) { + static const Env config(maxpool2dPreshard, swapBinaryOperands, + readUpdateIndexFromDeviceForKVCache); return config; } #endif diff --git a/runtime/lib/ttnn/operations/CMakeLists.txt b/runtime/lib/ttnn/operations/CMakeLists.txt index 4d18e3f1ce..d7d9357b5f 100644 --- a/runtime/lib/ttnn/operations/CMakeLists.txt +++ b/runtime/lib/ttnn/operations/CMakeLists.txt @@ -19,6 +19,8 @@ set(TTNN_OPS_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/eltwise/unary/unary_composite.cpp ${CMAKE_CURRENT_SOURCE_DIR}/eltwise/ternary/ternary.cpp ${CMAKE_CURRENT_SOURCE_DIR}/embedding/embedding.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/kv_cache/fill_cache.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/kv_cache/update_cache.cpp ${CMAKE_CURRENT_SOURCE_DIR}/layout/to_device.cpp ${CMAKE_CURRENT_SOURCE_DIR}/layout/from_device.cpp ${CMAKE_CURRENT_SOURCE_DIR}/layout/to_layout.cpp diff --git a/runtime/lib/ttnn/operations/creation/arange.cpp b/runtime/lib/ttnn/operations/creation/arange.cpp index 8ddb199136..f51937462a 100644 --- a/runtime/lib/ttnn/operations/creation/arange.cpp +++ b/runtime/lib/ttnn/operations/creation/arange.cpp @@ -6,8 +6,9 @@ #include "tt/runtime/detail/logger.h" #include "tt/runtime/ttnn/operations/utils.h" #include "tt/runtime/ttnn/utils.h" +#include "ttnn/types.hpp" + #include -#include #include namespace tt::runtime::ttnn::operations::creation { diff --git a/runtime/lib/ttnn/operations/kv_cache/fill_cache.cpp b/runtime/lib/ttnn/operations/kv_cache/fill_cache.cpp new file mode 100644 index 0000000000..89022f64a1 --- /dev/null +++ b/runtime/lib/ttnn/operations/kv_cache/fill_cache.cpp @@ -0,0 +1,16 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "fill_cache.h" + +namespace tt::runtime::ttnn::operations::kv_cache { +void run(const ::tt::target::ttnn::FillCacheOp *op, ProgramContext &context) { + + ProgramTensorPool &tensorPool = context.getTensorPool(); + const ::ttnn::Tensor &cache = tensorPool.at(op->cache()->global_id()); + const ::ttnn::Tensor &input = 
tensorPool.at(op->input()->global_id()); + + ::ttnn::fill_cache(cache, input, op->batch_offset()); +} +} // namespace tt::runtime::ttnn::operations::kv_cache diff --git a/runtime/lib/ttnn/operations/kv_cache/fill_cache.h b/runtime/lib/ttnn/operations/kv_cache/fill_cache.h new file mode 100644 index 0000000000..4187cb604b --- /dev/null +++ b/runtime/lib/ttnn/operations/kv_cache/fill_cache.h @@ -0,0 +1,15 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef RUNTIME_LIB_TTNN_OPERATIONS_FILL_CACHE_H +#define RUNTIME_LIB_TTNN_OPERATIONS_FILL_CACHE_H + +#include "tt/runtime/ttnn/types.h" +#include "ttmlir/Target/TTNN/program_generated.h" + +namespace tt::runtime::ttnn::operations::kv_cache { +void run(const ::tt::target::ttnn::FillCacheOp *op, ProgramContext &context); +} // namespace tt::runtime::ttnn::operations::kv_cache + +#endif diff --git a/runtime/lib/ttnn/operations/kv_cache/update_cache.cpp b/runtime/lib/ttnn/operations/kv_cache/update_cache.cpp new file mode 100644 index 0000000000..fae1da40c6 --- /dev/null +++ b/runtime/lib/ttnn/operations/kv_cache/update_cache.cpp @@ -0,0 +1,35 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "update_cache.h" + +#include "tt/runtime/detail/logger.h" +#include "tt/runtime/detail/workarounds.h" + +namespace tt::runtime::ttnn::operations::kv_cache { +void run(const ::tt::target::ttnn::UpdateCacheOp *op, ProgramContext &context) { + + ProgramTensorPool &tensorPool = context.getTensorPool(); + + const ::ttnn::Tensor &cache = tensorPool.at(op->cache()->global_id()); + const ::ttnn::Tensor &input = tensorPool.at(op->input()->global_id()); + const ::ttnn::Tensor &updateIndex = + tensorPool.at(op->update_index()->global_id()); + if (workaround::Env::get().readUpdateIndexFromDeviceForKVCache) { + + const ::ttnn::Tensor indexOnHost = ::ttnn::from_device(updateIndex); + const auto storage = indexOnHost.get_storage(); + const auto ownedStorage = std::get(storage); + const auto buffer = ownedStorage.get_buffer(); + const auto buf = std::get>(buffer); + uint32_t upIdx = *buf.begin(); + + ::ttnn::update_cache(cache, input, upIdx, op->batch_offset(), std::nullopt); + } else { + LOG_FATAL("Currently, the only way to execute ttnn::update_cache is to use " + "the workaround enabled by the flag " + "\"readUpdateIndexFromDeviceForKVCache\""); + } +} +} // namespace tt::runtime::ttnn::operations::kv_cache diff --git a/runtime/lib/ttnn/operations/kv_cache/update_cache.h b/runtime/lib/ttnn/operations/kv_cache/update_cache.h new file mode 100644 index 0000000000..1c4115f1eb --- /dev/null +++ b/runtime/lib/ttnn/operations/kv_cache/update_cache.h @@ -0,0 +1,15 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef RUNTIME_LIB_TTNN_OPERATIONS_UPDATE_CACHE_H +#define RUNTIME_LIB_TTNN_OPERATIONS_UPDATE_CACHE_H + +#include "tt/runtime/ttnn/types.h" +#include "ttmlir/Target/TTNN/program_generated.h" + +namespace tt::runtime::ttnn::operations::kv_cache { +void run(const ::tt::target::ttnn::UpdateCacheOp *op, ProgramContext &context); +} // namespace tt::runtime::ttnn::operations::kv_cache + +#endif diff --git a/runtime/lib/ttnn/program.cpp b/runtime/lib/ttnn/program.cpp index a45c2de9a0..f38bfe83ce 100644 --- a/runtime/lib/ttnn/program.cpp +++ b/runtime/lib/ttnn/program.cpp @@ -18,6 +18,8 @@ #include "operations/eltwise/unary/unary.h" #include "operations/eltwise/unary/unary_composite.h" #include 
"operations/embedding/embedding.h" +#include "operations/kv_cache/fill_cache.h" +#include "operations/kv_cache/update_cache.h" #include "operations/layout/from_device.h" #include "operations/layout/to_device.h" #include "operations/layout/to_layout.h" @@ -212,6 +214,12 @@ void ProgramExecutor::runOperation(const ::tt::target::ttnn::Operation *op) { case ::tt::target::ttnn::OpType::ArangeOp: { return operations::creation::run(op->type_as_ArangeOp(), context); } + case ::tt::target::ttnn::OpType::UpdateCacheOp: { + return operations::kv_cache::run(op->type_as_UpdateCacheOp(), context); + } + case ::tt::target::ttnn::OpType::FillCacheOp: { + return operations::kv_cache::run(op->type_as_FillCacheOp(), context); + } default: { LOG_FATAL("Unsupported operation type"); } diff --git a/runtime/tools/python/ttrt/common/run.py b/runtime/tools/python/ttrt/common/run.py index 5dc93a50d9..19ad61e241 100644 --- a/runtime/tools/python/ttrt/common/run.py +++ b/runtime/tools/python/ttrt/common/run.py @@ -138,6 +138,13 @@ def initialize_api(): choices=[True, False], help="disable swap binary operands workaround", ) + Run.register_arg( + name="--disable-read-update-index-for-kv-cache", + type=bool, + default=False, + choices=[True, False], + help="disable read update index for kv cache workaround", + ) Run.register_arg( name="--result-file", type=str, @@ -351,6 +358,7 @@ def _execute(binaries): workaround_env = ttrt.runtime.WorkaroundEnv.get( not self["--disable-maxpool2d-preshard"], not self["--disable-swap-binary-operands"], + not self["--disable-read-update-index-for-kv-cache"], ) self.logging.debug(f"setting tt runtime workaround env={workaround_env}") self.logging.debug(f"setting torch manual seed={self['--seed']}") diff --git a/test/ttmlir/Dialect/TTNN/simple_constant.mlir b/test/ttmlir/Dialect/TTNN/simple_constant.mlir index 88df7aad24..017a1baf0c 100644 --- a/test/ttmlir/Dialect/TTNN/simple_constant.mlir +++ b/test/ttmlir/Dialect/TTNN/simple_constant.mlir @@ -3,31 +3,31 @@ module attributes {} { func.func @test_empty_int8() -> tensor<64x128xi8> { %0 = "ttir.constant"() <{value = dense<0> : tensor<64x128xi8>}> : () -> tensor<64x128xi8> - // CHECK: %{{[0-9]+}} = "ttnn.empty" + // CHECK: %{{[0-9]+}} = "ttnn.full" return %0 : tensor<64x128xi8> } func.func @test_empty_int16() -> tensor<64x128xi16> { %0 = "ttir.constant"() <{value = dense<0> : tensor<64x128xi16>}> : () -> tensor<64x128xi16> - // CHECK: %{{[0-9]+}} = "ttnn.empty" + // CHECK: %{{[0-9]+}} = "ttnn.full" return %0 : tensor<64x128xi16> } func.func @test_empty_int() -> tensor<64x128xi32> { %0 = "ttir.constant"() <{value = dense<0> : tensor<64x128xi32>}> : () -> tensor<64x128xi32> - // CHECK: %{{[0-9]+}} = "ttnn.empty" + // CHECK: %{{[0-9]+}} = "ttnn.full" return %0 : tensor<64x128xi32> } func.func @test_empty_bfloat16() -> tensor<64x128xbf16> { %0 = "ttir.constant"() <{value = dense<0.000000e+00> : tensor<64x128xbf16>}> : () -> tensor<64x128xbf16> - // CHECK: %{{[0-9]+}} = "ttnn.empty" + // CHECK: %{{[0-9]+}} = "ttnn.full" return %0 : tensor<64x128xbf16> } func.func @test_empty_float() -> tensor<64x128xf32> { %0 = "ttir.constant"() <{value = dense<0.000000e+00> : tensor<64x128xf32>}> : () -> tensor<64x128xf32> - // CHECK: %{{[0-9]+}} = "ttnn.empty" + // CHECK: %{{[0-9]+}} = "ttnn.full" return %0 : tensor<64x128xf32> } diff --git a/test/ttmlir/Silicon/StableHLO/Constant/constant_bf16.mlir b/test/ttmlir/Silicon/StableHLO/Constant/constant_bf16.mlir index 1a24e07595..636ea27167 100644 --- a/test/ttmlir/Silicon/StableHLO/Constant/constant_bf16.mlir 
+++ b/test/ttmlir/Silicon/StableHLO/Constant/constant_bf16.mlir @@ -18,7 +18,7 @@ module @jit_constant attributes {} { func.func public @test_bfloat16_scalar_empty() -> tensor { // CHECK-LABEL: func.func public @test_bfloat16_scalar_empty - // CHECK: ttnn.empty + // CHECK: ttnn.full // CHECK-SAME: -> tensor<1xbf16 %0 = stablehlo.constant dense<0.0> : tensor return %0 : tensor @@ -26,7 +26,7 @@ module @jit_constant attributes {} { func.func public @test_bfloat16_empty() -> tensor<64x128xbf16> { // CHECK-LABEL: func.func public @test_bfloat16_empty - // CHECK: ttnn.empty + // CHECK: ttnn.full // CHECK-SAME: -> tensor<64x128xbf16 %0 = stablehlo.constant dense<0.0> : tensor<64x128xbf16> return %0 : tensor<64x128xbf16> diff --git a/test/ttmlir/Silicon/StableHLO/Constant/constant_bool.mlir b/test/ttmlir/Silicon/StableHLO/Constant/constant_bool.mlir index 0c51294e3a..6486ff99c6 100644 --- a/test/ttmlir/Silicon/StableHLO/Constant/constant_bool.mlir +++ b/test/ttmlir/Silicon/StableHLO/Constant/constant_bool.mlir @@ -18,7 +18,7 @@ module @jit_constant attributes {} { func.func public @test_boolean_scalar_empty() -> tensor { // CHECK-LABEL: func.func public @test_boolean_scalar_empty - // CHECK: ttnn.empty + // CHECK: ttnn.full // CHECK-SAME: -> tensor<1xbf16 %0 = stablehlo.constant dense : tensor return %0 : tensor @@ -26,7 +26,7 @@ module @jit_constant attributes {} { func.func public @test_boolean_empty() -> tensor<64x128xi1> { // CHECK-LABEL: func.func public @test_boolean_empty - // CHECK: ttnn.empty + // CHECK: ttnn.full // CHECK-SAME: -> tensor<64x128xbf16 %0 = stablehlo.constant dense : tensor<64x128xi1> return %0 : tensor<64x128xi1> diff --git a/test/ttmlir/Silicon/StableHLO/Constant/constant_f32.mlir b/test/ttmlir/Silicon/StableHLO/Constant/constant_f32.mlir index 5a29facc78..3fecd90fb0 100644 --- a/test/ttmlir/Silicon/StableHLO/Constant/constant_f32.mlir +++ b/test/ttmlir/Silicon/StableHLO/Constant/constant_f32.mlir @@ -18,7 +18,7 @@ module @jit_constant attributes {} { func.func public @test_float_scalar_empty() -> tensor { // CHECK-LABEL: func.func public @test_float_scalar_empty - // CHECK: ttnn.empty + // CHECK: ttnn.full // CHECK-SAME: -> tensor<1xf32 %0 = stablehlo.constant dense<0.0> : tensor return %0 : tensor @@ -26,7 +26,7 @@ module @jit_constant attributes {} { func.func public @test_float_empty() -> tensor<64x128xf32> { // CHECK-LABEL: func.func public @test_float_empty - // CHECK: ttnn.empty + // CHECK: ttnn.full // CHECK-SAME: -> tensor<64x128xf32 %0 = stablehlo.constant dense<0.0> : tensor<64x128xf32> return %0 : tensor<64x128xf32> diff --git a/test/ttmlir/Silicon/StableHLO/Constant/constant_f64.mlir b/test/ttmlir/Silicon/StableHLO/Constant/constant_f64.mlir index cc39178165..c286745a09 100644 --- a/test/ttmlir/Silicon/StableHLO/Constant/constant_f64.mlir +++ b/test/ttmlir/Silicon/StableHLO/Constant/constant_f64.mlir @@ -18,7 +18,7 @@ module @jit_constant attributes {} { func.func public @test_f64_scalar_empty() -> tensor { // CHECK-LABEL: func.func public @test_f64_scalar_empty - // CHECK: ttnn.empty + // CHECK: ttnn.full // CHECK-SAME: -> tensor<1xf32 %0 = stablehlo.constant dense<0.0> : tensor return %0 : tensor @@ -26,7 +26,7 @@ module @jit_constant attributes {} { func.func public @test_f64_empty() -> tensor<64x128xf64> { // CHECK-LABEL: func.func public @test_f64_empty - // CHECK: ttnn.empty + // CHECK: ttnn.full // CHECK-SAME: -> tensor<64x128xf32 %0 = stablehlo.constant dense<0.0> : tensor<64x128xf64> return %0 : tensor<64x128xf64> diff --git 
a/test/ttmlir/Silicon/StableHLO/Constant/constant_i16.mlir b/test/ttmlir/Silicon/StableHLO/Constant/constant_i16.mlir index 8f4dc247f1..792cdc9d0f 100644 --- a/test/ttmlir/Silicon/StableHLO/Constant/constant_i16.mlir +++ b/test/ttmlir/Silicon/StableHLO/Constant/constant_i16.mlir @@ -18,7 +18,7 @@ module @jit_constant attributes {} { func.func public @test_int16_scalar_empty() -> tensor { // CHECK-LABEL: func.func public @test_int16_scalar_empty - // CHECK: ttnn.empty + // CHECK: ttnn.full // CHECK-SAME: -> tensor<1xi16 %0 = stablehlo.constant dense<0> : tensor return %0 : tensor @@ -26,7 +26,7 @@ module @jit_constant attributes {} { func.func public @test_int16_empty() -> tensor<64x128xi16> { // CHECK-LABEL: func.func public @test_int16_empty - // CHECK: ttnn.empty + // CHECK: ttnn.full // CHECK-SAME: -> tensor<64x128xi16 %0 = stablehlo.constant dense<0> : tensor<64x128xi16> return %0 : tensor<64x128xi16> diff --git a/test/ttmlir/Silicon/StableHLO/Constant/constant_i32.mlir b/test/ttmlir/Silicon/StableHLO/Constant/constant_i32.mlir index b5c73da0b9..813b08bcf8 100644 --- a/test/ttmlir/Silicon/StableHLO/Constant/constant_i32.mlir +++ b/test/ttmlir/Silicon/StableHLO/Constant/constant_i32.mlir @@ -18,7 +18,7 @@ module @jit_constant attributes {} { func.func public @test_int32_scalar_empty() -> tensor { // CHECK-LABEL: func.func public @test_int32_scalar_empty - // CHECK: ttnn.empty + // CHECK: ttnn.full // CHECK-SAME: -> tensor<1xi32 %0 = stablehlo.constant dense<0> : tensor return %0 : tensor @@ -26,7 +26,7 @@ module @jit_constant attributes {} { func.func public @test_int32_empty() -> tensor<64x128xi32> { // CHECK-LABEL: func.func public @test_int32_empty - // CHECK: ttnn.empty + // CHECK: ttnn.full // CHECK-SAME: -> tensor<64x128xi32 %0 = stablehlo.constant dense<0> : tensor<64x128xi32> return %0 : tensor<64x128xi32> diff --git a/test/ttmlir/Silicon/StableHLO/Constant/constant_i64.mlir b/test/ttmlir/Silicon/StableHLO/Constant/constant_i64.mlir index bf4a3e8cb2..0bcae491b5 100644 --- a/test/ttmlir/Silicon/StableHLO/Constant/constant_i64.mlir +++ b/test/ttmlir/Silicon/StableHLO/Constant/constant_i64.mlir @@ -18,7 +18,7 @@ module @jit_constant attributes {} { func.func public @test_int64_scalar_empty() -> tensor { // CHECK-LABEL: func.func public @test_int64_scalar_empty - // CHECK: ttnn.empty + // CHECK: ttnn.full // CHECK-SAME: -> tensor<1xi32 %0 = stablehlo.constant dense<0> : tensor return %0 : tensor @@ -26,7 +26,7 @@ module @jit_constant attributes {} { func.func public @test_int64_empty() -> tensor<64x128xi64> { // CHECK-LABEL: func.func public @test_int64_empty - // CHECK: ttnn.empty + // CHECK: ttnn.full // CHECK-SAME: -> tensor<64x128xi32 %0 = stablehlo.constant dense<0> : tensor<64x128xi64> return %0 : tensor<64x128xi64> diff --git a/test/ttmlir/Silicon/TTNN/kv_cache/fill_cache.mlir b/test/ttmlir/Silicon/TTNN/kv_cache/fill_cache.mlir new file mode 100644 index 0000000000..67bf8387b1 --- /dev/null +++ b/test/ttmlir/Silicon/TTNN/kv_cache/fill_cache.mlir @@ -0,0 +1,14 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir +// RUN: FileCheck %s --input-file=%t.mlir +// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +#any_device = #tt.operand_constraint +module { + func.func @forward(%arg0: tensor<1x32x64x512xbf16>, %arg1: tensor<1x32x3x512xbf16>) -> tensor<1x32x64x512xbf16> { + // CHECK: "ttnn.fill_cache"[[C:.*]] + %1 = "ttir.fill_cache"(%arg0, %arg1) <{batch_offset = 0: i32, operand_constraints = [#any_device, 
#any_device, #any_device]}> : (tensor<1x32x64x512xbf16>, tensor<1x32x3x512xbf16>) -> tensor<1x32x64x512xbf16> + %cst = "ttir.constant"() <{value = dense<1.000000e+00> : tensor<1x32x64x512xbf16>}> : () -> tensor<1x32x64x512xbf16> + %addition_dps = tensor.empty() : tensor<1x32x64x512xbf16> + %2 = "ttir.add"(%1, %cst, %addition_dps) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<1x32x64x512xbf16>, tensor<1x32x64x512xbf16>, tensor<1x32x64x512xbf16>) -> tensor<1x32x64x512xbf16> + return %2 : tensor<1x32x64x512xbf16> + } +} diff --git a/test/ttmlir/Silicon/TTNN/kv_cache/update_cache.mlir b/test/ttmlir/Silicon/TTNN/kv_cache/update_cache.mlir new file mode 100644 index 0000000000..63a08b3023 --- /dev/null +++ b/test/ttmlir/Silicon/TTNN/kv_cache/update_cache.mlir @@ -0,0 +1,15 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir +// RUN: FileCheck %s --input-file=%t.mlir +// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +#any_device = #tt.operand_constraint +module { + func.func @forward(%arg0: tensor<1x32x64x512xbf16>, %arg1: tensor<1x32x1x512xbf16>) -> tensor<1x32x64x512xbf16> { + // CHECK: "ttnn.update_cache"[[C:.*]] + %update_index = "ttir.constant"() <{value = dense<0> : tensor<1xi32>}> : () -> tensor<1xi32> + %1 = "ttir.update_cache"(%arg0, %arg1, %update_index) <{batch_offset = 0: i32, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<1x32x64x512xbf16>, tensor<1x32x1x512xbf16>, tensor<1xi32>) -> tensor<1x32x64x512xbf16> + %cst = "ttir.constant"() <{value = dense<1.000000e+00> : tensor<1x32x64x512xbf16>}> : () -> tensor<1x32x64x512xbf16> + %addition_dps = tensor.empty() : tensor<1x32x64x512xbf16> + %2 = "ttir.add"(%1, %cst, %addition_dps) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<1x32x64x512xbf16>, tensor<1x32x64x512xbf16>, tensor<1x32x64x512xbf16>) -> tensor<1x32x64x512xbf16> + return %2 : tensor<1x32x64x512xbf16> + } +} diff --git a/test/ttmlir/Silicon/TTNN/simple_constant.mlir b/test/ttmlir/Silicon/TTNN/simple_constant.mlir index 4f33870c0f..35728f0a93 100644 --- a/test/ttmlir/Silicon/TTNN/simple_constant.mlir +++ b/test/ttmlir/Silicon/TTNN/simple_constant.mlir @@ -4,19 +4,19 @@ module @sysmem_creation attributes {} { func.func @test_empty_int() -> tensor<64x128xi32> { %0 = "ttir.constant"() <{value = dense<0> : tensor<64x128xi32>}> : () -> tensor<64x128xi32> - // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] + // CHECK: %[[C:.*]] = "ttnn.full"[[C:.*]] return %0 : tensor<64x128xi32> } func.func @test_empty_float() -> tensor<64x128xf32> { %0 = "ttir.constant"() <{value = dense<0.000000e+00> : tensor<64x128xf32>}> : () -> tensor<64x128xf32> - // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] + // CHECK: %[[C:.*]] = "ttnn.full"[[C:.*]] return %0 : tensor<64x128xf32> } func.func @test_empty_float_scalar() -> tensor<1x1xf32> { %0 = "ttir.constant"() <{value = dense<0.000000e+00> : tensor<1x1xf32>}> : () -> tensor<1x1xf32> - // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] + // CHECK: %[[C:.*]] = "ttnn.full"[[C:.*]] return %0 : tensor<1x1xf32> } From fa326aa2b3c1f58e9b303d60250776bd2e0ee550 Mon Sep 17 00:00:00 2001 From: Vladimir Milosevic <157983820+vmilosevic@users.noreply.github.com> Date: Thu, 5 Dec 2024 19:48:43 +0100 Subject: [PATCH 61/84] Uplift third_party/tt-metal to 3389120b3747b521fa9b7ef333a379554359d961 2024-12-05 (#1512) * Uplift third_party/tt-metal to 
3389120b3747b521fa9b7ef333a379554359d961 2024-12-05 * Fixed namespace of conv2d configuration --------- Co-authored-by: kmitrovicTT <169657397+kmitrovicTT@users.noreply.github.com> Co-authored-by: Andrej Jakovljevic --- runtime/lib/ttnn/operations/conv/conv2d.cpp | 2 +- runtime/lib/ttnn/operations/pool/maxpool2d.cpp | 13 ++++++------- third_party/CMakeLists.txt | 2 +- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/runtime/lib/ttnn/operations/conv/conv2d.cpp b/runtime/lib/ttnn/operations/conv/conv2d.cpp index 5e00b929e7..dfc60d4445 100644 --- a/runtime/lib/ttnn/operations/conv/conv2d.cpp +++ b/runtime/lib/ttnn/operations/conv/conv2d.cpp @@ -21,7 +21,7 @@ void run(const ::tt::target::ttnn::Conv2dOp *op, ProgramContext &context) { std::optional<::ttnn::Tensor> bias = op->bias() ? std::make_optional(tensorPool.at(op->bias()->global_id())) : std::nullopt; - auto config = ::ttnn::operations::conv::conv2d::Conv2dConfig(); + auto config = ::ttnn::operations::conv::Conv2dConfig(); config.dtype = utils::getDataType(op->input()); config.weights_dtype = utils::getDataType(op->weight()); ::ttnn::MemoryConfig outMemConfig = diff --git a/runtime/lib/ttnn/operations/pool/maxpool2d.cpp b/runtime/lib/ttnn/operations/pool/maxpool2d.cpp index ddbe639c74..a20bdc51b4 100644 --- a/runtime/lib/ttnn/operations/pool/maxpool2d.cpp +++ b/runtime/lib/ttnn/operations/pool/maxpool2d.cpp @@ -33,13 +33,12 @@ preshardForMaxPool2d(const ::tt::target::ttnn::MaxPool2dOp *op, constexpr bool en_ch_padding = false; - auto parallel_config = - ::ttnn::operations::conv::conv2d::determine_parallel_config( - ::ttnn::TensorMemoryLayout::HEIGHT_SHARDED, op->batch_size(), - op->channels(), output_height, output_width, op->channels(), - device.compute_with_storage_grid_size(), ShardOrientation::ROW_MAJOR, - en_ch_padding); - auto sharded_memory_config = ::ttnn::operations::conv::conv2d:: + auto parallel_config = ::ttnn::operations::conv::determine_parallel_config( + ::ttnn::TensorMemoryLayout::HEIGHT_SHARDED, op->batch_size(), + op->channels(), output_height, output_width, op->channels(), + device.compute_with_storage_grid_size(), ShardOrientation::ROW_MAJOR, + en_ch_padding); + auto sharded_memory_config = ::ttnn::operations::conv:: create_sharded_memory_config_from_parallel_config(inputShape, parallel_config, 1); return ::ttnn::to_memory_config(input, sharded_memory_config, std::nullopt); diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt index dac58a37fe..e6b7b5cd51 100644 --- a/third_party/CMakeLists.txt +++ b/third_party/CMakeLists.txt @@ -1,6 +1,6 @@ include(ExternalProject) -set(TT_METAL_VERSION "28e140381c515c053308c97cb952eba2176abf20") +set(TT_METAL_VERSION "3389120b3747b521fa9b7ef333a379554359d961") if ("$ENV{ARCH_NAME}" STREQUAL "grayskull") set(ARCH_NAME "grayskull") From 357853836b0a00ba296ceb8d2c679bc8886638d7 Mon Sep 17 00:00:00 2001 From: Milan Topalovic <163355844+mtopalovicTT@users.noreply.github.com> Date: Fri, 6 Dec 2024 11:32:35 +0100 Subject: [PATCH 62/84] Removing ttnn::TensorMemoryLayout::None (#1502) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Removed `None` from `ttnn::TensorMemoryLayout`: • Going forward, when creating a tensor encoding attribute, you can set `TensorMemoryLayoutAttr` to `nullptr` to represent tensors on the host. 2. Removed some duplicate methods from `TTNNLayout`: • Removed duplicate methods such as `isSystemBufferType, isDeviceBufferType, and isL1BufferType`. 3. 
`MemoryConfigAttr` Updates: • `TensorMemoryLayout` is for now optional parameter. This change addresses cases where tensors are transferred from a device to the host using layout operation. In these scenarios, the attribute can be `nullptr`. • This happens during conversion pass from `TTIR` to `TTNN` in ToLayoutOp. It looks that there is growing need to create new composite op in `TTNN` which will be used instead of `ToLayoutOp`. 4. `TTNNLayout` Updates: • `TensorMemoryLayout` is now optional parameter. For tensors on the host, this attribute can now be set to nullptr. 5. Python API Adjustments/CAPI: • When constructing `TTNNLayoutAttr` or `MemoryConfigAttr` via the Python API/CAPI, if `TensorMemoryLayout` is not provided, it will default to nullptr. • Updated the getter for `memory_layout_as_int` to raise an exception if TensorMemoryLayout is not set. 6. TTNNWorkarounds: * TensorMemoryLayout workaround is now optional, since we want to handle cases where tensor is on host. In other places I did this by creating TensorMemoryLayoutAttr and if tensor is on host I set it to nullptr, but this is not an option for workarounds because it would require to introduce dependency on MlirContext, which is not needed for simple workaround. --- include/ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.h | 20 +++-- .../ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.td | 77 +++++++++--------- .../ttmlir/Dialect/TTNN/IR/TTNNOpsEnums.td | 2 - .../ttmlir/Dialect/TTNN/IR/TTNNWorkarounds.h | 5 +- include/ttmlir/Target/TTNN/utils.h | 2 - lib/CAPI/TTNNAttrs.cpp | 28 ++++--- lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp | 25 +++--- lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp | 2 - .../TTNN/Analysis/DFShardingPolicy.cpp | 4 +- lib/Dialect/TTNN/IR/TTNNOps.cpp | 24 +++--- lib/Dialect/TTNN/IR/TTNNOpsAttrs.cpp | 81 ++++++++++--------- lib/Dialect/TTNN/IR/TTNNWorkarounds.cpp | 10 ++- lib/Dialect/TTNN/Transforms/Optimizer.cpp | 23 +++--- lib/Dialect/TTNN/Transforms/Passes.cpp | 16 ++-- lib/Dialect/TTNN/Transforms/TTNNLayout.cpp | 34 ++++---- .../TTNN/Transforms/TTNNWorkarounds.cpp | 32 +++++--- lib/Dialect/TTNN/Utils/Utils.cpp | 10 +-- lib/Target/TTNN/TTNNToFlatbuffer.cpp | 15 ++-- python/TTNNModule.cpp | 45 +++++++---- .../Workarounds/simple_workaround.mlir | 12 +-- .../TTNN/eltwise/unary/relu/simple_relu.mlir | 2 +- .../Dialect/TTNN/matmul/simple_matmul.mlir | 2 +- .../optimizer/input_layout_loc_override.mlir | 4 +- .../all_l1_interleaved_policy.mlir | 4 +- .../l1_interleaved_policy/fork_join.mlir | 6 +- .../mnist_l1_interleaved.mlir | 4 +- .../simple_join_tests/dram_ABC_l1_None.mlir | 2 +- .../simple_join_tests/dram_AB_l1_C.mlir | 6 +- .../simple_join_tests/dram_AC_l1_B.mlir | 4 +- .../simple_join_tests/dram_A_l1_BC.mlir | 4 +- .../simple_join_tests/dram_BC_l1_A.mlir | 4 +- .../simple_join_tests/dram_B_l1_AC.mlir | 4 +- .../simple_join_tests/dram_C_l1_AB.mlir | 6 +- .../simple_join_tests/dram_None_l1_ABC.mlir | 2 +- .../TTNN/optimizer/mnist_sharding.mlir | 4 +- .../TTNN/optimizer/multiple_add_with_loc.mlir | 2 +- ...le_add_with_loc_input_layout_override.mlir | 4 +- ...e_add_with_loc_output_layout_override.mlir | 6 +- .../Dialect/TTNN/optimizer/test_grid_set.mlir | 14 ++-- .../test_override_reshard_edges.mlir | 22 ++--- .../TTNN/optimizer/ttir_to_ttnn_pipeline.mlir | 2 +- .../TTNN/test_remove_dead_values_pass.mlir | 34 ++++---- .../ttir_to_ttnn_pipeline_custom_opt.mlir | 2 +- .../eltwise_binary_op_chain.mlir | 10 +-- .../ttmlir/Silicon/TTNN/emitc/simple_add.mlir | 2 +- .../TTNN/optimizer/mnist_sharding_tiled.mlir | 4 +- 
.../TTNN/perf_unit/test_perf_matmul.mlir | 2 +- test/ttmlir/Silicon/TTNN/simple_matmul.mlir | 2 +- .../Optimizer/TestL1InterleavedPolicy.cpp | 6 +- test/unittests/Optimizer/TestShardSolver.cpp | 6 +- 50 files changed, 346 insertions(+), 297 deletions(-) diff --git a/include/ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.h b/include/ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.h index 944157846d..790c49228c 100644 --- a/include/ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.h +++ b/include/ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.h @@ -14,14 +14,21 @@ namespace mlir::tt::ttnn { -inline bool isSystemBufferType(mlir::tt::ttnn::BufferType bufferType) { - return bufferType == mlir::tt::ttnn::BufferType::SystemMemory; +inline bool isSystemBufferType(BufferType bufferType) { + return bufferType == BufferType::SystemMemory; } -inline bool isDeviceBufferType(mlir::tt::ttnn::BufferType bufferType) { - return bufferType == mlir::tt::ttnn::BufferType::L1 || - bufferType == mlir::tt::ttnn::BufferType::DRAM || - bufferType == mlir::tt::ttnn::BufferType::L1Small; +inline bool isDeviceBufferType(BufferType bufferType) { + return bufferType == BufferType::L1 || bufferType == BufferType::DRAM || + bufferType == BufferType::L1Small; +} + +inline bool isL1BufferType(BufferType bufferType) { + return bufferType == BufferType::L1; +} + +inline bool isDRAMBufferType(BufferType bufferType) { + return bufferType == BufferType::DRAM; } inline bool isShardedMemoryLayout(TensorMemoryLayout layout) { @@ -29,6 +36,7 @@ inline bool isShardedMemoryLayout(TensorMemoryLayout layout) { layout == TensorMemoryLayout::WidthSharded || layout == TensorMemoryLayout::BlockSharded; } + } // namespace mlir::tt::ttnn #define GET_ATTRDEF_CLASSES diff --git a/include/ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.td b/include/ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.td index d8aea834fb..ba8b7a724e 100644 --- a/include/ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.td +++ b/include/ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.td @@ -81,9 +81,9 @@ def TTNN_MemoryConfigAttr : TTNN_Attr<"MemoryConfig", "memory_config"> { TTNN memory config attribute }]; - let parameters = (ins AttrParameter<"TensorMemoryLayoutAttr", "">:$tensorMemoryLayout, - AttrParameter<"BufferTypeAttr", "">:$bufferType, - AttrParameter<"ShardSpecAttr", "">:$shardSpec); + let parameters = (ins AttrParameter<"BufferTypeAttr", "">:$bufferType, + AttrParameter<"ShardSpecAttr", "">:$shardSpec, + OptionalParameter<"TensorMemoryLayoutAttr">:$tensorMemoryLayout); let assemblyFormat = "`<` params `>`"; @@ -124,7 +124,7 @@ def TTNN_TTNNLayoutAttr: TTNN_Attr<"TTNNLayout", "ttnn_layout"> { let parameters = (ins AttrParameter<"AffineMap", "An affine map that defines how the logical tensor dimensions map to a grid shape.">:$linear, AttrParameter<"GridAttr", "The grid shape that this tensor is divided onto.">:$grid, AttrParameter<"MemRefType", "A memref that describes the physical footprint allocation of the shard. It must also have a shape with rank equal to grid.">:$memref, - DefaultValuedParameter<"TensorMemoryLayout", "TensorMemoryLayout::None", "The layout of the tensor in memory.">:$mem_layout); + OptionalParameter<"TensorMemoryLayoutAttr", "TTNN tensor memory layout">:$mem_layout); let assemblyFormat = "`<` $linear`,` $grid`,` $memref (`,` $mem_layout^)? 
`>`"; let extraClassDeclaration = [{ static TTNNLayoutAttr get(::mlir::MLIRContext *context, @@ -132,40 +132,43 @@ def TTNN_TTNNLayoutAttr: TTNN_Attr<"TTNNLayout", "ttnn_layout"> { Type elementType, BufferType bufferType, GridAttr grid, - TensorMemoryLayout memoryLayout, + TensorMemoryLayoutAttr memoryLayoutAttr = nullptr, ArrayRef> collapseIntervals = {{0, -1}}); - uint64_t getShardSizeInBytes() const; - BufferType getBufferType() const; - TTNNLayoutAttr withGrid(::mlir::MLIRContext *context, ArrayRef tensorShape, GridAttr grid, ArrayRef> collapseIntervals = {{0, -1}}); - TTNNLayoutAttr withGrid(::mlir::MLIRContext *context, - RankedTensorType ty, - GridAttr grid, - ArrayRef> collapseIntervals = {{0, -1}}); - TTNNLayoutAttr withElementType(::mlir::MLIRContext *context, Type elementType); - TTNNLayoutAttr withBufferType(::mlir::MLIRContext *context, BufferType bufferType); - TTNNLayoutAttr withMemoryLayout(::mlir::MLIRContext *context, TensorMemoryLayout memLayout); - TTNNLayoutAttr withShardShape(::mlir::MLIRContext *context, llvm::SmallVector shardShape); - - bool isSystemBufferType() const { return ::mlir::tt::ttnn::isSystemBufferType(getBufferType()); } - bool isDeviceBufferType() const { return ::mlir::tt::ttnn::isDeviceBufferType(getBufferType()); } - bool hasShardedTensorMemoryLayout() const; - bool hasShardedL1TensorMemoryLayout() const; - bool hasInterleavedL1TensorMemoryLayout() const; - bool hasInterleavedDRAMTensorMemoryLayout() const; - bool hasL1BufferType() const; - bool hasDRAMBufferType() const; - bool isTiled() const; - Layout getLayout() const; - Type getElementType() const; - DataType getDataType() const; - uint64_t getElementSizeBytes() const; - int64_t getTensorSizeInBytes(ArrayRef tensorShape, ::mlir::tt::DeviceAttr device) const; - llvm::SmallVector getStride(ArrayRef logicalShape) const; - llvm::SmallVector getShardShape() const; - llvm::SmallVector getScalarShardShape() const; - AffineMap replaceMemoryMapSymbolsWithShardShape(AffineMap physicalMemoryMap) const; - AffineMap getIdentityTileLinearMap() const; - llvm::SmallVector getTiledShape(ArrayRef logicalTensorShape) const; + + TTNNLayoutAttr withGrid(::mlir::MLIRContext *context, ArrayRef tensorShape, GridAttr grid, ArrayRef> collapseIntervals = {{0, -1}}); + TTNNLayoutAttr withGrid(::mlir::MLIRContext *context, + RankedTensorType ty, + GridAttr grid, + ArrayRef> collapseIntervals = {{0, -1}}); + TTNNLayoutAttr withElementType(::mlir::MLIRContext *context, Type elementType); + TTNNLayoutAttr withBufferType(::mlir::MLIRContext *context, BufferType bufferType); + TTNNLayoutAttr withMemoryLayout(::mlir::MLIRContext *context, TensorMemoryLayoutAttr memLayoutAttr); + TTNNLayoutAttr withMemoryLayout(::mlir::MLIRContext *context, TensorMemoryLayout memLayout); + TTNNLayoutAttr withShardShape(::mlir::MLIRContext *context, llvm::SmallVector shardShape); + + bool isSystemBufferType() const { return ::mlir::tt::ttnn::isSystemBufferType(getBufferType()); } + bool isDeviceBufferType() const { return ::mlir::tt::ttnn::isDeviceBufferType(getBufferType()); } + bool isTiled() const; + bool hasShardedTensorMemoryLayout() const; + bool hasShardedL1TensorMemoryLayout() const; + bool hasInterleavedL1TensorMemoryLayout() const; + bool hasInterleavedDRAMTensorMemoryLayout() const; + bool hasDRAMBufferType() const; + bool hasL1BufferType() const; + Layout getLayout() const; + std::optional getMemLayoutOpt() const; + Type getElementType() const; + uint64_t getShardSizeInBytes() const; + BufferType getBufferType() const; + DataType 
getDataType() const; + uint64_t getElementSizeBytes() const; + int64_t getTensorSizeInBytes(ArrayRef tensorShape, ::mlir::tt::DeviceAttr device) const; + llvm::SmallVector getStride(ArrayRef logicalShape) const; + llvm::SmallVector getShardShape() const; + llvm::SmallVector getScalarShardShape() const; + AffineMap getIdentityTileLinearMap() const; + llvm::SmallVector getTiledShape(ArrayRef logicalTensorShape) const; + AffineMap replaceMemoryMapSymbolsWithShardShape(AffineMap physicalMemoryMap) const; }]; } diff --git a/include/ttmlir/Dialect/TTNN/IR/TTNNOpsEnums.td b/include/ttmlir/Dialect/TTNN/IR/TTNNOpsEnums.td index 1b580a3a8b..0dfe811965 100644 --- a/include/ttmlir/Dialect/TTNN/IR/TTNNOpsEnums.td +++ b/include/ttmlir/Dialect/TTNN/IR/TTNNOpsEnums.td @@ -21,7 +21,6 @@ def TTNN_Layout : I32EnumAttr<"Layout", "TTNN Layout", let cppNamespace = "::mlir::tt::ttnn"; } -def TTNN_TensorMemoryLayout_None : I32EnumAttrCase<"None", 0, "none">; def TTNN_TensorMemoryLayout_Interleaved : I32EnumAttrCase<"Interleaved", 1, "interleaved">; def TTNN_TensorMemoryLayout_SingleBank : I32EnumAttrCase<"SingleBank", 2, "single_bank">; def TTNN_TensorMemoryLayout_HeightSharded : I32EnumAttrCase<"HeightSharded", 3, "height_sharded">; @@ -30,7 +29,6 @@ def TTNN_TensorMemoryLayout_BlockSharded : I32EnumAttrCase<"BlockSharded", 5, "b def TTNN_TensorMemoryLayout : I32EnumAttr<"TensorMemoryLayout", "TTNN Tensor Memory Layout", [ - TTNN_TensorMemoryLayout_None, TTNN_TensorMemoryLayout_Interleaved, TTNN_TensorMemoryLayout_SingleBank, TTNN_TensorMemoryLayout_HeightSharded, diff --git a/include/ttmlir/Dialect/TTNN/IR/TTNNWorkarounds.h b/include/ttmlir/Dialect/TTNN/IR/TTNNWorkarounds.h index 7795623384..4122b0ca03 100644 --- a/include/ttmlir/Dialect/TTNN/IR/TTNNWorkarounds.h +++ b/include/ttmlir/Dialect/TTNN/IR/TTNNWorkarounds.h @@ -93,8 +93,9 @@ struct WorkaroundResult { // Target tensor buffer type. std::pair targetTensorBufferTypeResult; - // Target tensor memory layout. - std::pair targetTensorMemoryLayoutResult; + // Target tensor memory layout. Can be nullopt for tensors on host. + std::pair, bool> + targetTensorMemoryLayoutResult; // Returns true if any of the workarounds were applied. 
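[Editor's note, illustrative only and not code from the patch: with TensorMemoryLayout::None removed, "no memory layout" is now expressed by a null TensorMemoryLayoutAttr, and the layout attribute becomes the optional last parameter of the builders. A sketch of the two typical calls under that convention, where ctx, shape, elementType and grid stand in for whatever the caller already has:

    // Host tensor: system memory, no tensor memory layout at all.
    ttnn::TTNNLayoutAttr hostLayout = ttnn::TTNNLayoutAttr::get(
        ctx, shape, elementType, ttnn::BufferType::SystemMemory, grid,
        /*memLayoutAttr=*/nullptr);

    // Device tensor: DRAM, interleaved, with the layout attribute spelled out.
    ttnn::TTNNLayoutAttr dramLayout = ttnn::TTNNLayoutAttr::get(
        ctx, shape, elementType, ttnn::BufferType::DRAM, grid,
        ttnn::TensorMemoryLayoutAttr::get(
            ctx, ttnn::TensorMemoryLayout::Interleaved));

The MemoryConfigAttr call sites below follow the same pattern: buffer type and shard spec first, then a TensorMemoryLayoutAttr that may be null for host tensors.]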
bool modified() const { diff --git a/include/ttmlir/Target/TTNN/utils.h b/include/ttmlir/Target/TTNN/utils.h index e3f642a2d9..201cc1ee3b 100644 --- a/include/ttmlir/Target/TTNN/utils.h +++ b/include/ttmlir/Target/TTNN/utils.h @@ -26,8 +26,6 @@ ::tt::target::TensorMemoryLayout toTargetTensorMemoryLayout( return ::tt::target::TensorMemoryLayout::WidthSharded; case ::mlir::tt::ttnn::TensorMemoryLayout::BlockSharded: return ::tt::target::TensorMemoryLayout::BlockSharded; - case ::mlir::tt::ttnn::TensorMemoryLayout::None: - return ::tt::target::TensorMemoryLayout::None; } llvm_unreachable("Unsupported TensorMemoryLayout"); diff --git a/lib/CAPI/TTNNAttrs.cpp b/lib/CAPI/TTNNAttrs.cpp index 677f22fb42..467d8c0044 100644 --- a/lib/CAPI/TTNNAttrs.cpp +++ b/lib/CAPI/TTNNAttrs.cpp @@ -53,10 +53,9 @@ MlirAttribute ttmlirTTNNMemoryConfigAttrGet( MlirContext ctx, MlirAttribute tensorMemoryLayoutAttr, MlirAttribute bufferTypeAttr, MlirAttribute shardSpecAttr) { return wrap(MemoryConfigAttr::get( - unwrap(ctx), - mlir::cast(unwrap(tensorMemoryLayoutAttr)), - mlir::cast(unwrap(bufferTypeAttr)), - mlir::cast(unwrap(shardSpecAttr)))); + unwrap(ctx), mlir::cast(unwrap(bufferTypeAttr)), + mlir::cast(unwrap(shardSpecAttr)), + mlir::cast(unwrap(tensorMemoryLayoutAttr)))); } MlirAttribute ttmlirTTNNShapeAttrGet(MlirContext ctx, int64_t *shape, @@ -69,14 +68,25 @@ MlirAttribute ttmlirTTNNMeshShapeAttrGet(MlirContext ctx, int64_t y, return wrap(MeshShapeAttr::get(unwrap(ctx), y, x)); } +// Get layout TTNNLayout attribute +// +// param ctx: mlir context +// param linear Affine map for mapping tensor from logical to physical space +// param grid Grid of cores where tensor is mapped to +// param memref Memref which holds shard size, shard scalar type and memory +// param memLayout Memory layout of the tensor MlirAttribute ttmlirTTNNTTNNLayoutAttrGet(MlirContext ctx, MlirAffineMap linear, MlirAttribute grid, MlirType memref, - unsigned memLayout) { + unsigned *memLayout = nullptr) { mlir::AffineMap affineMap = mlir::AffineMap::getFromOpaquePointer(linear.ptr); - return wrap(TTNNLayoutAttr::get(unwrap(ctx), affineMap, - mlir::cast(unwrap(grid)), - mlir::cast(unwrap(memref)), - static_cast(memLayout))); + TensorMemoryLayoutAttr memLayoutAttr; + if (memLayout) { + memLayoutAttr = TensorMemoryLayoutAttr::get( + unwrap(ctx), static_cast(*memLayout)); + } + return wrap(TTNNLayoutAttr::get( + unwrap(ctx), affineMap, mlir::cast(unwrap(grid)), + mlir::cast(unwrap(memref)), memLayoutAttr)); } } // namespace mlir::tt::ttnn diff --git a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp index b7404ec4d1..bf216d3629 100644 --- a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp +++ b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp @@ -63,8 +63,8 @@ class TensorEmptyConversionPattern // If the tensor is not going to device, we can create the op without // device-specific attributes // - ttnn::TensorMemoryLayout memLayout = layoutAttr.getMemLayout(); - if (memLayout == ttnn::TensorMemoryLayout::None) { + ttnn::TensorMemoryLayoutAttr memLayout = layoutAttr.getMemLayout(); + if (!memLayout) { rewriter.replaceOpWithNewOp( op, this->getTypeConverter()->convertType(op.getType()), nullptr, shapeAttr, dTypeAttr, tensorLayoutAttr, nullptr); @@ -79,12 +79,10 @@ class TensorEmptyConversionPattern auto device = ::ttnn::utils::getOrInsertDevice(rewriter, op); llvm::SmallVector shardShape = layoutAttr.getShardShape(); ttnn::MemoryConfigAttr memoryConfigAttr = ttnn::MemoryConfigAttr::get( - op.getContext(), - 
ttnn::TensorMemoryLayoutAttr::get(op.getContext(), memLayout), - ttnn::BufferTypeAttr::get(op.getContext(), bufferType), + op.getContext(), ttnn::BufferTypeAttr::get(op.getContext(), bufferType), ttnn::ShardSpecAttr::get( - op.getContext(), - ttnn::ShapeAttr::get(op.getContext(), shardShape))); + op.getContext(), ttnn::ShapeAttr::get(op.getContext(), shardShape)), + memLayout); rewriter.replaceOpWithNewOp( op, this->getTypeConverter()->convertType(op.getType()), device, @@ -159,17 +157,13 @@ class ToLayoutOpConversionPattern llvm::SmallVector outputShardShape = outputLayoutAttr.getShardShape(); - // Determine output memory config attr - ttnn::TensorMemoryLayout outputTensorMemoryLayout = - outputLayoutAttr.getMemLayout(); ttnn::MemoryConfigAttr outputMemConfigAttr = ttnn::MemoryConfigAttr::get( rewriter.getContext(), - ttnn::TensorMemoryLayoutAttr::get(rewriter.getContext(), - outputTensorMemoryLayout), ttnn::BufferTypeAttr::get(rewriter.getContext(), outputBufferType), ttnn::ShardSpecAttr::get( op.getContext(), - ttnn::ShapeAttr::get(rewriter.getContext(), outputShardShape))); + ttnn::ShapeAttr::get(rewriter.getContext(), outputShardShape)), + outputLayoutAttr.getMemLayout()); rewriter.replaceOpWithNewOp( op, this->getTypeConverter()->convertType(result), adaptor.getInput(), @@ -950,11 +944,10 @@ class ArangeOpConversionPattern : public OpConversionPattern { ttnn::MemoryConfigAttr memConfigAttr = rewriter.getAttr( - rewriter.getAttr( - layoutAttr.getMemLayout()), rewriter.getAttr(layoutAttr.getBufferType()), rewriter.getAttr( - rewriter.getAttr(layoutAttr.getShardShape()))); + rewriter.getAttr(layoutAttr.getShardShape())), + layoutAttr.getMemLayout()); rewriter.replaceOpWithNewOp( op, outputType, adaptor.getStart(), adaptor.getEnd(), adaptor.getStep(), diff --git a/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp b/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp index dc3fc1cde8..3986438e64 100644 --- a/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp +++ b/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp @@ -86,8 +86,6 @@ emitc::OpaqueAttr convertTensorMemoryLayout(Builder &builder, case ttnn::TensorMemoryLayout::WidthSharded: return builder.getType( "ttnn::TensorMemoryLayout::WIDTH_SHARDED"); - case ttnn::TensorMemoryLayout::None: - llvm_unreachable("Unsupported ttnn::TensorMemoryLayout"); } } diff --git a/lib/Dialect/TTNN/Analysis/DFShardingPolicy.cpp b/lib/Dialect/TTNN/Analysis/DFShardingPolicy.cpp index b83409d477..8d5f22bfc4 100644 --- a/lib/Dialect/TTNN/Analysis/DFShardingPolicy.cpp +++ b/lib/Dialect/TTNN/Analysis/DFShardingPolicy.cpp @@ -217,9 +217,11 @@ void DFShardingPolicy::pickOpShardLayouts(ShardSolver &shardSolver, maxCoreUsage = accMaxCoreUsage[op][layoutIterator.index()]; selectedLayout = layoutIterator.get(); } else if (accMaxCoreUsage[op][layoutIterator.index()] == maxCoreUsage) { + assert(layoutIterator->getMemLayout() && + "TensorMemoryLayout is not set"); // If we have a tie, prefer layout that is not BlockSharded. 
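[Editor's aside, a sketch rather than code from the patch: the conversions and analyses touched here converge on the same guard idiom once the None case is gone, namely check the attribute for null before reading its value. Roughly, with layoutAttr being any ttnn::TTNNLayoutAttr pulled off a tensor type, as in the TensorEmptyConversionPattern and the DFShardingPolicy assert above:

    // A null TensorMemoryLayoutAttr now means the tensor lives on host.
    ttnn::TensorMemoryLayoutAttr memLayout = layoutAttr.getMemLayout();
    if (!memLayout) {
      // Host path: there is no interleaving or sharding to reason about.
    } else if (memLayout.getValue() ==
               ttnn::TensorMemoryLayout::Interleaved) {
      // Device path: safe to inspect the concrete layout value.
    }
]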
// - if (layoutIterator->getMemLayout() != + if (layoutIterator->getMemLayout().getValue() != ttnn::TensorMemoryLayout::BlockSharded) { selectedLayout = layoutIterator.get(); } diff --git a/lib/Dialect/TTNN/IR/TTNNOps.cpp b/lib/Dialect/TTNN/IR/TTNNOps.cpp index 94f2507dc9..00fa36c278 100644 --- a/lib/Dialect/TTNN/IR/TTNNOps.cpp +++ b/lib/Dialect/TTNN/IR/TTNNOps.cpp @@ -205,10 +205,11 @@ ::mlir::LogicalResult mlir::tt::ttnn::EmptyOp::verify() { // if (getMemoryConfig().has_value()) { ttnn::BufferType bufferType = layoutAttr.getBufferType(); - ttnn::TensorMemoryLayout tensorMemoryLayout = layoutAttr.getMemLayout(); + ttnn::TensorMemoryLayoutAttr tensorMemoryLayoutAttr = + layoutAttr.getMemLayout(); assert(bufferType == getMemoryConfig()->getBufferType().getValue()); - assert(tensorMemoryLayout == - getMemoryConfig()->getTensorMemoryLayout().getValue()); + assert(tensorMemoryLayoutAttr == + getMemoryConfig()->getTensorMemoryLayout()); } // // ============================== @@ -547,9 +548,10 @@ ::mlir::LogicalResult mlir::tt::ttnn::EmbeddingOp::verify() { //===----------------------------------------------------------------------===// // Utility methods -static bool isValidDeviceLayout(TensorMemoryLayout layout) { - return layout == TensorMemoryLayout::Interleaved || - isShardedMemoryLayout(layout); +static bool isValidDeviceLayout(TensorMemoryLayoutAttr memLayoutAttr) { + return memLayoutAttr && + (memLayoutAttr.getValue() == TensorMemoryLayout::Interleaved || + isShardedMemoryLayout(memLayoutAttr.getValue())); } // ToMemoryConfigOp verification @@ -567,11 +569,7 @@ ::mlir::LogicalResult mlir::tt::ttnn::ToMemoryConfigOp::verify() { return emitOpError("Output tensor type missing layout attribute"); } BufferType outputBufferType = outputLayout.getBufferType(); - TensorMemoryLayout outputMemoryLayout = outputLayout.getMemLayout(); - if (isSystemBufferType(outputBufferType) && - outputMemoryLayout != TensorMemoryLayout::None) { - return emitOpError("System memory space only supports undef memory layout"); - } + TensorMemoryLayoutAttr outputMemoryLayout = outputLayout.getMemLayout(); if (isDeviceBufferType(outputBufferType) && !isValidDeviceLayout(outputMemoryLayout)) { @@ -580,7 +578,7 @@ ::mlir::LogicalResult mlir::tt::ttnn::ToMemoryConfigOp::verify() { } if (outputBufferType == BufferType::DRAM && - outputMemoryLayout != TensorMemoryLayout::Interleaved) { + outputMemoryLayout.getValue() != TensorMemoryLayout::Interleaved) { return emitOpError( "Device DRAM memory space only supports interleaved memory layout"); } @@ -594,7 +592,7 @@ ::mlir::LogicalResult mlir::tt::ttnn::ToMemoryConfigOp::verify() { if (shardShape.size() != 2) { return emitOpError("Shard shape must be 2D"); } - if (outputMemoryLayout == TensorMemoryLayout::BlockSharded) { + if (outputMemoryLayout.getValue() == TensorMemoryLayout::BlockSharded) { // TTNN tiles are (32, 32), shard shape must evenly divide the tile shape if (shardShape[0] % TILE_HEIGHT != 0 or shardShape[1] % TILE_WIDTH != 0) { return emitOpError( diff --git a/lib/Dialect/TTNN/IR/TTNNOpsAttrs.cpp b/lib/Dialect/TTNN/IR/TTNNOpsAttrs.cpp index fc692b0f1d..3f6c88e2b1 100644 --- a/lib/Dialect/TTNN/IR/TTNNOpsAttrs.cpp +++ b/lib/Dialect/TTNN/IR/TTNNOpsAttrs.cpp @@ -14,26 +14,6 @@ using namespace mlir::tt::ttnn; -// Check if tensor is on host -inline bool isSystemBufferType(BufferType bufferType) { - return bufferType == BufferType::SystemMemory; -} - -// Check if the tensor is on device -inline bool isDeviceBufferType(BufferType bufferType) { - return bufferType == 
BufferType::DRAM || bufferType == BufferType::L1; -} - -// Check if tensor is in DRAM memory -inline bool isDRAMBufferType(BufferType bufferType) { - return bufferType == BufferType::DRAM; -} - -// Check if tensor is in L1 memory -inline bool isL1BufferType(BufferType bufferType) { - return bufferType == BufferType::L1; -} - // Check if the tensor is tiled bool TTNNLayoutAttr::isTiled() const { return ::mlir::isa<::mlir::tt::TileType>(getElementType()); @@ -44,6 +24,12 @@ Layout TTNNLayoutAttr::getLayout() const { return isTiled() ? Layout::Tile : Layout::RowMajor; } +// Get optinoal memory layout +std::optional TTNNLayoutAttr::getMemLayoutOpt() const { + return getMemLayout() ? std::make_optional(getMemLayout().getValue()) + : std::nullopt; +} + // Check if the tensor memory buffer type is L1 bool TTNNLayoutAttr::hasL1BufferType() const { return isL1BufferType(getBufferType()); @@ -56,29 +42,30 @@ bool TTNNLayoutAttr::hasDRAMBufferType() const { // Check if the tensor memory layout is sharded bool TTNNLayoutAttr::hasShardedTensorMemoryLayout() const { - return (getMemLayout() == TensorMemoryLayout::HeightSharded || - getMemLayout() == TensorMemoryLayout::WidthSharded || - getMemLayout() == TensorMemoryLayout::BlockSharded); + return isDeviceBufferType() && + (getMemLayout().getValue() == TensorMemoryLayout::HeightSharded || + getMemLayout().getValue() == TensorMemoryLayout::WidthSharded || + getMemLayout().getValue() == TensorMemoryLayout::BlockSharded); } // Check if the tensor memory layout is sharded in L1 memory bool TTNNLayoutAttr::hasShardedL1TensorMemoryLayout() const { return hasL1BufferType() && - (getMemLayout() == TensorMemoryLayout::HeightSharded || - getMemLayout() == TensorMemoryLayout::WidthSharded || - getMemLayout() == TensorMemoryLayout::BlockSharded); + (getMemLayout().getValue() == TensorMemoryLayout::HeightSharded || + getMemLayout().getValue() == TensorMemoryLayout::WidthSharded || + getMemLayout().getValue() == TensorMemoryLayout::BlockSharded); } // Check if the tensor memory layout is interleaved and in L1 memory bool TTNNLayoutAttr::hasInterleavedL1TensorMemoryLayout() const { return hasL1BufferType() && - (getMemLayout() == TensorMemoryLayout::Interleaved); + (getMemLayout().getValue() == TensorMemoryLayout::Interleaved); } // Check if the tensor memory layout is interleaved and in DRAM memory bool TTNNLayoutAttr::hasInterleavedDRAMTensorMemoryLayout() const { return hasDRAMBufferType() && - (getMemLayout() == TensorMemoryLayout::Interleaved); + (getMemLayout().getValue() == TensorMemoryLayout::Interleaved); } // Get stride given tensor logical shape @@ -393,15 +380,32 @@ TTNNLayoutAttr TTNNLayoutAttr::withBufferType(::mlir::MLIRContext *context, // replaces the memory layout with the given one. // // param context The MLIR context. -// param memLayout The new memory layout. +// param memLayoutAttr The new memory layout. // return The new TTNNLayoutAttr with the given memory layout. -TTNNLayoutAttr TTNNLayoutAttr::withMemoryLayout(::mlir::MLIRContext *context, - TensorMemoryLayout memLayout) { +TTNNLayoutAttr +TTNNLayoutAttr::withMemoryLayout(::mlir::MLIRContext *context, + TensorMemoryLayoutAttr memLayoutAttr) { return TTNNLayoutAttr::get( context, getLinear(), getGrid(), buildMemRef( context, getScalarShardShape(), getElementType(), getBufferType()), - memLayout); + memLayoutAttr); +} + +// Construct a new TTNNLayoutAttr +// +// This function creates a deep copy of the current TTNNLayoutAttr and +// replaces the memory layout with the given one. 
+// +// param context The MLIR context. +// param memLayout The new memory layout. +// return The new TTNNLayoutAttr with the given memory layout. +TTNNLayoutAttr TTNNLayoutAttr::withMemoryLayout(::mlir::MLIRContext *context, + TensorMemoryLayout memLayout) { + + TensorMemoryLayoutAttr memLayoutAttr = + TensorMemoryLayoutAttr::get(context, memLayout); + return withMemoryLayout(context, memLayoutAttr); } // Construct a new TTNNLayoutAttr @@ -437,7 +441,7 @@ TTNNLayoutAttr::withShardShape(::mlir::MLIRContext *context, TTNNLayoutAttr TTNNLayoutAttr::get( ::mlir::MLIRContext *context, ArrayRef tensorShape, Type elementType, BufferType bufferType, GridAttr grid, - TensorMemoryLayout memLayout, + TensorMemoryLayoutAttr memLayoutAttr, ArrayRef> collapseIntervals) { // Construct a new affine map which will be used to map from logical // space to physical space @@ -450,7 +454,7 @@ TTNNLayoutAttr TTNNLayoutAttr::get( // Build memref type with the given parameters MemRefType memRefType = buildMemRef( context, shardShape, elementType, bufferType); - return get(context, linear, grid, memRefType, memLayout); + return get(context, linear, grid, memRefType, memLayoutAttr); } // Construct a new MemoryConfig @@ -463,9 +467,9 @@ TTNNLayoutAttr TTNNLayoutAttr::get( // return The new MemoryConfigAttr with the given buffer type. MemoryConfigAttr MemoryConfigAttr::withBufferType(::mlir::MLIRContext *context, BufferType bufferType) { - return MemoryConfigAttr::get(context, getTensorMemoryLayout(), + return MemoryConfigAttr::get(context, BufferTypeAttr::get(context, bufferType), - getShardSpec()); + getShardSpec(), getTensorMemoryLayout()); } // Construct a new MemoryConfig @@ -479,7 +483,6 @@ MemoryConfigAttr MemoryConfigAttr::withBufferType(::mlir::MLIRContext *context, MemoryConfigAttr MemoryConfigAttr::withMemoryLayout(::mlir::MLIRContext *context, TensorMemoryLayout memLayout) { - return MemoryConfigAttr::get(context, - TensorMemoryLayoutAttr::get(context, memLayout), - getBufferType(), getShardSpec()); + return MemoryConfigAttr::get(context, getBufferType(), getShardSpec(), + TensorMemoryLayoutAttr::get(context, memLayout)); } diff --git a/lib/Dialect/TTNN/IR/TTNNWorkarounds.cpp b/lib/Dialect/TTNN/IR/TTNNWorkarounds.cpp index c1977747fb..0dd7eaaafd 100644 --- a/lib/Dialect/TTNN/IR/TTNNWorkarounds.cpp +++ b/lib/Dialect/TTNN/IR/TTNNWorkarounds.cpp @@ -46,12 +46,16 @@ WorkaroundResult applyWorkarounds(const TTNNOperandWorkarounds &workaround, result.targetTensorBufferTypeResult.first != inputLayoutAttr.getBufferType(); + // If the tensor memory layout workaround is present, apply it. + // Otherwise, return the input tensor memory layout, which may be + // nullopt if tensor is on host. result.targetTensorMemoryLayoutResult.first = - workaround.tensorMemoryLayoutWorkaround.value_or( - inputLayoutAttr.getMemLayout()); + workaround.tensorMemoryLayoutWorkaround.has_value() + ? 
workaround.tensorMemoryLayoutWorkaround + : inputLayoutAttr.getMemLayoutOpt(); result.targetTensorMemoryLayoutResult.second = result.targetTensorMemoryLayoutResult.first != - inputLayoutAttr.getMemLayout(); + inputLayoutAttr.getMemLayoutOpt(); return result; } diff --git a/lib/Dialect/TTNN/Transforms/Optimizer.cpp b/lib/Dialect/TTNN/Transforms/Optimizer.cpp index 783f3ea07f..51f731841a 100644 --- a/lib/Dialect/TTNN/Transforms/Optimizer.cpp +++ b/lib/Dialect/TTNN/Transforms/Optimizer.cpp @@ -274,7 +274,8 @@ class TTNNOptimizer : public impl::TTNNOptimizerBase { // if (isa(op)) { BufferType bufferType = layoutAttr.getBufferType(); - TensorMemoryLayout tensorMemoryLayout = layoutAttr.getMemLayout(); + TensorMemoryLayoutAttr tensorMemoryLayoutAttr = + layoutAttr.getMemLayout(); op->getOperands().back().setType(newTensorType); EmptyOp emptyOp = @@ -288,13 +289,12 @@ class TTNNOptimizer : public impl::TTNNOptimizerBase { } emptyOp.setMemoryConfigAttr(ttnn::MemoryConfigAttr::get( op->getContext(), - TensorMemoryLayoutAttr::get(op->getContext(), - tensorMemoryLayout), BufferTypeAttr::get(op->getContext(), bufferType), ShardSpecAttr::get( op->getContext(), ShapeAttr::get(op->getContext(), - layoutAttr.getMemref().getShape())))); + layoutAttr.getMemref().getShape())), + tensorMemoryLayoutAttr)); } // TODO(mtopalovic): Temp workaround for generic ToLayoutOp. Allign // MemoryConfigAttr with layout attribute of its output tensor. This @@ -303,19 +303,19 @@ class TTNNOptimizer : public impl::TTNNOptimizerBase { // else if (isa(op)) { BufferType bufferType = layoutAttr.getBufferType(); - TensorMemoryLayout tensorMemoryLayout = layoutAttr.getMemLayout(); + TensorMemoryLayoutAttr tensorMemoryLayoutAttr = + layoutAttr.getMemLayout(); // Update the device op with the new tensor type. // ttnn::ToLayoutOp toLayoutOp = llvm::cast(op); toLayoutOp.setMemoryConfigAttr(ttnn::MemoryConfigAttr::get( op->getContext(), - ttnn::TensorMemoryLayoutAttr::get(op->getContext(), - tensorMemoryLayout), ttnn::BufferTypeAttr::get(op->getContext(), bufferType), ttnn::ShardSpecAttr::get( op->getContext(), ttnn::ShapeAttr::get(op->getContext(), - layoutAttr.getMemref().getShape())))); + layoutAttr.getMemref().getShape())), + tensorMemoryLayoutAttr)); } } }); @@ -451,19 +451,18 @@ class TTNNOptimizer : public impl::TTNNOptimizerBase { consumerOpOutputLayout.getGrid())); BufferType outputBufferType = consumerOpOutputLayout.getBufferType(); - TensorMemoryLayout outputTensorMemoryLayout = + TensorMemoryLayoutAttr outputTensorMemoryLayoutAttr = consumerOpOutputLayout.getMemLayout(); llvm::SmallVector shardShape = consumerOpOutputLayout.getShardShape(); MemoryConfigAttr outputMemConfigAttr = MemoryConfigAttr::get( consumerOp->getContext(), - TensorMemoryLayoutAttr::get(consumerOp->getContext(), - outputTensorMemoryLayout), BufferTypeAttr::get(consumerOp->getContext(), outputBufferType), ShardSpecAttr::get( consumerOp->getContext(), - ShapeAttr::get(consumerOp->getContext(), shardShape))); + ShapeAttr::get(consumerOp->getContext(), shardShape)), + outputTensorMemoryLayoutAttr); // If producerOp is a toLayoutOp, adjust its output layout(update // inplace) to reflect consumerOp's output layout. 
If producerOp is not a diff --git a/lib/Dialect/TTNN/Transforms/Passes.cpp b/lib/Dialect/TTNN/Transforms/Passes.cpp index e22540a7da..01971b6c61 100644 --- a/lib/Dialect/TTNN/Transforms/Passes.cpp +++ b/lib/Dialect/TTNN/Transforms/Passes.cpp @@ -130,16 +130,15 @@ class TTNNDecomposeLayouts ttnn::BufferType bufferType; ttnn::Layout layoutEnum; DataType dataType; - ttnn::TensorMemoryLayout tensorMemoryLayout; + ttnn::TensorMemoryLayoutAttr tensorMemoryLayout; llvm::ArrayRef shardShape; ttnn::MemoryConfigAttr createMemoryConfigAttr(MLIRContext *context) const { return ttnn::MemoryConfigAttr::get( - context, - ttnn::TensorMemoryLayoutAttr::get(context, tensorMemoryLayout), - ttnn::BufferTypeAttr::get(context, bufferType), + context, ttnn::BufferTypeAttr::get(context, bufferType), ttnn::ShardSpecAttr::get(context, - ttnn::ShapeAttr::get(context, shardShape))); + ttnn::ShapeAttr::get(context, shardShape)), + tensorMemoryLayout); } bool isOnHost() const { @@ -219,8 +218,7 @@ class TTNNDecomposeLayouts output.dataType = op.getDtype().value(); input.tensorMemoryLayout = inputLayoutAttr.getMemLayout(); - output.tensorMemoryLayout = - outputMemoryConfig.getTensorMemoryLayout().getValue(); + output.tensorMemoryLayout = outputMemoryConfig.getTensorMemoryLayout(); input.shardShape = inputLayoutAttr.getShardShape(); output.shardShape = outputMemoryConfig.getShardShapeArray(); @@ -251,8 +249,8 @@ class TTNNDecomposeLayouts // device tensor if (not opsToCreate.createToDeviceOp and output.isOnDevice()) { opsToCreate.createToMemoryConfigOp = - (input.tensorMemoryLayout != output.tensorMemoryLayout) and - (output.tensorMemoryLayout != ttnn::TensorMemoryLayout::None); + output.tensorMemoryLayout && + (input.tensorMemoryLayout != output.tensorMemoryLayout); opsToCreate.createToMemoryConfigOp |= (input.bufferType == ttnn::BufferType::DRAM and output.bufferType == ttnn::BufferType::L1) or diff --git a/lib/Dialect/TTNN/Transforms/TTNNLayout.cpp b/lib/Dialect/TTNN/Transforms/TTNNLayout.cpp index 2d4a2ff8f5..9036346a43 100644 --- a/lib/Dialect/TTNN/Transforms/TTNNLayout.cpp +++ b/lib/Dialect/TTNN/Transforms/TTNNLayout.cpp @@ -75,7 +75,7 @@ class TTNNLayoutTensorTypeConverter : public TypeConverter { TTNNLayoutAttr newLayout = TTNNLayoutAttr::get( ctx, type.getShape(), type.getElementType(), g_defaultMemorySpaceHost, - tensorGrid, TensorMemoryLayout::None, collapseDimsRef); + tensorGrid, nullptr /* memLayoutAttr */, collapseDimsRef); return RankedTensorType::get(type.getShape(), type.getElementType(), newLayout); }); @@ -154,23 +154,22 @@ class TTNNLayoutTensorTypeRewriter : public RewritePattern { static std::optional createToLayoutOp(PatternRewriter &rewriter, Location loc, Value input, BufferType desiredBufferType, - TensorMemoryLayout desiredMemLayout, bool tiled) { + TensorMemoryLayoutAttr desiredMemLayoutAttr, bool tiled) { // Get type RankedTensorType ty = mlir::cast(input.getType()); // Get ttnn layout from the type - TTNNLayoutAttr tensorConfig = mlir::cast(ty.getEncoding()); + TTNNLayoutAttr ttnnLayoutAttr = mlir::cast(ty.getEncoding()); // Get buffer type (i.e DRAM/L1 etc) - BufferType currBufferType = tensorConfig.getBufferType(); + BufferType currBufferType = ttnnLayoutAttr.getBufferType(); // Get the current element type (i.e bf16/TileType etc) - Type currElementType = tensorConfig.getElementType(); + Type currElementType = ttnnLayoutAttr.getElementType(); - // Get the mem layout attribute (i.e interleaved/sharded or null in case of - // System) - TensorMemoryLayout currMemLayout = 
tensorConfig.getMemLayout(); + // Get mem layout. If the tensor is on host layout is null + TensorMemoryLayoutAttr currMemLayout = ttnnLayoutAttr.getMemLayout(); // Get element type that should be used in the new ttnn layout Type desiredElementType = @@ -181,7 +180,7 @@ createToLayoutOp(PatternRewriter &rewriter, Location loc, Value input, // the desired ones, we don't need to do anything if (currBufferType == desiredBufferType && currElementType == desiredElementType && - currMemLayout == desiredMemLayout) { + currMemLayout == desiredMemLayoutAttr) { return std::nullopt; } @@ -189,7 +188,7 @@ createToLayoutOp(PatternRewriter &rewriter, Location loc, Value input, // memory layout TTNNLayoutAttr desiredLayout = rewriter.getAttr( ty.getShape(), desiredElementType, desiredBufferType, - tensorConfig.getGrid(), desiredMemLayout, g_defaultCollapseDims); + ttnnLayoutAttr.getGrid(), desiredMemLayoutAttr, g_defaultCollapseDims); // If the input tensor is a constant or empty tensor, we can replace it with a // new tensor with the desired layout @@ -224,7 +223,7 @@ createToLayoutOp(PatternRewriter &rewriter, Location loc, Value input, if (existingArange) { TTNNLayoutAttr arangeLayout = rewriter.getAttr( ty.getShape(), ty.getElementType(), desiredBufferType, - tensorConfig.getGrid(), desiredMemLayout, g_defaultCollapseDims); + ttnnLayoutAttr.getGrid(), desiredMemLayoutAttr, g_defaultCollapseDims); input = rewriter .replaceOpWithNewOp( @@ -264,15 +263,20 @@ createToLayoutOp(PatternRewriter &rewriter, Location loc, Value input, utils::toTTTensorMemoryLayout(g_defaultMemoryLayout); tt::TensorMemoryLayout desiredMemoryLayout = getLegalTensorMemoryLayout( operandConstraint, desiredMemorySpace, ttMemoryLayout); - TensorMemoryLayout ttnnMemoryLayout = - utils::toTTNNTensorMemoryLayout(desiredMemoryLayout); + TensorMemoryLayoutAttr ttnnMemoryLayoutAttr; + if (desiredMemoryLayout != tt::TensorMemoryLayout::None) { + TensorMemoryLayout ttnnMemoryLayout = + utils::toTTNNTensorMemoryLayout(desiredMemoryLayout); + ttnnMemoryLayoutAttr = + TensorMemoryLayoutAttr::get(rewriter.getContext(), ttnnMemoryLayout); + } // Check if the tensor should be tiled bool tiled = !bitEnumContainsAny(operandConstraint, OperandConstraint::Scalar); return createToLayoutOp(rewriter, loc, input, desiredBufferType, - ttnnMemoryLayout, tiled); + ttnnMemoryLayoutAttr, tiled); } // Updates the layout of the operands of a TTIR ops which have DPS operands. 
@@ -355,7 +359,7 @@ class TTNNLayoutFuncReturnRewriter appendInputSuffix(op.getLoc(), operand.getOperandNumber()); std::optional layout = createToLayoutOp( rewriter, newLoc, operand.get(), BufferType::SystemMemory, - TensorMemoryLayout::None, false /* tiled */); + nullptr /* tensorMemoryLayoutAttr */, false /* tiled */); if (layout.has_value()) { rewriter.modifyOpInPlace( op, [&]() { op.setOperand(operand.getOperandNumber(), *layout); }); diff --git a/lib/Dialect/TTNN/Transforms/TTNNWorkarounds.cpp b/lib/Dialect/TTNN/Transforms/TTNNWorkarounds.cpp index d3e40277b5..bba5d0bcd9 100644 --- a/lib/Dialect/TTNN/Transforms/TTNNWorkarounds.cpp +++ b/lib/Dialect/TTNN/Transforms/TTNNWorkarounds.cpp @@ -61,23 +61,29 @@ static mlir::Value createToLayoutOp(wa::TTNNWorkaroundInterface &op, OpOperand &inputOperand, PatternRewriter &rewriter, Layout targetTensorLayout, BufferType targetTensorBufferType, - TensorMemoryLayout targetTensorMemoryLayout) { + std::optional targetTensorMemoryLayout) { TTNNLayoutAttr inputLayoutAttr = getLayoutAttrFromOpOperand(inputOperand); // Create element type based on tensor layout. Type elementType = getElementType(rewriter.getContext(), targetTensorLayout, inputLayoutAttr.getDataType()); + // Create tensor memory layout attribute. + ttnn::TensorMemoryLayoutAttr outputMemLayoutAttr = + targetTensorMemoryLayout.has_value() + ? ttnn::TensorMemoryLayoutAttr::get(rewriter.getContext(), + targetTensorMemoryLayout.value()) + : nullptr; + // Create the output memory config attribute. ttnn::MemoryConfigAttr outputMemConfigAttr = ttnn::MemoryConfigAttr::get( rewriter.getContext(), - ttnn::TensorMemoryLayoutAttr::get(rewriter.getContext(), - targetTensorMemoryLayout), ttnn::BufferTypeAttr::get(rewriter.getContext(), targetTensorBufferType), ttnn::ShardSpecAttr::get( op.getContext(), ttnn::ShapeAttr::get(rewriter.getContext(), - inputLayoutAttr.getMemref().getShape()))); + inputLayoutAttr.getMemref().getShape())), + outputMemLayoutAttr); // Get the input operand type. RankedTensorType inputOperandType = @@ -94,7 +100,7 @@ createToLayoutOp(wa::TTNNWorkaroundInterface &op, OpOperand &inputOperand, .withElementType(rewriter.getContext(), elementType) .withBufferType(rewriter.getContext(), targetTensorBufferType) .withMemoryLayout(rewriter.getContext(), - targetTensorMemoryLayout)), + outputMemLayoutAttr)), inputOperand.get(), LayoutAttr::get(rewriter.getContext(), targetTensorLayout), DataTypeAttr::get(rewriter.getContext(), @@ -185,6 +191,15 @@ static bool workaroundOutputOperand( RankedTensorType opResultType = mlir::cast(opResult.getType()); + // Create tensor memory layout attribute. + TensorMemoryLayoutAttr outputMemLayoutAttr = + outputWorkaroundResult.targetTensorMemoryLayoutResult.first.has_value() + ? ttnn::TensorMemoryLayoutAttr::get( + rewriter.getContext(), + outputWorkaroundResult.targetTensorMemoryLayoutResult.first + .value()) + : nullptr; + // Create the new output result type with the updated tensor layout, buffer // type and memory layout. RankedTensorType newOutputResultType = @@ -194,9 +209,7 @@ static bool workaroundOutputOperand( .withBufferType( rewriter.getContext(), outputWorkaroundResult.targetTensorBufferTypeResult.first) - .withMemoryLayout( - rewriter.getContext(), - outputWorkaroundResult.targetTensorMemoryLayoutResult.first)); + .withMemoryLayout(rewriter.getContext(), outputMemLayoutAttr)); // Update the type of result with applied workarounds. 
rewriter.modifyOpInPlace(op, [&]() { @@ -231,7 +244,8 @@ static bool workaroundOutputOperand( if (outputWorkaroundResult.targetTensorMemoryLayoutResult.second) { currentMemoryConfig = currentMemoryConfig.withMemoryLayout( rewriter.getContext(), - outputWorkaroundResult.targetTensorMemoryLayoutResult.first); + outputWorkaroundResult.targetTensorMemoryLayoutResult.first + .value()); } // Update the changed memory config attribute. diff --git a/lib/Dialect/TTNN/Utils/Utils.cpp b/lib/Dialect/TTNN/Utils/Utils.cpp index 0aa7b9272c..0156299218 100644 --- a/lib/Dialect/TTNN/Utils/Utils.cpp +++ b/lib/Dialect/TTNN/Utils/Utils.cpp @@ -38,11 +38,9 @@ mlir::tt::ttnn::TensorMemoryLayout toTTNNTensorMemoryLayout( return ttnn::TensorMemoryLayout::BlockSharded; case ::mlir::tt::TensorMemoryLayout::SingleBank: return ttnn::TensorMemoryLayout::SingleBank; - case ::mlir::tt::TensorMemoryLayout::None: - return ttnn::TensorMemoryLayout::None; + default: + llvm_unreachable("Unknown TensorMemoryLayout"); } - - llvm_unreachable("Unknown TensorMemoryLayout"); } mlir::tt::TensorMemoryLayout toTTTensorMemoryLayout( @@ -59,9 +57,9 @@ mlir::tt::TensorMemoryLayout toTTTensorMemoryLayout( return ::mlir::tt::TensorMemoryLayout::BlockSharded; case ttnn::TensorMemoryLayout::SingleBank: return ::mlir::tt::TensorMemoryLayout::SingleBank; - case ttnn::TensorMemoryLayout::None: - return ::mlir::tt::TensorMemoryLayout::None; } + + llvm_unreachable("Unknown TensorMemoryLayout"); } mlir::tt::MemorySpace diff --git a/lib/Target/TTNN/TTNNToFlatbuffer.cpp b/lib/Target/TTNN/TTNNToFlatbuffer.cpp index 827901516d..d0d65ad874 100644 --- a/lib/Target/TTNN/TTNNToFlatbuffer.cpp +++ b/lib/Target/TTNN/TTNNToFlatbuffer.cpp @@ -40,8 +40,13 @@ namespace mlir::tt { ::tt::target::TensorMemoryLayout -toFlatbuffer(FlatbufferObjectCache &, ttnn::TensorMemoryLayout memLayout) { - switch (memLayout) { +toFlatbuffer(FlatbufferObjectCache &, + ttnn::TensorMemoryLayoutAttr memLayoutAttr) { + if (!memLayoutAttr) { + return ::tt::target::TensorMemoryLayout::None; + } + + switch (memLayoutAttr.getValue()) { case ttnn::TensorMemoryLayout::SingleBank: return ::tt::target::TensorMemoryLayout::SingleBank; case ttnn::TensorMemoryLayout::Interleaved: @@ -52,8 +57,6 @@ toFlatbuffer(FlatbufferObjectCache &, ttnn::TensorMemoryLayout memLayout) { return ::tt::target::TensorMemoryLayout::WidthSharded; case ttnn::TensorMemoryLayout::BlockSharded: return ::tt::target::TensorMemoryLayout::BlockSharded; - case ttnn::TensorMemoryLayout::None: - return ::tt::target::TensorMemoryLayout::None; } } @@ -73,7 +76,7 @@ ::tt::target::MemorySpace toFlatbuffer(FlatbufferObjectCache &, flatbuffers::Offset<::tt::target::MemoryDesc> memrefAttrToFlatbuffer(FlatbufferObjectCache &cache, mlir::MemRefType memref, - ttnn::TensorMemoryLayout memLayout) { + ttnn::TensorMemoryLayoutAttr memLayoutAttr) { auto shapeInt64 = memref.getShape(); std::vector shape(shapeInt64.begin(), shapeInt64.end()); DataType dtype = DataType::Float32; @@ -100,7 +103,7 @@ memrefAttrToFlatbuffer(FlatbufferObjectCache &cache, mlir::MemRefType memref, toFlatbuffer( cache, mlir::cast(memref.getMemorySpace()).getValue()), - toFlatbuffer(cache, memLayout), size); + toFlatbuffer(cache, memLayoutAttr), size); } flatbuffers::Offset<::tt::target::LayoutDesc> ttnnLayoutAttrToFlatbuffer( diff --git a/python/TTNNModule.cpp b/python/TTNNModule.cpp index 11e47982da..a2bdb6e041 100644 --- a/python/TTNNModule.cpp +++ b/python/TTNNModule.cpp @@ -85,23 +85,26 @@ void populateTTNNModule(py::module &m) { tt::ttnn::BufferTypeAttr 
bufferTypeAttr, tt::ttnn::ShardSpecAttr shardSpecAttr) { return wrap(tt::ttnn::MemoryConfigAttr::get( - unwrap(ctx), tensorMemoryLayoutAttr, bufferTypeAttr, - shardSpecAttr)); + unwrap(ctx), bufferTypeAttr, shardSpecAttr, + tensorMemoryLayoutAttr)); }) .def_static( "get_by_value", [](MlirContext ctx, uint32_t tensorMemoryLayout, uint32_t bufferType, std::vector shardShape) { - return wrap(tt::ttnn::MemoryConfigAttr::get( - unwrap(ctx), + tt::ttnn::TensorMemoryLayoutAttr layoutAttr = tt::ttnn::TensorMemoryLayoutAttr::get( unwrap(ctx), static_cast( - tensorMemoryLayout)), + tensorMemoryLayout)); + + return wrap(tt::ttnn::MemoryConfigAttr::get( + unwrap(ctx), tt::ttnn::BufferTypeAttr::get( unwrap(ctx), static_cast(bufferType)), tt::ttnn::ShardSpecAttr::get( unwrap(ctx), - tt::ttnn::ShapeAttr::get(unwrap(ctx), shardShape)))); + tt::ttnn::ShapeAttr::get(unwrap(ctx), shardShape)), + layoutAttr)); }) .def_property_readonly("tensor_memory_layout", &tt::ttnn::MemoryConfigAttr::getTensorMemoryLayout) @@ -130,15 +133,22 @@ void populateTTNNModule(py::module &m) { .def_property_readonly("x", &tt::ttnn::MeshShapeAttr::getX); tt_attribute_class(m, "TTNNLayoutAttr") - .def_static("get", - [](MlirContext ctx, MlirAffineMap linear, MlirAttribute grid, - MlirType memref, unsigned memLayout) { - return wrap(tt::ttnn::TTNNLayoutAttr::get( - unwrap(ctx), mlir::cast(unwrap(linear)), - mlir::cast(unwrap(grid)), - mlir::cast(unwrap(memref)), - static_cast(memLayout))); - }) + .def_static( + "get", + [](MlirContext ctx, MlirAffineMap linear, MlirAttribute grid, + MlirType memref, + std::optional memLayout = std::nullopt) { + tt::ttnn::TensorMemoryLayoutAttr memLayoutAttr; + if (memLayout.has_value()) { + memLayoutAttr = tt::ttnn::TensorMemoryLayoutAttr::get( + unwrap(ctx), + static_cast(memLayout.value())); + } + return wrap(tt::ttnn::TTNNLayoutAttr::get( + unwrap(ctx), mlir::cast(unwrap(linear)), + mlir::cast(unwrap(grid)), + mlir::cast(unwrap(memref)), memLayoutAttr)); + }) .def_property_readonly( "linear", [](tt::ttnn::TTNNLayoutAttr self) { return wrap(self.getLinear()); }) @@ -148,7 +158,10 @@ void populateTTNNModule(py::module &m) { [](tt::ttnn::TTNNLayoutAttr self) { return wrap(self.getMemref()); }) .def_property_readonly( "memory_layout_as_int", [](tt::ttnn::TTNNLayoutAttr self) { - return static_cast(self.getMemLayout()); + if (!self.getMemLayout()) { + assert(false && "Memory layout is not set"); + } + return static_cast(self.getMemLayout().getValue()); }); } } // namespace mlir::ttmlir::python diff --git a/test/ttmlir/Dialect/TTNN/Transforms/Workarounds/simple_workaround.mlir b/test/ttmlir/Dialect/TTNN/Transforms/Workarounds/simple_workaround.mlir index 9eed399840..e08ffcd405 100644 --- a/test/ttmlir/Dialect/TTNN/Transforms/Workarounds/simple_workaround.mlir +++ b/test/ttmlir/Dialect/TTNN/Transforms/Workarounds/simple_workaround.mlir @@ -3,20 +3,20 @@ #dram = #ttnn.buffer_type #system_memory = #ttnn.buffer_type #ttnn_layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xf32, #system_memory>> -#ttnn_layout1 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xf32, #dram>, interleaved> -#ttnn_layout2 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, f32>, #dram>, interleaved> +#ttnn_layout1 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xf32, #dram>, > +#ttnn_layout2 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, f32>, #dram>, > module attributes {tt.device = #device} { func.func @forward(%arg0: 
tensor<64x128xf32, #ttnn_layout>) -> tensor<64x128xf32, #ttnn_layout> { %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> // CHECK: %[[DEVICE_OP:.*]] = "ttnn.get_device"[[C:.*]] - %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, #dram, <<2x4>>>}> : (tensor<64x128xf32, #ttnn_layout>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<2x4>>, >}> : (tensor<64x128xf32, #ttnn_layout>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> // CHECK-NEXT: %[[RM_DEVICE_LAYOUT_OP:.*]] = "ttnn.to_layout"(%arg0, %[[DEVICE_OP]]) // CHECK-SAME: layout = #ttnn.layout // CHECK-SAME: -> tensor<64x128xf32, #ttnn_layout1> - %2 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, #dram, <<64x128>>>, shape = #ttnn.shape<64x128>}> : (!tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout2> + %2 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<64x128>>, >, shape = #ttnn.shape<64x128>}> : (!tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout2> // CHECK-NEXT: %[[EMPTY_OP:.*]] = "ttnn.empty"(%[[DEVICE_OP]]) // CHECK-SAME: layout = #ttnn.layout - // CHECK-SAME: memory_config = #ttnn.memory_config<, #dram, <<64x128>>> + // CHECK-SAME: memory_config = #ttnn.memory_config<#dram, <<64x128>>, > // CHECK-SAME: -> tensor<64x128xf32, #ttnn_layout1> %3 = "ttnn.abs"(%1, %2) <{operandSegmentSizes = array}> : (tensor<64x128xf32, #ttnn_layout1>, tensor<64x128xf32, #ttnn_layout2>) -> tensor<64x128xf32, #ttnn_layout2> // CHECK-NEXT: %[[TO_LAYOUT_LEFT:.*]] = "ttnn.to_layout"(%[[RM_DEVICE_LAYOUT_OP]], %[[DEVICE_OP]]) @@ -25,7 +25,7 @@ module attributes {tt.device = #device} { // CHECK-NEXT: %[[TO_LAYOUT_RIGHT:.*]] = "ttnn.to_layout"(%[[EMPTY_OP]], %[[DEVICE_OP]]) // CHECK-SAME: layout = #ttnn.layout // CHECK-SAME: -> tensor<64x128xf32, #ttnn_layout2> - %4 = "ttnn.to_layout"(%3) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, #system_memory, <<64x128>>>}> : (tensor<64x128xf32, #ttnn_layout2>) -> tensor<64x128xf32, #ttnn_layout> + %4 = "ttnn.to_layout"(%3) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#system_memory, <<64x128>>>}> : (tensor<64x128xf32, #ttnn_layout2>) -> tensor<64x128xf32, #ttnn_layout> return %4 : tensor<64x128xf32, #ttnn_layout> } } diff --git a/test/ttmlir/Dialect/TTNN/eltwise/unary/relu/simple_relu.mlir b/test/ttmlir/Dialect/TTNN/eltwise/unary/relu/simple_relu.mlir index ce6887e2a8..1d75b8ee02 100644 --- a/test/ttmlir/Dialect/TTNN/eltwise/unary/relu/simple_relu.mlir +++ b/test/ttmlir/Dialect/TTNN/eltwise/unary/relu/simple_relu.mlir @@ -4,7 +4,7 @@ #system = #ttnn.buffer_type #ttnn_layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xf32, #system>> #ttnn_layout1 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <8x8>, memref<8x16xf32, #system>> -#ttnn_layout2 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <8x8>, memref<8x16xf32, #l1>, interleaved> +#ttnn_layout2 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <8x8>, memref<8x16xf32, #l1>, > module attributes {} { func.func @forward(%arg0: tensor<64x128xf32, #ttnn_layout>) -> tensor<64x128xf32, #ttnn_layout1> { // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] diff --git 
a/test/ttmlir/Dialect/TTNN/matmul/simple_matmul.mlir b/test/ttmlir/Dialect/TTNN/matmul/simple_matmul.mlir index 63af0b5b49..f82ed85752 100644 --- a/test/ttmlir/Dialect/TTNN/matmul/simple_matmul.mlir +++ b/test/ttmlir/Dialect/TTNN/matmul/simple_matmul.mlir @@ -1,6 +1,6 @@ // RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline %s | FileCheck %s #any_device_tile = #tt.operand_constraint -// CHECK: #[[TILED_LAYOUT:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #dram>, interleaved> +// CHECK: #[[TILED_LAYOUT:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #dram>, > module attributes {} { func.func @forward(%arg0: tensor<64x128xbf16>, %arg1: tensor<128x96xbf16>) -> tensor<64x96xbf16> { %0 = tensor.empty() : tensor<64x96xbf16> diff --git a/test/ttmlir/Dialect/TTNN/optimizer/input_layout_loc_override.mlir b/test/ttmlir/Dialect/TTNN/optimizer/input_layout_loc_override.mlir index eb3bc04956..4a4575f8d1 100644 --- a/test/ttmlir/Dialect/TTNN/optimizer/input_layout_loc_override.mlir +++ b/test/ttmlir/Dialect/TTNN/optimizer/input_layout_loc_override.mlir @@ -4,13 +4,13 @@ // CHECK-DAG: #[[LOC_MATMUL_IN0:.*]] = loc("matmul_1_in_0_layout"(#loc3)) // CHECK-DAG: #[[LOC_MATMUL_IN1:.*]] = loc("matmul_1_in_1_layout"(#loc3)) // CHECK-DAG: #[[LOC_MATMUL:.*]] = loc("matmul_1"(#loc3)) -// CHECK-DAG: #[[IN_1_LAYOUT:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<4x3x!tt.tile<32x32, bf16>, #l1_>, interleaved> +// CHECK-DAG: #[[IN_1_LAYOUT:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<4x3x!tt.tile<32x32, bf16>, #l1_>, > module attributes {} { func.func @forward(%arg0: tensor<64x128xbf16>, %arg1: tensor<128x96xbf16>) -> tensor<64x96xbf16> { %0 = tensor.empty() : tensor<64x96xbf16> loc(#loc2) // CHECK-DAG: %{{.*}} = "ttnn.to_device"{{.*}} loc(#[[LOC_MATMUL_IN0]]) - // CHECK-DAG: %{{.*}} = "ttnn.to_device"{{.*}} <{memory_config = #ttnn.memory_config<, #l1_, <<4x3>>>}> : {{.*}} -> tensor<128x96xbf16, #[[IN_1_LAYOUT]]> loc(#[[LOC_MATMUL_IN1]]) + // CHECK-DAG: %{{.*}} = "ttnn.to_device"{{.*}} <{memory_config = #ttnn.memory_config<#l1_, <<4x3>>, >}> : {{.*}} -> tensor<128x96xbf16, #[[IN_1_LAYOUT]]> loc(#[[LOC_MATMUL_IN1]]) // CHECK-DAG: %{{.*}} = "ttnn.matmul"{{.*}} loc(#[[LOC_MATMUL]]) %1 = "ttir.matmul"(%arg0, %arg1, %0) <{operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x128xbf16>, tensor<128x96xbf16>, tensor<64x96xbf16>) -> tensor<64x96xbf16> loc(#loc2) return %1 : tensor<64x96xbf16> diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/all_l1_interleaved_policy.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/all_l1_interleaved_policy.mlir index 11eb41da17..70ebaddb8d 100644 --- a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/all_l1_interleaved_policy.mlir +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/all_l1_interleaved_policy.mlir @@ -3,8 +3,8 @@ module attributes {} { func.func @forward(%arg0: tensor<64x128xbf16>, %arg1: tensor<128x96xbf16>, %arg2: tensor<64x96xbf16>, %arg3: tensor<96x32xbf16>, %arg4: tensor<64x32xbf16>) -> tensor<64x32xbf16> { // CHECK: #[[L1_:.*]] = #ttnn.buffer_type - // CHECK: #[[LAYOUT_7:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<{{.*}}, #l1_>, interleaved> - // CHECK: #[[LAYOUT_10:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<{{.*}}, #l1_>, interleaved> + // CHECK: #[[LAYOUT_7:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<{{.*}}, #l1_>, > + // CHECK: 
#[[LAYOUT_10:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<{{.*}}, #l1_>, > %0 = tensor.empty() : tensor<64x96xbf16> // CHECK: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<64x96xbf16, #[[LAYOUT_7]]> %1 = "ttir.matmul"(%arg0, %arg1, %0) <{operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x128xbf16>, tensor<128x96xbf16>, tensor<64x96xbf16>) -> tensor<64x96xbf16> diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/fork_join.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/fork_join.mlir index fef8cdd489..ca0ec90e6f 100644 --- a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/fork_join.mlir +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/fork_join.mlir @@ -22,9 +22,9 @@ module attributes {} { func.func @forward(%arg0: tensor<64x64xbf16>, %arg1: tensor<64x32xbf16>) -> tensor<64x32xbf16> { // CHECK: #[[L1_:.*]] = #ttnn.buffer_type - // CHECK: #[[LAYOUT_3:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <8x8>, memref<8x8xbf16, #dram>, interleaved> - // CHECK: #[[LAYOUT_5:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <8x8>, memref<8x4xbf16, #l1_>, interleaved> - // CHECK: #[[LAYOUT_6:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <8x8>, memref<8x8xbf16, #l1_>, interleaved> + // CHECK: #[[LAYOUT_3:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <8x8>, memref<8x8xbf16, #dram>, > + // CHECK: #[[LAYOUT_5:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <8x8>, memref<8x4xbf16, #l1_>, > + // CHECK: #[[LAYOUT_6:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <8x8>, memref<8x8xbf16, #l1_>, > %0 = tensor.empty() : tensor<64x64xbf16> // CHECK: %{{.*}} = "ttnn.relu"{{.*}} -> tensor<64x64xbf16, #[[LAYOUT_3]]> %1 = "ttir.relu"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x64xbf16>, tensor<64x64xbf16>) -> tensor<64x64xbf16> diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/mnist_l1_interleaved.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/mnist_l1_interleaved.mlir index 93a19ad6e3..a4cee76569 100644 --- a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/mnist_l1_interleaved.mlir +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/mnist_l1_interleaved.mlir @@ -4,8 +4,8 @@ module @"tt-forge-graph" attributes {} { func.func @main(%arg0: tensor<1x784xf32> loc("MNISTLinear":4294967295:0), %arg1: tensor<1x10xf32> loc("MNISTLinear":4294967295:0), %arg2: tensor<256x10xf32> loc("MNISTLinear":4294967295:0), %arg3: tensor<1x256xf32> loc("MNISTLinear":4294967295:0), %arg4: tensor<784x256xf32> loc("MNISTLinear":4294967295:0)) -> tensor<1x10xf32> { // CHECK: #[[L1_:.*]] = #ttnn.buffer_type - // CHECK: #[[LAYOUT_6:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<{{.*}}, #l1_>, interleaved> - // CHECK: #[[LAYOUT_7:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<{{.*}}, #l1_>, interleaved> + // CHECK: #[[LAYOUT_6:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<{{.*}}, #l1_>, > + // CHECK: #[[LAYOUT_7:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<{{.*}}, #l1_>, > %0 = tensor.empty() : tensor<1x256xf32> loc(#loc8) // CHECK: %[[C:.*]] = "ttnn.matmul"[[C:.*]] -> tensor<1x256xf32, #[[LAYOUT_6]]> %1 = "ttir.matmul"(%arg0, %arg4, %0) <{operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<1x784xf32>, tensor<784x256xf32>, tensor<1x256xf32>) -> tensor<1x256xf32> loc(#loc8) diff --git 
a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_ABC_l1_None.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_ABC_l1_None.mlir index acbb8d674a..74a2dc55c7 100644 --- a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_ABC_l1_None.mlir +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_ABC_l1_None.mlir @@ -13,7 +13,7 @@ #any_device = #tt.operand_constraint module attributes {} { func.func @forward(%arg0: tensor<8192x8192xbf16>, %arg1: tensor<8192x8192xbf16>, %arg2: tensor<8192x8192xbf16>, %arg3: tensor<8192x8192xbf16>) -> tensor<8192x8192xbf16> { - // CHECK-DAG: #[[LAYOUT_2:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<1024x1024xbf16, #dram>, interleaved> + // CHECK-DAG: #[[LAYOUT_2:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<1024x1024xbf16, #dram>, %0 = tensor.empty() : tensor<8192x8192xbf16> // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<8192x8192xbf16, #[[LAYOUT_2]]> %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<8192x8192xbf16>, tensor<8192x8192xbf16>, tensor<8192x8192xbf16>) -> tensor<8192x8192xbf16> diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_AB_l1_C.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_AB_l1_C.mlir index 49aebb6a4c..7b5f069640 100644 --- a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_AB_l1_C.mlir +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_AB_l1_C.mlir @@ -14,9 +14,9 @@ module attributes {} { func.func @forward(%arg0: tensor<5120x4096xbf16>, %arg1: tensor<5120x4096xbf16>, %arg2: tensor<4096x5120xbf16>, %arg3: tensor<4096x5120xbf16>) -> tensor<5120x5120xbf16> { // CHECK: #[[L1_:.*]] = #ttnn.buffer_type - // CHECK-DAG: #[[LAYOUT_4:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<512x640xbf16, #dram>, interleaved> - // CHECK-DAG: #[[LAYOUT_6:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<640x512xbf16, #dram>, interleaved> - // CHECK-DAG: #[[LAYOUT_7:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<640x640xbf16, #l1_>, interleaved> + // CHECK-DAG: #[[LAYOUT_4:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<512x640xbf16, #dram>, + // CHECK-DAG: #[[LAYOUT_6:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<640x512xbf16, #dram>, + // CHECK-DAG: #[[LAYOUT_7:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<640x640xbf16, #l1_>, %0 = tensor.empty() : tensor<5120x4096xbf16> // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<5120x4096xbf16, #[[LAYOUT_6]]> %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<5120x4096xbf16>, tensor<5120x4096xbf16>, tensor<5120x4096xbf16>) -> tensor<5120x4096xbf16> diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_AC_l1_B.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_AC_l1_B.mlir index 7f41675cd4..edc2182a73 100644 --- a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_AC_l1_B.mlir +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_AC_l1_B.mlir @@ -14,8 +14,8 @@ module attributes {} { func.func 
@forward(%arg0: tensor<4096x5120xbf16>, %arg1: tensor<4096x5120xbf16>, %arg2: tensor<5120x5120xbf16>, %arg3: tensor<5120x5120xbf16>) -> tensor<4096x5120xbf16> { // CHECK: #[[L1_:.*]] = #ttnn.buffer_type - // CHECK-DAG: #[[LAYOUT_3:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<512x640xbf16, #dram>, interleaved> - // CHECK-DAG: #[[LAYOUT_5:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<640x640xbf16, #l1_>, interleaved> + // CHECK-DAG: #[[LAYOUT_3:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<512x640xbf16, #dram>, + // CHECK-DAG: #[[LAYOUT_5:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<640x640xbf16, #l1_>, %0 = tensor.empty() : tensor<4096x5120xbf16> // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<4096x5120xbf16, #[[LAYOUT_3]]> %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<4096x5120xbf16>, tensor<4096x5120xbf16>, tensor<4096x5120xbf16>) -> tensor<4096x5120xbf16> diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_A_l1_BC.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_A_l1_BC.mlir index 7d4c923b43..b5715b5a13 100644 --- a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_A_l1_BC.mlir +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_A_l1_BC.mlir @@ -14,8 +14,8 @@ module attributes {} { func.func @forward(%arg0: tensor<2048x2048xbf16>, %arg1: tensor<2048x2048xbf16>, %arg2: tensor<2048x8192xbf16>, %arg3: tensor<2048x8192xbf16>) -> tensor<2048x8192xbf16> { // CHECK: #[[L1_:.*]] = #ttnn.buffer_type - // CHECK-DAG: #[[LAYOUT_3:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<256x256xbf16, #dram>, interleaved> - // CHECK-DAG: #[[LAYOUT_5:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<256x1024xbf16, #l1_>, interleaved> + // CHECK-DAG: #[[LAYOUT_3:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<256x256xbf16, #dram>, + // CHECK-DAG: #[[LAYOUT_5:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<256x1024xbf16, #l1_>, %0 = tensor.empty() : tensor<2048x2048xbf16> // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<2048x2048xbf16, #[[LAYOUT_3]]> %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<2048x2048xbf16>, tensor<2048x2048xbf16>, tensor<2048x2048xbf16>) -> tensor<2048x2048xbf16> diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_BC_l1_A.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_BC_l1_A.mlir index c915fadd1c..43a2c1d8da 100644 --- a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_BC_l1_A.mlir +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_BC_l1_A.mlir @@ -14,8 +14,8 @@ module attributes {} { func.func @forward(%arg0: tensor<5120x5120xbf16>, %arg1: tensor<5120x5120xbf16>, %arg2: tensor<5120x4096xbf16>, %arg3: tensor<5120x4096xbf16>) -> tensor<5120x4096xbf16> { // CHECK: #[[L1_:.*]] = #ttnn.buffer_type - // CHECK-DAG: #[[LAYOUT_3:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<640x512xbf16, #dram>, interleaved> - // CHECK-DAG: #[[LAYOUT_5:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<640x640xbf16, #l1_>, interleaved> + // CHECK-DAG: #[[LAYOUT_3:.*]] = 
#ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<640x512xbf16, #dram>, + // CHECK-DAG: #[[LAYOUT_5:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<640x640xbf16, #l1_>, %0 = tensor.empty() : tensor<5120x5120xbf16> // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<5120x5120xbf16, #[[LAYOUT_5]]> %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<5120x5120xbf16>, tensor<5120x5120xbf16>, tensor<5120x5120xbf16>) -> tensor<5120x5120xbf16> diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_B_l1_AC.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_B_l1_AC.mlir index 3d2538e245..f32f6a5afe 100644 --- a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_B_l1_AC.mlir +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_B_l1_AC.mlir @@ -14,8 +14,8 @@ module attributes {} { func.func @forward(%arg0: tensor<8192x2048xbf16>, %arg1: tensor<8192x2048xbf16>, %arg2: tensor<2048x2048xbf16>, %arg3: tensor<2048x2048xbf16>) -> tensor<8192x2048xbf16> { // CHECK: #[[L1_:.*]] = #ttnn.buffer_type - // CHECK-DAG: #[[LAYOUT_3:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<256x256xbf16, #dram>, interleaved> - // CHECK-DAG: #[[LAYOUT_5:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<1024x256xbf16, #l1_>, interleaved> + // CHECK-DAG: #[[LAYOUT_3:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<256x256xbf16, #dram>, + // CHECK-DAG: #[[LAYOUT_5:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<1024x256xbf16, #l1_>, %0 = tensor.empty() : tensor<8192x2048xbf16> // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<8192x2048xbf16, #[[LAYOUT_5]]> %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<8192x2048xbf16>, tensor<8192x2048xbf16>, tensor<8192x2048xbf16>) -> tensor<8192x2048xbf16> diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_C_l1_AB.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_C_l1_AB.mlir index 320f00ce3c..4c3358368e 100644 --- a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_C_l1_AB.mlir +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_C_l1_AB.mlir @@ -14,9 +14,9 @@ module attributes {} { func.func @forward(%arg0: tensor<2048x8192xbf16>, %arg1: tensor<2048x8192xbf16>, %arg2: tensor<8192x2048xbf16>, %arg3: tensor<8192x2048xbf16>) -> tensor<2048x2048xbf16> { // CHECK: #[[L1_:.*]] = #ttnn.buffer_type - // CHECK-DAG: #[[LAYOUT_4:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<256x1024xbf16, #l1_>, interleaved> - // CHECK-DAG: #[[LAYOUT_6:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<1024x256xbf16, #l1_>, interleaved> - // CHECK-DAG: #[[LAYOUT_7:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<256x256xbf16, #dram>, interleaved> + // CHECK-DAG: #[[LAYOUT_4:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<256x1024xbf16, #l1_>, + // CHECK-DAG: #[[LAYOUT_6:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<1024x256xbf16, #l1_>, + // CHECK-DAG: #[[LAYOUT_7:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<256x256xbf16, #dram>, %0 = tensor.empty() : tensor<2048x8192xbf16> // CHECK-DAG: 
%{{.*}} = "ttnn.add"{{.*}} -> tensor<2048x8192xbf16, #[[LAYOUT_4]]> %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<2048x8192xbf16>, tensor<2048x8192xbf16>, tensor<2048x8192xbf16>) -> tensor<2048x8192xbf16> diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_None_l1_ABC.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_None_l1_ABC.mlir index a21a11f879..bf441ffbea 100644 --- a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_None_l1_ABC.mlir +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_None_l1_ABC.mlir @@ -14,7 +14,7 @@ module attributes {} { func.func @forward(%arg0: tensor<32x32xbf16>, %arg1: tensor<32x32xbf16>, %arg2: tensor<32x32xbf16>, %arg3: tensor<32x32xbf16>) -> tensor<32x32xbf16> { // CHECK: #[[L1_:.*]] = #ttnn.buffer_type - // CHECK-DAG: #[[LAYOUT_2:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<4x4xbf16, #l1_>, interleaved> + // CHECK-DAG: #[[LAYOUT_2:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<4x4xbf16, #l1_>, %0 = tensor.empty() : tensor<32x32xbf16> // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<32x32xbf16, #[[LAYOUT_2]]> %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<32x32xbf16>, tensor<32x32xbf16>, tensor<32x32xbf16>) -> tensor<32x32xbf16> diff --git a/test/ttmlir/Dialect/TTNN/optimizer/mnist_sharding.mlir b/test/ttmlir/Dialect/TTNN/optimizer/mnist_sharding.mlir index 9e6aa15cea..55f3a60548 100644 --- a/test/ttmlir/Dialect/TTNN/optimizer/mnist_sharding.mlir +++ b/test/ttmlir/Dialect/TTNN/optimizer/mnist_sharding.mlir @@ -3,8 +3,8 @@ #loc = loc("MNISTLinear":4294967295:0) module @"tt-forge-graph" attributes {} { func.func @main(%arg0: tensor<1x784xf32> loc("MNISTLinear":4294967295:0), %arg1: tensor<1x10xf32> loc("MNISTLinear":4294967295:0), %arg2: tensor<256x10xf32> loc("MNISTLinear":4294967295:0), %arg3: tensor<1x256xf32> loc("MNISTLinear":4294967295:0), %arg4: tensor<784x256xf32> loc("MNISTLinear":4294967295:0)) -> tensor<1x10xf32> { - // CHECK: #[[LAYOUT_10:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x8>, memref<1x32xf32, #l1_>, width_sharded> - // CHECK: #[[LAYOUT_11:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x10xf32, #l1_>, width_sharded> + // CHECK: #[[LAYOUT_10:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x8>, memref<1x32xf32, #l1_>, > + // CHECK: #[[LAYOUT_11:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x10xf32, #l1_>, > %0 = tensor.empty() : tensor<1x256xf32> loc(#loc8) // CHECK: %[[C:.*]] = "ttnn.matmul"[[C:.*]] -> tensor<1x256xf32, #[[LAYOUT_10]]> %1 = "ttir.matmul"(%arg0, %arg4, %0) <{operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<1x784xf32>, tensor<784x256xf32>, tensor<1x256xf32>) -> tensor<1x256xf32> loc(#loc8) diff --git a/test/ttmlir/Dialect/TTNN/optimizer/multiple_add_with_loc.mlir b/test/ttmlir/Dialect/TTNN/optimizer/multiple_add_with_loc.mlir index 5dba3ac5d4..80c44648a7 100644 --- a/test/ttmlir/Dialect/TTNN/optimizer/multiple_add_with_loc.mlir +++ b/test/ttmlir/Dialect/TTNN/optimizer/multiple_add_with_loc.mlir @@ -3,7 +3,7 @@ #loc = loc("test_ops.py:17_0_0":0:0) module attributes {} { func.func @main(%arg0: tensor<1x32x32xf32> loc("test_ops.py:17_0_0":0:0), %arg1: tensor<1x32x32xf32> loc("test_ops.py:17_0_0":0:0), 
%arg2: tensor<1x32x32xf32> loc("test_ops.py:17_0_0":0:0)) -> (tensor<1x32x32xf32>, tensor<1x32x32xf32>) { - // CHECK: #[[LAYOUT_2:.*]] = #ttnn.ttnn_layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), <8x8>, memref<4x4xf32, #dram>, interleaved> + // CHECK: #[[LAYOUT_2:.*]] = #ttnn.ttnn_layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), <8x8>, memref<4x4xf32, #dram>, > %0 = tensor.empty() : tensor<1x32x32xf32> loc(#loc5) // CHECK: %[[C:.*]] = "ttnn.add"[[C:.*]] -> tensor<1x32x32xf32, #[[LAYOUT_2]]> %1 = "ttir.add"(%arg1, %arg2, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<1x32x32xf32>, tensor<1x32x32xf32>, tensor<1x32x32xf32>) -> tensor<1x32x32xf32> loc(#loc5) diff --git a/test/ttmlir/Dialect/TTNN/optimizer/multiple_add_with_loc_input_layout_override.mlir b/test/ttmlir/Dialect/TTNN/optimizer/multiple_add_with_loc_input_layout_override.mlir index a5a5d0ba35..b492a54c13 100644 --- a/test/ttmlir/Dialect/TTNN/optimizer/multiple_add_with_loc_input_layout_override.mlir +++ b/test/ttmlir/Dialect/TTNN/optimizer/multiple_add_with_loc_input_layout_override.mlir @@ -4,8 +4,8 @@ module attributes {} { func.func @main(%arg0: tensor<1x32x32xf32> loc("test_ops.py:17_0_0":0:0), %arg1: tensor<1x32x32xf32> loc("test_ops.py:17_0_0":0:0), %arg2: tensor<1x32x32xf32> loc("test_ops.py:17_0_0":0:0)) -> tensor<1x32x32xf32> { // CHECK: #[[L1_:.*]] = #ttnn.buffer_type - // CHECK-DAG: #[[LAYOUT_1:.*]] = #ttnn.ttnn_layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), <1x1>, memref<32x32xf32, #l1_>, width_sharded> - // CHECK-DAG: #[[LAYOUT_2:.*]] = #ttnn.ttnn_layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), <1x1>, memref<32x32xf32, #dram>, interleaved> + // CHECK-DAG: #[[LAYOUT_1:.*]] = #ttnn.ttnn_layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), <1x1>, memref<32x32xf32, #l1_>, > + // CHECK-DAG: #[[LAYOUT_2:.*]] = #ttnn.ttnn_layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), <1x1>, memref<32x32xf32, #dram>, > %0 = tensor.empty() : tensor<1x32x32xf32> loc(#loc5) // CHECK: %[[C:.*]] = "ttnn.add"{{.*}} -> tensor<1x32x32xf32, #[[LAYOUT_2]]> %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<1x32x32xf32>, tensor<1x32x32xf32>, tensor<1x32x32xf32>) -> tensor<1x32x32xf32> loc(#loc5) diff --git a/test/ttmlir/Dialect/TTNN/optimizer/multiple_add_with_loc_output_layout_override.mlir b/test/ttmlir/Dialect/TTNN/optimizer/multiple_add_with_loc_output_layout_override.mlir index a527f211b7..a43c21ab61 100644 --- a/test/ttmlir/Dialect/TTNN/optimizer/multiple_add_with_loc_output_layout_override.mlir +++ b/test/ttmlir/Dialect/TTNN/optimizer/multiple_add_with_loc_output_layout_override.mlir @@ -5,9 +5,9 @@ module attributes {} { func.func @main(%arg0: tensor<1x32x32xf32> loc("test_ops.py:17_0_0":0:0), %arg1: tensor<1x32x32xf32> loc("test_ops.py:17_0_0":0:0), %arg2: tensor<1x32x32xf32> loc("test_ops.py:17_0_0":0:0)) -> (tensor<1x32x32xf32>, tensor<1x32x32xf32>) { // CHECK: #[[L1_:.*]] = #ttnn.buffer_type // CHECK: #[[LAYOUT_0:.*]] = #ttnn.ttnn_layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), <1x1>, memref<32x32xf32, #system_memory>> - // CHECK: #[[LAYOUT_1:.*]] = #ttnn.ttnn_layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), <4x4>, memref<8x8xbf16, #dram>, interleaved> - // CHECK: #[[LAYOUT_2:.*]] = #ttnn.ttnn_layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), <4x4>, memref<1x1x!tt.tile<32x32, f32>, #l1_>, interleaved> - // CHECK: #[[LAYOUT_3:.*]] = #ttnn.ttnn_layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), <8x8>, memref<4x4xf32, #dram>, interleaved> + // CHECK: 
#[[LAYOUT_1:.*]] = #ttnn.ttnn_layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), <4x4>, memref<8x8xbf16, #dram>, > + // CHECK: #[[LAYOUT_2:.*]] = #ttnn.ttnn_layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), <4x4>, memref<1x1x!tt.tile<32x32, f32>, #l1_>, > + // CHECK: #[[LAYOUT_3:.*]] = #ttnn.ttnn_layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), <8x8>, memref<4x4xf32, #dram>, > %0 = tensor.empty() : tensor<1x32x32xf32> loc(#loc5) // CHECK: %[[C:.*]] = "ttnn.add"[[C:.*]] -> tensor<1x32x32xf32, #[[LAYOUT_1]]> %1 = "ttir.add"(%arg1, %arg2, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<1x32x32xf32>, tensor<1x32x32xf32>, tensor<1x32x32xf32>) -> tensor<1x32x32xf32> loc(#loc5) diff --git a/test/ttmlir/Dialect/TTNN/optimizer/test_grid_set.mlir b/test/ttmlir/Dialect/TTNN/optimizer/test_grid_set.mlir index f02327598e..814cd0c459 100644 --- a/test/ttmlir/Dialect/TTNN/optimizer/test_grid_set.mlir +++ b/test/ttmlir/Dialect/TTNN/optimizer/test_grid_set.mlir @@ -3,18 +3,18 @@ #dram = #ttnn.buffer_type #system_memory = #ttnn.buffer_type #ttnn_layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xf32, #system_memory>> -#ttnn_layout1 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, f32>, #dram>, interleaved> -#ttnn_layout2 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xf32, #dram>, interleaved> +#ttnn_layout1 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, f32>, #dram>, > +#ttnn_layout2 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xf32, #dram>, > module attributes {tt.device = #device} { func.func @forward(%arg0: tensor<64x128xf32, #ttnn_layout>, %arg1: tensor<64x128xf32, #ttnn_layout>) -> tensor<64x128xf32, #ttnn_layout> { %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> - %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, , <<64x128>>>}> : (tensor<64x128xf32, #ttnn_layout>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> - %2 = "ttnn.to_layout"(%arg1, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, , <<64x128>>>}> : (tensor<64x128xf32, #ttnn_layout>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> - %3 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, , <<64x128>>>, shape = #ttnn.shape<64x128>}> : (!tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout2> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, <<64x128>>, >}> : (tensor<64x128xf32, #ttnn_layout>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> + %2 = "ttnn.to_layout"(%arg1, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, <<64x128>>, >}> : (tensor<64x128xf32, #ttnn_layout>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> + %3 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, <<64x128>>, >, shape = #ttnn.shape<64x128>}> : (!tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout2> %4 = "ttnn.multiply"(%1, %2, %3) <{operandSegmentSizes = array}> : (tensor<64x128xf32, #ttnn_layout1>, tensor<64x128xf32, #ttnn_layout1>, tensor<64x128xf32, #ttnn_layout2>) -> tensor<64x128xf32, #ttnn_layout2> - // CHECK: #[[LAYOUT_2:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), 
<8x8>, memref<8x16xf32, #dram>, interleaved> + // CHECK: #[[LAYOUT_2:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <8x8>, memref<8x16xf32, #dram>, > // CHECK: %{{.+}} = "ttnn.multiply"{{.+}} -> tensor<64x128xf32, #[[LAYOUT_2]]> - %5 = "ttnn.to_layout"(%4) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, , <<64x128>>>}> : (tensor<64x128xf32, #ttnn_layout2>) -> tensor<64x128xf32, #ttnn_layout> + %5 = "ttnn.to_layout"(%4) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, <<64x128>>>}> : (tensor<64x128xf32, #ttnn_layout2>) -> tensor<64x128xf32, #ttnn_layout> return %5 : tensor<64x128xf32, #ttnn_layout> } } diff --git a/test/ttmlir/Dialect/TTNN/optimizer/test_override_reshard_edges.mlir b/test/ttmlir/Dialect/TTNN/optimizer/test_override_reshard_edges.mlir index 328a26b545..16986408cd 100644 --- a/test/ttmlir/Dialect/TTNN/optimizer/test_override_reshard_edges.mlir +++ b/test/ttmlir/Dialect/TTNN/optimizer/test_override_reshard_edges.mlir @@ -3,25 +3,25 @@ #dram = #ttnn.buffer_type #system_memory = #ttnn.buffer_type #ttnn_layout = #ttnn.ttnn_layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), <1x1>, memref<32x32xf32, #system_memory>> -#ttnn_layout1 = #ttnn.ttnn_layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), <1x1>, memref<32x32xf32, #dram>, interleaved> +#ttnn_layout1 = #ttnn.ttnn_layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), <1x1>, memref<32x32xf32, #dram>, > module attributes {tt.device = #device} { func.func @main(%arg0: tensor<1x32x32xf32, #ttnn_layout>, %arg1: tensor<1x32x32xf32, #ttnn_layout>, %arg2: tensor<1x32x32xf32, #ttnn_layout>) -> tensor<1x32x32xf32, #ttnn_layout> { - // CHECK: #[[LAYOUT_1:.*]] = #ttnn.ttnn_layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), <1x1>, memref<32x32xf32, #dram>, interleaved> - // CHECK: #[[LAYOUT_2:.*]] = #ttnn.ttnn_layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), <1x1>, memref<32x32xf32, #l1_>, width_sharded> - // CHECK: #[[LAYOUT_3:.*]] = #ttnn.ttnn_layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), <8x8>, memref<4x4xf32, #dram>, interleaved> + // CHECK: #[[LAYOUT_1:.*]] = #ttnn.ttnn_layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), <1x1>, memref<32x32xf32, #dram>, > + // CHECK: #[[LAYOUT_2:.*]] = #ttnn.ttnn_layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), <1x1>, memref<32x32xf32, #l1_>, > + // CHECK: #[[LAYOUT_3:.*]] = #ttnn.ttnn_layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), <8x8>, memref<4x4xf32, #dram>, > %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> - %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, , <<32x32>>>}> : (tensor<1x32x32xf32, #ttnn_layout>, !tt.device<#device>) -> tensor<1x32x32xf32, #ttnn_layout1> - %2 = "ttnn.to_layout"(%arg1, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, , <<32x32>>>}> : (tensor<1x32x32xf32, #ttnn_layout>, !tt.device<#device>) -> tensor<1x32x32xf32, #ttnn_layout1> - %3 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, , <<32x32>>>, shape = #ttnn.shape<1x32x32>}> : (!tt.device<#device>) -> tensor<1x32x32xf32, #ttnn_layout1> loc(#loc1) + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, <<32x32>>, >}> : (tensor<1x32x32xf32, #ttnn_layout>, !tt.device<#device>) -> tensor<1x32x32xf32, #ttnn_layout1> + %2 = "ttnn.to_layout"(%arg1, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, 
memory_config = #ttnn.memory_config<, <<32x32>>, >}> : (tensor<1x32x32xf32, #ttnn_layout>, !tt.device<#device>) -> tensor<1x32x32xf32, #ttnn_layout1> + %3 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, <<32x32>>, >, shape = #ttnn.shape<1x32x32>}> : (!tt.device<#device>) -> tensor<1x32x32xf32, #ttnn_layout1> loc(#loc1) // CHECK: %[[C:.*]] = "ttnn.add"{{.*}} -> tensor<1x32x32xf32, #[[LAYOUT_1]]> %4 = "ttnn.add"(%1, %2, %3) <{operandSegmentSizes = array}> : (tensor<1x32x32xf32, #ttnn_layout1>, tensor<1x32x32xf32, #ttnn_layout1>, tensor<1x32x32xf32, #ttnn_layout1>) -> tensor<1x32x32xf32, #ttnn_layout1> loc(#loc1) - %5 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, , <<32x32>>>}> : (tensor<1x32x32xf32, #ttnn_layout>, !tt.device<#device>) -> tensor<1x32x32xf32, #ttnn_layout1> - %6 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, , <<32x32>>>, shape = #ttnn.shape<1x32x32>}> : (!tt.device<#device>) -> tensor<1x32x32xf32, #ttnn_layout1> loc(#loc2) + %5 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, <<32x32>>, >}> : (tensor<1x32x32xf32, #ttnn_layout>, !tt.device<#device>) -> tensor<1x32x32xf32, #ttnn_layout1> + %6 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, <<32x32>>, >, shape = #ttnn.shape<1x32x32>}> : (!tt.device<#device>) -> tensor<1x32x32xf32, #ttnn_layout1> loc(#loc2) // CHECK: %{{.*}} = "ttnn.to_layout"(%[[C]], %0) {{.*}} -> tensor<1x32x32xf32, #[[LAYOUT_2]]> %7 = "ttnn.add"(%4, %6, %6) <{operandSegmentSizes = array}> : (tensor<1x32x32xf32, #ttnn_layout1>, tensor<1x32x32xf32, #ttnn_layout1>, tensor<1x32x32xf32, #ttnn_layout1>) -> tensor<1x32x32xf32, #ttnn_layout1> loc(#loc2) - %8 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, , <<32x32>>>, shape = #ttnn.shape<1x32x32>}> : (!tt.device<#device>) -> tensor<1x32x32xf32, #ttnn_layout1> loc(#loc3) + %8 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, <<32x32>>, >, shape = #ttnn.shape<1x32x32>}> : (!tt.device<#device>) -> tensor<1x32x32xf32, #ttnn_layout1> loc(#loc3) %9 = "ttnn.relu"(%7, %8) <{operandSegmentSizes = array}> : (tensor<1x32x32xf32, #ttnn_layout1>, tensor<1x32x32xf32, #ttnn_layout1>) -> tensor<1x32x32xf32, #ttnn_layout1> loc(#loc3) - %10 = "ttnn.to_layout"(%9) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, , <<32x32>>>}> : (tensor<1x32x32xf32, #ttnn_layout1>) -> tensor<1x32x32xf32, #ttnn_layout> + %10 = "ttnn.to_layout"(%9) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, <<32x32>>>}> : (tensor<1x32x32xf32, #ttnn_layout1>) -> tensor<1x32x32xf32, #ttnn_layout> return %10 : tensor<1x32x32xf32, #ttnn_layout> } } diff --git a/test/ttmlir/Dialect/TTNN/optimizer/ttir_to_ttnn_pipeline.mlir b/test/ttmlir/Dialect/TTNN/optimizer/ttir_to_ttnn_pipeline.mlir index 3df46a2707..725d7b83f6 100644 --- a/test/ttmlir/Dialect/TTNN/optimizer/ttir_to_ttnn_pipeline.mlir +++ b/test/ttmlir/Dialect/TTNN/optimizer/ttir_to_ttnn_pipeline.mlir @@ -2,7 +2,7 @@ #any_device = #tt.operand_constraint module attributes {} { func.func @forward(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> 
tensor<64x128xf32> { - // CHECK: #[[LAYOUT_2:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <8x8>, memref<8x16xf32, #dram>, interleaved> + // CHECK: #[[LAYOUT_2:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <8x8>, memref<8x16xf32, #dram>, > // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] %0 = tensor.empty() : tensor<64x128xf32> // CHECK: %[[C:.*]] = "ttnn.multiply"[[C:.*]] -> tensor<64x128xf32, #[[LAYOUT_2]]> diff --git a/test/ttmlir/Dialect/TTNN/test_remove_dead_values_pass.mlir b/test/ttmlir/Dialect/TTNN/test_remove_dead_values_pass.mlir index f3231730f5..ea59aae1c0 100644 --- a/test/ttmlir/Dialect/TTNN/test_remove_dead_values_pass.mlir +++ b/test/ttmlir/Dialect/TTNN/test_remove_dead_values_pass.mlir @@ -4,65 +4,65 @@ #system_desc = #tt.system_desc<[{role = host, target_triple = "x86_64-pc-linux-gnu"}], [{arch = , grid = 8x8, l1_size = 1499136, num_dram_channels = 12, dram_channel_size = 1073741824, noc_l1_address_align_bytes = 16, pcie_address_align_bytes = 32, noc_dram_address_align_bytes = 32, l1_unreserved_base = 1024, erisc_l1_unreserved_base = 1024, dram_unreserved_base = 1024, dram_unreserved_end = 1073741824, physical_cores = {worker = [ 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 1x0, 1x1, 1x2, 1x3, 1x4, 1x5, 1x6, 1x7, 2x0, 2x1, 2x2, 2x3, 2x4, 2x5, 2x6, 2x7, 3x0, 3x1, 3x2, 3x3, 3x4, 3x5, 3x6, 3x7, 4x0, 4x1, 4x2, 4x3, 4x4, 4x5, 4x6, 4x7, 5x0, 5x1, 5x2, 5x3, 5x4, 5x5, 5x6, 5x7, 6x0, 6x1, 6x2, 6x3, 6x4, 6x5, 6x6, 6x7, 7x0, 7x1, 7x2, 7x3, 7x4, 7x5, 7x6, 7x7] dram = [ 8x0, 9x0, 10x0, 8x1, 9x1, 10x1, 8x2, 9x2, 10x2, 8x3, 9x3, 10x3]}, supported_data_types = [, , , , , , , , , , , ], supported_tile_sizes = [ 4x16, 16x16, 32x16, 4x32, 16x32, 32x32], num_cbs = 32}], [0], [3 : i32], [ 0x0x0x0]> #system_memory = #ttnn.buffer_type #ttnn_layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xf32, #system_memory>> -#ttnn_layout1 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, f32>, #dram>, interleaved> -#ttnn_layout2 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xf32, #dram>, interleaved> +#ttnn_layout1 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, f32>, #dram>, > +#ttnn_layout2 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xf32, #dram>, > module attributes {tt.device = #device, tt.system_desc = #system_desc} { func.func @forward(%arg0: tensor<64x128xf32, #ttnn_layout>, %arg1: tensor<64x128xf32, #ttnn_layout>) -> tensor<64x128xf32, #ttnn_layout> { %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> %1 = "ttnn.to_layout"(%arg0) <{layout = #ttnn.layout}> : (tensor<64x128xf32, #ttnn_layout>) -> tensor<64x128xf32, #ttnn_layout1> - %2 = "ttnn.to_device"(%1, %0) <{memory_config = #ttnn.memory_config<, #dram, <<2x4>>>}> : (tensor<64x128xf32, #ttnn_layout1>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> + %2 = "ttnn.to_device"(%1, %0) <{memory_config = #ttnn.memory_config<#dram, <<2x4>>, >}> : (tensor<64x128xf32, #ttnn_layout1>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> "ttnn.deallocate"(%1) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () %3 = "ttnn.to_layout"(%arg1) <{layout = #ttnn.layout}> : (tensor<64x128xf32, #ttnn_layout>) -> tensor<64x128xf32, #ttnn_layout1> - %4 = "ttnn.to_device"(%3, %0) <{memory_config = #ttnn.memory_config<, #dram, <<2x4>>>}> : (tensor<64x128xf32, #ttnn_layout1>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> + %4 = "ttnn.to_device"(%3, %0) <{memory_config = #ttnn.memory_config<#dram, <<2x4>>, >}> : 
(tensor<64x128xf32, #ttnn_layout1>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> "ttnn.deallocate"(%3) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () - %5 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, #dram, <<64x128>>>, shape = #ttnn.shape<64x128>}> : (!tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout2> + %5 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<64x128>>, >, shape = #ttnn.shape<64x128>}> : (!tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout2> // CHECK: %[[C:.*]] = "ttnn.multiply"[[C:.*]] %6 = "ttnn.multiply"(%2, %4, %5) <{operandSegmentSizes = array}> : (tensor<64x128xf32, #ttnn_layout1>, tensor<64x128xf32, #ttnn_layout1>, tensor<64x128xf32, #ttnn_layout2>) -> tensor<64x128xf32, #ttnn_layout2> "ttnn.deallocate"(%4) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () "ttnn.deallocate"(%2) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () %7 = "ttnn.to_layout"(%arg0) <{layout = #ttnn.layout}> : (tensor<64x128xf32, #ttnn_layout>) -> tensor<64x128xf32, #ttnn_layout1> - %8 = "ttnn.to_device"(%7, %0) <{memory_config = #ttnn.memory_config<, #dram, <<2x4>>>}> : (tensor<64x128xf32, #ttnn_layout1>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> + %8 = "ttnn.to_device"(%7, %0) <{memory_config = #ttnn.memory_config<#dram, <<2x4>>, >}> : (tensor<64x128xf32, #ttnn_layout1>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> "ttnn.deallocate"(%7) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () %9 = "ttnn.to_layout"(%arg1) <{layout = #ttnn.layout}> : (tensor<64x128xf32, #ttnn_layout>) -> tensor<64x128xf32, #ttnn_layout1> - %10 = "ttnn.to_device"(%9, %0) <{memory_config = #ttnn.memory_config<, #dram, <<2x4>>>}> : (tensor<64x128xf32, #ttnn_layout1>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> + %10 = "ttnn.to_device"(%9, %0) <{memory_config = #ttnn.memory_config<#dram, <<2x4>>, >}> : (tensor<64x128xf32, #ttnn_layout1>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> "ttnn.deallocate"(%9) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () - %11 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, #dram, <<64x128>>>, shape = #ttnn.shape<64x128>}> : (!tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout2> + %11 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<64x128>>, >, shape = #ttnn.shape<64x128>}> : (!tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout2> // CHECK-NOT: %[[C:.*]] = "ttnn.add"[[C:.*]] %12 = "ttnn.add"(%8, %10, %11) <{operandSegmentSizes = array}> : (tensor<64x128xf32, #ttnn_layout1>, tensor<64x128xf32, #ttnn_layout1>, tensor<64x128xf32, #ttnn_layout2>) -> tensor<64x128xf32, #ttnn_layout2> "ttnn.deallocate"(%11) <{force = false}> : (tensor<64x128xf32, #ttnn_layout2>) -> () "ttnn.deallocate"(%10) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () "ttnn.deallocate"(%8) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () %13 = "ttnn.to_layout"(%arg0) <{layout = #ttnn.layout}> : (tensor<64x128xf32, #ttnn_layout>) -> tensor<64x128xf32, #ttnn_layout1> - %14 = "ttnn.to_device"(%13, %0) <{memory_config = #ttnn.memory_config<, #dram, <<2x4>>>}> : (tensor<64x128xf32, #ttnn_layout1>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> + %14 = "ttnn.to_device"(%13, %0) 
<{memory_config = #ttnn.memory_config<#dram, <<2x4>>, >}> : (tensor<64x128xf32, #ttnn_layout1>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> "ttnn.deallocate"(%13) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () %15 = "ttnn.to_layout"(%arg1) <{layout = #ttnn.layout}> : (tensor<64x128xf32, #ttnn_layout>) -> tensor<64x128xf32, #ttnn_layout1> - %16 = "ttnn.to_device"(%15, %0) <{memory_config = #ttnn.memory_config<, #dram, <<2x4>>>}> : (tensor<64x128xf32, #ttnn_layout1>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> + %16 = "ttnn.to_device"(%15, %0) <{memory_config = #ttnn.memory_config<#dram, <<2x4>>, >}> : (tensor<64x128xf32, #ttnn_layout1>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> "ttnn.deallocate"(%15) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () - %17 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, #dram, <<64x128>>>, shape = #ttnn.shape<64x128>}> : (!tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout2> + %17 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<64x128>>, >, shape = #ttnn.shape<64x128>}> : (!tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout2> // CHECK-NOT: %[[C:.*]] = "ttnn.subtract"[[C:.*]] %18 = "ttnn.subtract"(%14, %16, %17) <{operandSegmentSizes = array}> : (tensor<64x128xf32, #ttnn_layout1>, tensor<64x128xf32, #ttnn_layout1>, tensor<64x128xf32, #ttnn_layout2>) -> tensor<64x128xf32, #ttnn_layout2> "ttnn.deallocate"(%17) <{force = false}> : (tensor<64x128xf32, #ttnn_layout2>) -> () "ttnn.deallocate"(%16) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () "ttnn.deallocate"(%14) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () %19 = "ttnn.to_layout"(%arg0) <{layout = #ttnn.layout}> : (tensor<64x128xf32, #ttnn_layout>) -> tensor<64x128xf32, #ttnn_layout1> - %20 = "ttnn.to_device"(%19, %0) <{memory_config = #ttnn.memory_config<, #dram, <<2x4>>>}> : (tensor<64x128xf32, #ttnn_layout1>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> + %20 = "ttnn.to_device"(%19, %0) <{memory_config = #ttnn.memory_config<#dram, <<2x4>>, >}> : (tensor<64x128xf32, #ttnn_layout1>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> "ttnn.deallocate"(%19) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () %21 = "ttnn.to_layout"(%arg1) <{layout = #ttnn.layout}> : (tensor<64x128xf32, #ttnn_layout>) -> tensor<64x128xf32, #ttnn_layout1> - %22 = "ttnn.to_device"(%21, %0) <{memory_config = #ttnn.memory_config<, #dram, <<2x4>>>}> : (tensor<64x128xf32, #ttnn_layout1>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> + %22 = "ttnn.to_device"(%21, %0) <{memory_config = #ttnn.memory_config<#dram, <<2x4>>, >}> : (tensor<64x128xf32, #ttnn_layout1>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> "ttnn.deallocate"(%21) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () - %23 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, #dram, <<64x128>>>, shape = #ttnn.shape<64x128>}> : (!tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout2> + %23 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<64x128>>, >, shape = #ttnn.shape<64x128>}> : (!tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout2> // CHECK-NOT: %[[C:.*]] = "ttnn.div"[[C:.*]] %24 = "ttnn.div"(%20, %22, %23) <{operandSegmentSizes 
= array}> : (tensor<64x128xf32, #ttnn_layout1>, tensor<64x128xf32, #ttnn_layout1>, tensor<64x128xf32, #ttnn_layout2>) -> tensor<64x128xf32, #ttnn_layout2> "ttnn.deallocate"(%23) <{force = false}> : (tensor<64x128xf32, #ttnn_layout2>) -> () "ttnn.deallocate"(%22) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () "ttnn.deallocate"(%20) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () %25 = "ttnn.to_layout"(%arg0) <{layout = #ttnn.layout}> : (tensor<64x128xf32, #ttnn_layout>) -> tensor<64x128xf32, #ttnn_layout1> - %26 = "ttnn.to_device"(%25, %0) <{memory_config = #ttnn.memory_config<, #dram, <<2x4>>>}> : (tensor<64x128xf32, #ttnn_layout1>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> + %26 = "ttnn.to_device"(%25, %0) <{memory_config = #ttnn.memory_config<#dram, <<2x4>>, >}> : (tensor<64x128xf32, #ttnn_layout1>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> "ttnn.deallocate"(%25) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () %27 = "ttnn.to_layout"(%arg1) <{layout = #ttnn.layout}> : (tensor<64x128xf32, #ttnn_layout>) -> tensor<64x128xf32, #ttnn_layout1> - %28 = "ttnn.to_device"(%27, %0) <{memory_config = #ttnn.memory_config<, #dram, <<2x4>>>}> : (tensor<64x128xf32, #ttnn_layout1>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> + %28 = "ttnn.to_device"(%27, %0) <{memory_config = #ttnn.memory_config<#dram, <<2x4>>, >}> : (tensor<64x128xf32, #ttnn_layout1>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout1> "ttnn.deallocate"(%27) <{force = false}> : (tensor<64x128xf32, #ttnn_layout1>) -> () - %29 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, #dram, <<64x128>>>, shape = #ttnn.shape<64x128>}> : (!tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout2> + %29 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<64x128>>, >, shape = #ttnn.shape<64x128>}> : (!tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout2> // CHECK-NOT: %[[C:.*]] = "ttnn.eq"[[C:.*]] %30 = "ttnn.eq"(%26, %28, %29) <{operandSegmentSizes = array}> : (tensor<64x128xf32, #ttnn_layout1>, tensor<64x128xf32, #ttnn_layout1>, tensor<64x128xf32, #ttnn_layout2>) -> tensor<64x128xf32, #ttnn_layout2> "ttnn.deallocate"(%29) <{force = false}> : (tensor<64x128xf32, #ttnn_layout2>) -> () diff --git a/test/ttmlir/Dialect/TTNN/ttir_to_ttnn_pipeline_custom_opt.mlir b/test/ttmlir/Dialect/TTNN/ttir_to_ttnn_pipeline_custom_opt.mlir index d1e846bd6a..112a941a81 100644 --- a/test/ttmlir/Dialect/TTNN/ttir_to_ttnn_pipeline_custom_opt.mlir +++ b/test/ttmlir/Dialect/TTNN/ttir_to_ttnn_pipeline_custom_opt.mlir @@ -2,7 +2,7 @@ #any_device = #tt.operand_constraint module attributes {} { func.func @forward(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> { - // CHECK: #[[LAYOUT_1:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xf32, #dram>, interleaved> + // CHECK: #[[LAYOUT_1:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xf32, #dram>, > // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] %0 = tensor.empty() : tensor<64x128xf32> // CHECK: %[[C:.*]] = "ttnn.multiply"[[C:.*]] -> tensor<64x128xf32, #[[LAYOUT_1:.*]]> diff --git a/test/ttmlir/Runtime/TTNN/runtime_stitching/eltwise_binary_op_chain.mlir b/test/ttmlir/Runtime/TTNN/runtime_stitching/eltwise_binary_op_chain.mlir index 97690df780..35b4d90634 100644 --- a/test/ttmlir/Runtime/TTNN/runtime_stitching/eltwise_binary_op_chain.mlir +++ 
b/test/ttmlir/Runtime/TTNN/runtime_stitching/eltwise_binary_op_chain.mlir @@ -6,15 +6,15 @@ #system_memory = #ttnn.buffer_type #dram = #ttnn.buffer_type #ttnn_layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xbf16, #system_memory>> -#ttnn_layout1 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #dram>, interleaved> -#ttnn_layout2 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xbf16, #dram>, interleaved> +#ttnn_layout1 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #dram>, > +#ttnn_layout2 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xbf16, #dram>, > module attributes {tt.device = #device} { func.func @add(%arg0: tensor<64x128xbf16, #ttnn_layout1>, %arg1: tensor<64x128xbf16, #ttnn_layout1>) -> tensor<64x128xbf16, #ttnn_layout> { %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> %1 = "ttnn.to_layout"(%arg0) <{layout = #ttnn.layout}> : (tensor<64x128xbf16, #ttnn_layout1>) -> tensor<64x128xbf16, #ttnn_layout1> %2 = "ttnn.to_layout"(%arg1) <{layout = #ttnn.layout}> : (tensor<64x128xbf16, #ttnn_layout1>) -> tensor<64x128xbf16, #ttnn_layout1> - %3 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, #dram, <<64x128>>>, shape = #ttnn.shape<64x128>}> : (!tt.device<#device>) -> tensor<64x128xbf16, #ttnn_layout2> + %3 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<64x128>>, >, shape = #ttnn.shape<64x128>}> : (!tt.device<#device>) -> tensor<64x128xbf16, #ttnn_layout2> %4 = "ttnn.add"(%1, %2, %3) <{operandSegmentSizes = array}> : (tensor<64x128xbf16, #ttnn_layout1>, tensor<64x128xbf16, #ttnn_layout1>, tensor<64x128xbf16, #ttnn_layout2>) -> tensor<64x128xbf16, #ttnn_layout2> %5 = "ttnn.from_device"(%4) : (tensor<64x128xbf16, #ttnn_layout2>) -> tensor<64x128xbf16, #ttnn_layout> %6 = "ttnn.to_layout"(%5) <{layout = #ttnn.layout}> : (tensor<64x128xbf16, #ttnn_layout>) -> tensor<64x128xbf16, #ttnn_layout> @@ -27,7 +27,7 @@ module attributes {tt.device = #device} { %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> %1 = "ttnn.to_layout"(%arg0) <{layout = #ttnn.layout}> : (tensor<64x128xbf16, #ttnn_layout1>) -> tensor<64x128xbf16, #ttnn_layout1> %2 = "ttnn.to_layout"(%arg1) <{layout = #ttnn.layout}> : (tensor<64x128xbf16, #ttnn_layout1>) -> tensor<64x128xbf16, #ttnn_layout1> - %3 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, #dram, <<64x128>>>, shape = #ttnn.shape<64x128>}> : (!tt.device<#device>) -> tensor<64x128xbf16, #ttnn_layout2> + %3 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<64x128>>, >, shape = #ttnn.shape<64x128>}> : (!tt.device<#device>) -> tensor<64x128xbf16, #ttnn_layout2> %4 = "ttnn.multiply"(%1, %2, %3) <{operandSegmentSizes = array}> : (tensor<64x128xbf16, #ttnn_layout1>, tensor<64x128xbf16, #ttnn_layout1>, tensor<64x128xbf16, #ttnn_layout2>) -> tensor<64x128xbf16, #ttnn_layout2> %5 = "ttnn.from_device"(%4) : (tensor<64x128xbf16, #ttnn_layout2>) -> tensor<64x128xbf16, #ttnn_layout> %6 = "ttnn.to_layout"(%5) <{layout = #ttnn.layout}> : (tensor<64x128xbf16, #ttnn_layout>) -> tensor<64x128xbf16, #ttnn_layout> @@ -40,7 +40,7 @@ module attributes {tt.device = #device} { %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> %1 = 
"ttnn.to_layout"(%arg0) <{layout = #ttnn.layout}> : (tensor<64x128xbf16, #ttnn_layout1>) -> tensor<64x128xbf16, #ttnn_layout1> %2 = "ttnn.to_layout"(%arg1) <{layout = #ttnn.layout}> : (tensor<64x128xbf16, #ttnn_layout1>) -> tensor<64x128xbf16, #ttnn_layout1> - %3 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<, #dram, <<64x128>>>, shape = #ttnn.shape<64x128>}> : (!tt.device<#device>) -> tensor<64x128xbf16, #ttnn_layout2> + %3 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<64x128>>, >, shape = #ttnn.shape<64x128>}> : (!tt.device<#device>) -> tensor<64x128xbf16, #ttnn_layout2> %4 = "ttnn.subtract"(%1, %2, %3) <{operandSegmentSizes = array}> : (tensor<64x128xbf16, #ttnn_layout1>, tensor<64x128xbf16, #ttnn_layout1>, tensor<64x128xbf16, #ttnn_layout2>) -> tensor<64x128xbf16, #ttnn_layout2> %5 = "ttnn.from_device"(%4) : (tensor<64x128xbf16, #ttnn_layout2>) -> tensor<64x128xbf16, #ttnn_layout> %6 = "ttnn.to_layout"(%5) <{layout = #ttnn.layout}> : (tensor<64x128xbf16, #ttnn_layout>) -> tensor<64x128xbf16, #ttnn_layout> diff --git a/test/ttmlir/Silicon/TTNN/emitc/simple_add.mlir b/test/ttmlir/Silicon/TTNN/emitc/simple_add.mlir index 84e424cbc8..33645730ab 100644 --- a/test/ttmlir/Silicon/TTNN/emitc/simple_add.mlir +++ b/test/ttmlir/Silicon/TTNN/emitc/simple_add.mlir @@ -1,7 +1,7 @@ // RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir // RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn -#any_device = #tt.operand_constraint +#any_device = #tt.operand_constraint func.func @add(%arg0: tensor<32x32xbf16>, %arg1: tensor<32x32xbf16>) -> tensor<32x32xbf16> { %0 = tensor.empty() : tensor<32x32xbf16> diff --git a/test/ttmlir/Silicon/TTNN/optimizer/mnist_sharding_tiled.mlir b/test/ttmlir/Silicon/TTNN/optimizer/mnist_sharding_tiled.mlir index 938ec9709d..cf5a5b9553 100644 --- a/test/ttmlir/Silicon/TTNN/optimizer/mnist_sharding_tiled.mlir +++ b/test/ttmlir/Silicon/TTNN/optimizer/mnist_sharding_tiled.mlir @@ -5,8 +5,8 @@ #loc = loc("MNISTLinear":4294967295:0) module @"tt-forge-graph" attributes {} { func.func @main(%arg0: tensor<32x784xf32> loc("MNISTLinear":4294967295:0), %arg1: tensor<32xf32> loc("MNISTLinear":4294967295:0), %arg2: tensor<256x32xf32> loc("MNISTLinear":4294967295:0), %arg3: tensor<256xf32> loc("MNISTLinear":4294967295:0), %arg4: tensor<784x256xf32> loc("MNISTLinear":4294967295:0)) -> tensor<32x32xf32> { - // CHECK-DAG: #[[LAYOUT_1:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x8>, memref<32x32xf32, #l1_>, width_sharded> - // CHECK-DAG: #[[LAYOUT_2:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<32x32xf32, #l1_>, width_sharded> + // CHECK-DAG: #[[LAYOUT_1:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x8>, memref<32x32xf32, #l1_>, > + // CHECK-DAG: #[[LAYOUT_2:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<32x32xf32, #l1_>, > %0 = tensor.empty() : tensor<32x256xf32> loc(#loc8) // CHECK: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<32x256xf32, #[[LAYOUT_1]]> %1 = "ttir.matmul"(%arg0, %arg4, %0) <{operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<32x784xf32>, tensor<784x256xf32>, tensor<32x256xf32>) -> tensor<32x256xf32> loc(#loc8) diff --git a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_matmul.mlir b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_matmul.mlir index e1c672a6ec..9c240b0ab7 100644 --- 
a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_matmul.mlir +++ b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_matmul.mlir @@ -2,7 +2,7 @@ // RUN: FileCheck %s --input-file=%t.mlir // RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn #any_device_tile = #tt.operand_constraint -// CHECK: #[[TILED_LAYOUT:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #dram>, interleaved> +// CHECK: #[[TILED_LAYOUT:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #dram>, > module attributes {} { func.func @forward(%arg0: tensor<64x128xbf16>, %arg1: tensor<128x96xbf16>) -> tensor<64x96xbf16> { %0 = tensor.empty() : tensor<64x96xbf16> diff --git a/test/ttmlir/Silicon/TTNN/simple_matmul.mlir b/test/ttmlir/Silicon/TTNN/simple_matmul.mlir index e1c672a6ec..9c240b0ab7 100644 --- a/test/ttmlir/Silicon/TTNN/simple_matmul.mlir +++ b/test/ttmlir/Silicon/TTNN/simple_matmul.mlir @@ -2,7 +2,7 @@ // RUN: FileCheck %s --input-file=%t.mlir // RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn #any_device_tile = #tt.operand_constraint -// CHECK: #[[TILED_LAYOUT:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #dram>, interleaved> +// CHECK: #[[TILED_LAYOUT:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #dram>, > module attributes {} { func.func @forward(%arg0: tensor<64x128xbf16>, %arg1: tensor<128x96xbf16>) -> tensor<64x96xbf16> { %0 = tensor.empty() : tensor<64x96xbf16> diff --git a/test/unittests/Optimizer/TestL1InterleavedPolicy.cpp b/test/unittests/Optimizer/TestL1InterleavedPolicy.cpp index 7d02cef56f..b09b65245d 100644 --- a/test/unittests/Optimizer/TestL1InterleavedPolicy.cpp +++ b/test/unittests/Optimizer/TestL1InterleavedPolicy.cpp @@ -84,16 +84,18 @@ class L1InterleavedPolicyBase : public ::testing::Test { std::vector> &legalLayouts, BufferType memorySpace, TensorMemoryLayout tensorMemoryLayout) { + TensorMemoryLayoutAttr tensorMemoryLayoutAttr = + TensorMemoryLayoutAttr::get(&context, tensorMemoryLayout); if (legalLayouts.find(op) == legalLayouts.end()) { legalLayouts[op] = std::vector{TTNNLayoutAttr::get( &context, getTensorRankedType().getShape(), builder.getF32Type(), memorySpace, mlir::tt::GridAttr::get(&context, {8, 8}), - tensorMemoryLayout)}; + tensorMemoryLayoutAttr)}; } else { legalLayouts[op].push_back(TTNNLayoutAttr::get( &context, getTensorRankedType().getShape(), builder.getF32Type(), memorySpace, mlir::tt::GridAttr::get(&context, {8, 8}), - tensorMemoryLayout)); + tensorMemoryLayoutAttr)); } } diff --git a/test/unittests/Optimizer/TestShardSolver.cpp b/test/unittests/Optimizer/TestShardSolver.cpp index c2f73b8008..c43eacce71 100644 --- a/test/unittests/Optimizer/TestShardSolver.cpp +++ b/test/unittests/Optimizer/TestShardSolver.cpp @@ -94,13 +94,15 @@ class ShardSolverBase : public ::testing::Test { &context, getTensorRankedType().getShape(), builder.getF32Type(), memorySpace, mlir::tt::GridAttr::get(&context, {gridWidth, gridHeight}), - tensorMemoryLayout)}; + mlir::tt::ttnn::TensorMemoryLayoutAttr::get(&context, + tensorMemoryLayout))}; } else { legalLayouts[op].push_back(TTNNLayoutAttr::get( &context, getTensorRankedType().getShape(), builder.getF32Type(), memorySpace, mlir::tt::GridAttr::get(&context, {gridWidth, gridHeight}), - tensorMemoryLayout)); + mlir::tt::ttnn::TensorMemoryLayoutAttr::get(&context, + tensorMemoryLayout))); } } From bb1d960e9496ac07e0fa3396b3be371acf2ec81e Mon Sep 17 00:00:00 2001 From: Vladimir 
Milosevic <157983820+vmilosevic@users.noreply.github.com> Date: Fri, 6 Dec 2024 16:05:09 +0100 Subject: [PATCH 63/84] Uplift third_party/tt-metal to 4952f01a39068cba4e85c93cf34c4f95f10f5c34 2024-12-06 (#1522) * Uplift third_party/tt-metal to 4952f01a39068cba4e85c93cf34c4f95f10f5c34 2024-12-06 * remove libnng and libuv dependencies, they are now part of libdevice.so --------- Co-authored-by: kmitrovicTT <169657397+kmitrovicTT@users.noreply.github.com> Co-authored-by: Bezulj Marko --- runtime/tools/python/setup.py | 2 +- third_party/CMakeLists.txt | 2 +- tools/ttnn-standalone/CMakeLists.txt | 3 --- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/runtime/tools/python/setup.py b/runtime/tools/python/setup.py index e227835029..d754250e01 100644 --- a/runtime/tools/python/setup.py +++ b/runtime/tools/python/setup.py @@ -80,7 +80,7 @@ linklibs += ["TTRuntimeTTMetal", "tt_metal"] if enable_ttnn or enable_ttmetal: - runlibs += ["libdevice.so", "libnng.so.1", "libuv.so.1"] + runlibs += ["libdevice.so"] linklibs += ["TTRuntimeSysDesc", "TTRuntimeDebug", "TTRuntimeWorkarounds"] if enable_perf: diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt index e6b7b5cd51..d1b8af59a1 100644 --- a/third_party/CMakeLists.txt +++ b/third_party/CMakeLists.txt @@ -1,6 +1,6 @@ include(ExternalProject) -set(TT_METAL_VERSION "3389120b3747b521fa9b7ef333a379554359d961") +set(TT_METAL_VERSION "4952f01a39068cba4e85c93cf34c4f95f10f5c34") if ("$ENV{ARCH_NAME}" STREQUAL "grayskull") set(ARCH_NAME "grayskull") diff --git a/tools/ttnn-standalone/CMakeLists.txt b/tools/ttnn-standalone/CMakeLists.txt index 7de585bf0a..bc22737ee9 100644 --- a/tools/ttnn-standalone/CMakeLists.txt +++ b/tools/ttnn-standalone/CMakeLists.txt @@ -94,9 +94,6 @@ set(LINK_LIBS # The below libs have been added to tt-metal repo at some point, but are not # currently needed by the targets here - leaving them commented here for # reference - # - # nng - # uv # TTNN # _ttnn # Why doesn't this work? 
From e052baed639ba92e9b7865c695d5d7e5a9f873f9 Mon Sep 17 00:00:00 2001 From: Guangyu Feng <157328249+gfengTT@users.noreply.github.com> Date: Fri, 6 Dec 2024 11:10:37 -0500 Subject: [PATCH 64/84] Fix TTIR to TTNN conversion for all gather (#1182) --- lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp | 10 ++-------- test/ttmlir/Dialect/TTNN/ccl/all_gather.mlir | 1 - 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp index bf216d3629..d77d095acc 100644 --- a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp +++ b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp @@ -905,15 +905,9 @@ class AllGatherOpConversionPattern LogicalResult matchAndRewrite(ttir::AllGatherOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - RankedTensorType type = - mlir::cast(adaptor.getInput().getType()); - Value device = ::ttnn::utils::getOrInsertDevice(rewriter, op); - tensor::EmptyOp emptyOp = rewriter.create( - op.getLoc(), this->getTypeConverter()->convertType(type), device); - rewriter.replaceOpWithNewOp( - op, this->getTypeConverter()->convertType(op.getType()), emptyOp, - adaptor.getDim()); + op, this->getTypeConverter()->convertType(op.getType()), + adaptor.getInput(), adaptor.getDim()); return success(); } }; diff --git a/test/ttmlir/Dialect/TTNN/ccl/all_gather.mlir b/test/ttmlir/Dialect/TTNN/ccl/all_gather.mlir index f1f5a5965c..cb2a7ad2b3 100644 --- a/test/ttmlir/Dialect/TTNN/ccl/all_gather.mlir +++ b/test/ttmlir/Dialect/TTNN/ccl/all_gather.mlir @@ -2,7 +2,6 @@ #any_device = #tt.operand_constraint module attributes {} { func.func @forward(%arg0: tensor<1x1x32x32xbf16>) -> tensor<1x1x32x128xbf16> { - // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] %0 = tensor.empty() : tensor<1x1x32x128xbf16> // CHECK: %[[C:.*]] = "ttnn.all_gather"[[C:.*]] %1 = "ttir.all_gather"(%arg0, %0) <{dim = 3 : si32, operand_constraints = [#any_device, #any_device]}> : (tensor<1x1x32x32xbf16>, tensor<1x1x32x128xbf16>) -> tensor<1x1x32x128xbf16> From 58605e094d74c704cb19389b2960702e1519ee85 Mon Sep 17 00:00:00 2001 From: Stefan Gligorijevic <189116645+sgligorijevicTT@users.noreply.github.com> Date: Fri, 6 Dec 2024 17:44:35 +0100 Subject: [PATCH 65/84] Add TOSA to TTIR conversion for matmul (#1446) * Add TOSA conversion for matmul * Refactor test * Check that quantization isn't used and improve tests * cleanup --- .../TosaToTTIR/TosaToTTIRPatterns.cpp | 48 +++++++++++++++++++ .../Conversion/TosaToTTIR/matmul_op.mlir | 11 +++++ 2 files changed, 59 insertions(+) create mode 100644 test/ttmlir/Conversion/TosaToTTIR/matmul_op.mlir diff --git a/lib/Conversion/TosaToTTIR/TosaToTTIRPatterns.cpp b/lib/Conversion/TosaToTTIR/TosaToTTIRPatterns.cpp index 7ab3ed5d27..23726c21b7 100644 --- a/lib/Conversion/TosaToTTIR/TosaToTTIRPatterns.cpp +++ b/lib/Conversion/TosaToTTIR/TosaToTTIRPatterns.cpp @@ -81,6 +81,47 @@ class TosaToTTIRMultiplyOpConversionPattern } }; +class TosaToTTIRMatmulOpConversionPattern + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + using Adaptor = tosa::MatMulOp::Adaptor; + +public: + LogicalResult + matchAndRewrite(tosa::MatMulOp srcOp, Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + LogicalResult legalityResult = + checkConversionLegality(srcOp, adaptor, rewriter); + if (!legalityResult.succeeded()) { + return legalityResult; + } + auto outputType = mlir::cast(srcOp.getResult().getType()); + auto outputTensor = rewriter.create( + srcOp.getLoc(), outputType.getShape(), 
outputType.getElementType()); + auto operands = adaptor.getOperands(); + + rewriter.replaceOpWithNewOp( + srcOp, TypeRange(outputTensor.getType()), operands[0], operands[1], + outputTensor, + rewriter.getArrayAttr( + SmallVector(adaptor.getOperands().size() + 1, + rewriter.getAttr( + OperandConstraint::AnyDeviceTile)))); + return success(); + } + +private: + LogicalResult + checkConversionLegality(tosa::MatMulOp srcOp, Adaptor adaptor, + ConversionPatternRewriter &rewriter) const { + if (srcOp.getQuantizationInfo().has_value()) { + return rewriter.notifyMatchFailure( + srcOp, "TTIR MatmulOp currently doesn't support quantization."); + } + return success(); + } +}; + void addElementwiseUnaryOpsConversionPatterns(MLIRContext *ctx, RewritePatternSet &patterns, TypeConverter &typeConverter) { @@ -162,6 +203,12 @@ void addCompareOpsConversionPatterns(MLIRContext *ctx, tosa::GreaterOp, mlir::tt::ttir::GreaterThanOp>>(typeConverter, ctx); } +void addMatmulOpsConversionPatterns(MLIRContext *ctx, + RewritePatternSet &patterns, + TypeConverter &typeConverter) { + patterns.add(typeConverter, ctx); +} + } // namespace namespace mlir::tt { @@ -173,6 +220,7 @@ void populateTosaToTTIRPatterns(MLIRContext *ctx, RewritePatternSet &patterns, addElementwiseTernaryOpsConversionPatterns(ctx, patterns, typeConverter); addLogicalOpsConversionPatterns(ctx, patterns, typeConverter); addCompareOpsConversionPatterns(ctx, patterns, typeConverter); + addMatmulOpsConversionPatterns(ctx, patterns, typeConverter); } } // namespace mlir::tt diff --git a/test/ttmlir/Conversion/TosaToTTIR/matmul_op.mlir b/test/ttmlir/Conversion/TosaToTTIR/matmul_op.mlir new file mode 100644 index 0000000000..5e12ee0e0b --- /dev/null +++ b/test/ttmlir/Conversion/TosaToTTIR/matmul_op.mlir @@ -0,0 +1,11 @@ +// RUN: ttmlir-opt --convert-tosa-to-ttir %s | FileCheck %s +module attributes {} { + func.func @test_matmul(%arg0: tensor<13x21x16xf32>, %arg1: tensor<13x16x31xf32>) -> tensor<13x21x31xf32> { + // CHECK: func.func {{.+}}%arg{{[0-9]+}}: tensor<[[B:[0-9]+]]x[[I:[0-9]+]]x[[J:[0-9]+]]xf32>, %arg{{[0-9]+}}: tensor<[[B:[0-9]+]]x[[J:[0-9]+]]x[[K:[0-9]+]]xf32> + %0 = tosa.matmul %arg0, %arg1 : (tensor<13x21x16xf32>, tensor<13x16x31xf32>) -> tensor<13x21x31xf32> + // CHECK: %[[OP_OUT:[0-9]+]] = tensor.empty() : tensor<[[B]]x[[I]]x[[K]]xf32> + // CHECK: %[[VAL:[0-9]+]] = "ttir.matmul"(%arg{{[0-9]+}}, %arg{{[0-9]+}}, %[[OP_OUT]]){{.+}} (tensor<[[B]]x[[I]]x[[J]]xf32>, tensor<[[B]]x[[J]]x[[K]]xf32>, tensor<[[B]]x[[I]]x[[K]]xf32>) -> tensor<[[B]]x[[I]]x[[K]]xf32> + // CHECK: return %[[VAL]] : tensor<[[B]]x[[I]]x[[K]]xf32> + return %0 : tensor<13x21x31xf32> + } +} From 04d4d516299cba51c61d5de285c85f68ad00f81e Mon Sep 17 00:00:00 2001 From: Vraj Prajapati Date: Fri, 6 Dec 2024 10:58:52 -0600 Subject: [PATCH 66/84] Added fix for redundant constant nodes + small carry over from #1401 (#1509) * Added fix for redundant constant nodes + small carry over from #1401 * Switched from checking block to check against op --- runtime/tools/python/ttrt/common/perf.py | 4 ---- tools/explorer/tt_adapter/src/tt_adapter/mlir.py | 6 +++++- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/runtime/tools/python/ttrt/common/perf.py b/runtime/tools/python/ttrt/common/perf.py index f70defa313..55ee255f91 100644 --- a/runtime/tools/python/ttrt/common/perf.py +++ b/runtime/tools/python/ttrt/common/perf.py @@ -23,10 +23,6 @@ from ttrt.common.query import Query -def get_loc_data_hook(binary, programContext, opContext): - op_debug_str = 
ttrt.runtime.get_op_debug_str(opContext) - - class Perf: registered_args = {} diff --git a/tools/explorer/tt_adapter/src/tt_adapter/mlir.py b/tools/explorer/tt_adapter/src/tt_adapter/mlir.py index 6b064b1558..e48eca4a8d 100644 --- a/tools/explorer/tt_adapter/src/tt_adapter/mlir.py +++ b/tools/explorer/tt_adapter/src/tt_adapter/mlir.py @@ -522,7 +522,11 @@ def build_graph(module): op_to_graph_node[op] = graph_node for operand in op.operands: - if isinstance(operand, ir.Value): + if isinstance(operand, ir.Value) and not isinstance( + operand.owner, ir.Operation + ): + # If the owner is not an op, then it is a constant provided from the toplevel FuncOp. + # This is a constant and we need to create a node for it. operand_node = operation.make_constant_node( name_dict, operand.get_name() From f4dd5d9da8a641cbff9b76cfc09ab1f1233e79a2 Mon Sep 17 00:00:00 2001 From: Jackson Nie Date: Fri, 6 Dec 2024 12:25:06 -0500 Subject: [PATCH 67/84] Add memcpy(void *, Tensor) API (#1519) --- runtime/include/tt/runtime/detail/ttnn.h | 2 + runtime/include/tt/runtime/runtime.h | 2 + runtime/lib/runtime.cpp | 15 +++++ runtime/lib/ttnn/runtime.cpp | 14 ++++- runtime/test/python/ttnn/test_runtime_api.py | 62 ++++++++++++++++---- runtime/tools/python/ttrt/runtime/module.cpp | 8 +++ 6 files changed, 88 insertions(+), 15 deletions(-) diff --git a/runtime/include/tt/runtime/detail/ttnn.h b/runtime/include/tt/runtime/detail/ttnn.h index e57300162a..268959e8a2 100644 --- a/runtime/include/tt/runtime/detail/ttnn.h +++ b/runtime/include/tt/runtime/detail/ttnn.h @@ -103,6 +103,8 @@ Tensor toLayout(Tensor tensor, Device device, Layout layout); Layout getLayout(Binary executableHandle, std::uint32_t programIndex, std::uint32_t inputIndex); +void memcpy(void *dst, Tensor src); + void memcpy(Tensor dst, Tensor src); void deallocateTensor(Tensor &tensor, bool force = false); diff --git a/runtime/include/tt/runtime/runtime.h b/runtime/include/tt/runtime/runtime.h index 56666d564f..c3b725e0f9 100644 --- a/runtime/include/tt/runtime/runtime.h +++ b/runtime/include/tt/runtime/runtime.h @@ -87,6 +87,8 @@ Tensor toLayout(Tensor tensor, Device device, Layout layout); Layout getLayout(Binary executableHandle, std::uint32_t programIndex, std::uint32_t inputIndex); +void memcpy(void *dst, Tensor src); + void memcpy(Tensor dst, Tensor src); void deallocateTensor(Tensor &tensor, bool force = false); diff --git a/runtime/lib/runtime.cpp b/runtime/lib/runtime.cpp index bf61133089..2da673ad19 100644 --- a/runtime/lib/runtime.cpp +++ b/runtime/lib/runtime.cpp @@ -323,6 +323,21 @@ Layout getLayout(Binary executableHandle, std::uint32_t programIndex, LOG_FATAL("runtime is not enabled"); } +void memcpy(void *dst, Tensor src) { +#if defined(TT_RUNTIME_ENABLE_TTNN) + if (getCurrentRuntime() == DeviceRuntime::TTNN) { + return ::tt::runtime::ttnn::memcpy(dst, src); + } +#endif + +#if defined(TT_RUNTIME_ENABLE_TTMETAL) + if (getCurrentRuntime() == DeviceRuntime::TTMetal) { + LOG_FATAL("not implemented"); + } +#endif + LOG_FATAL("runtime is not enabled"); +} + void memcpy(Tensor dst, Tensor src) { #if defined(TT_RUNTIME_ENABLE_TTNN) if (getCurrentRuntime() == DeviceRuntime::TTNN) { diff --git a/runtime/lib/ttnn/runtime.cpp b/runtime/lib/ttnn/runtime.cpp index 466bf318bc..0578557851 100644 --- a/runtime/lib/ttnn/runtime.cpp +++ b/runtime/lib/ttnn/runtime.cpp @@ -296,6 +296,17 @@ Layout getLayout(Binary executableHandle, std::uint32_t programIndex, DeviceRuntime::TTNN); } +void memcpy(void *dst, Tensor src) { + const ::ttnn::Tensor &srcTensor = 
src.as<::ttnn::Tensor>(DeviceRuntime::TTNN); + if (utils::isOnHost(srcTensor.storage_type())) { + const void *srcPtr = ::tt::tt_metal::get_raw_host_data_ptr(srcTensor); + size_t size = srcTensor.volume() * srcTensor.element_size(); + std::memcpy(dst, srcPtr, size); + } else { + ::tt::tt_metal::memcpy(dst, srcTensor); + } +} + void memcpy(Tensor dst, Tensor src) { ::ttnn::Tensor &dstTensor = dst.as<::ttnn::Tensor>(DeviceRuntime::TTNN); const ::ttnn::Tensor &srcTensor = src.as<::ttnn::Tensor>(DeviceRuntime::TTNN); @@ -304,11 +315,10 @@ void memcpy(Tensor dst, Tensor src) { "Input output tensor size mismatch in memcpy: ", srcTensor.volume(), " * ", srcTensor.element_size(), " != ", dstTensor.volume(), " * ", dstTensor.element_size()); - if (utils::isOnHost(srcTensor.storage_type()) and utils::isOnHost(dstTensor.storage_type())) { void *dstPtr = ::tt::tt_metal::get_raw_host_data_ptr(dstTensor); - void *srcPtr = ::tt::tt_metal::get_raw_host_data_ptr(srcTensor); + const void *srcPtr = ::tt::tt_metal::get_raw_host_data_ptr(srcTensor); size_t size = srcTensor.volume() * srcTensor.element_size(); std::memcpy(dstPtr, srcPtr, size); } else { diff --git a/runtime/test/python/ttnn/test_runtime_api.py b/runtime/test/python/ttnn/test_runtime_api.py index d88232fa29..5454cbcd9a 100644 --- a/runtime/test/python/ttnn/test_runtime_api.py +++ b/runtime/test/python/ttnn/test_runtime_api.py @@ -43,7 +43,50 @@ def test_to_layout(helper: Helper, shape, dtype, request): ttrt.runtime.memcpy(runtime_output_tensor, host_tensor) ttrt.runtime.deallocate_tensor(host_tensor, force=True) - lambda: assert_pcc(torch_input_tensor, torch_result_tensor, threshold=0.999) + assert_pcc(torch_input_tensor, torch_result_tensor, threshold=0.99) + helper.teardown() + + +@pytest.mark.parametrize("shape", [(64, 128)]) +@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16]) +def test_memcpy_to_pointer(helper: Helper, shape, dtype, request): + helper.initialize(request.node.name) + helper.check_constraints() + runtime_dtype = Binary.Program.to_data_type(dtype) + torch_result_tensor = torch.zeros(shape, dtype=dtype) + + # Device to host + torch_input_tensor = torch.randn(shape, dtype=dtype) + runtime_input_tensor = ttrt.runtime.create_tensor( + torch_input_tensor.data_ptr(), + list(torch_input_tensor.shape), + list(torch_input_tensor.stride()), + torch_input_tensor.element_size(), + runtime_dtype, + ) + device_layout = ttrt.runtime.testing.get_dram_interleaved_row_major_layout( + runtime_dtype + ) + with DeviceContext([helper.query.device_ids[0]]) as device: + device_tensor = ttrt.runtime.to_layout( + runtime_input_tensor, device, device_layout + ) + ttrt.runtime.memcpy(torch_result_tensor.data_ptr(), device_tensor) + ttrt.runtime.deallocate_tensor(device_tensor, force=True) + + assert_pcc(torch_input_tensor, torch_result_tensor, threshold=0.99) + + # Host to host + torch_input_tensor2 = torch.randn(shape, dtype=dtype) + host_tensor = ttrt.runtime.create_tensor( + torch_input_tensor2.data_ptr(), + list(torch_input_tensor2.shape), + list(torch_input_tensor2.stride()), + torch_input_tensor2.element_size(), + runtime_dtype, + ) + ttrt.runtime.memcpy(torch_result_tensor.data_ptr(), host_tensor) + assert_pcc(torch_input_tensor2, torch_result_tensor, threshold=0.99) helper.teardown() @@ -80,12 +123,12 @@ def test_create_tensor_memcpy(helper: Helper, shape, dtype, request): list(torch_input_tensor.stride()), torch_input_tensor.element_size(), ) + # Copy from host to device container ttrt.runtime.memcpy(device_tensor, 
runtime_input_tensor) - host_tensor = ttrt.runtime.to_host(device_tensor, untilize=True) + # Copy from device to host + ttrt.runtime.memcpy(runtime_output_tensor, device_tensor) ttrt.runtime.deallocate_tensor(device_tensor, force=True) - ttrt.runtime.memcpy(runtime_output_tensor, host_tensor) - ttrt.runtime.deallocate_tensor(host_tensor, force=True) - lambda: assert_pcc(torch_input_tensor, torch_result_tensor, threshold=0.999) + assert_pcc(torch_input_tensor, torch_result_tensor, threshold=0.99) helper.teardown() @@ -145,14 +188,7 @@ def test_runtime_stitching_eltwise_binary_op_chain(helper: Helper, request): ] ), ) - runtime_result_tensor = ttrt.runtime.create_tensor( - torch_result_tensor.data_ptr(), - list(torch_result_tensor.shape), - list(torch_result_tensor.stride()), - torch_result_tensor.element_size(), - Binary.Program.to_data_type(torch_result_tensor.dtype), - ) - ttrt.runtime.memcpy(runtime_result_tensor, activations) + ttrt.runtime.memcpy(torch_result_tensor.data_ptr(), activations) golden = ( (inputs_torch[0] + inputs_torch[1]).mul(inputs_torch[1]).sub(inputs_torch[1]) ) diff --git a/runtime/tools/python/ttrt/runtime/module.cpp b/runtime/tools/python/ttrt/runtime/module.cpp index e1db607c53..47b42eab56 100644 --- a/runtime/tools/python/ttrt/runtime/module.cpp +++ b/runtime/tools/python/ttrt/runtime/module.cpp @@ -157,6 +157,14 @@ PYBIND11_MODULE(_C, m) { "Get the debug string of the op"); m.def("get_op_loc_info", &tt::runtime::getOpLocInfo, "Get the location info of the op"); + m.def( + "memcpy", + [](std::uintptr_t dst, ::tt::runtime::Tensor src) { + void *dstPtr = reinterpret_cast(dst); + ::tt::runtime::memcpy(dstPtr, src); + }, + py::arg("dst"), py::arg("src"), + "Copy the data from src tensor to dst pointer"); m.def( "memcpy", [](::tt::runtime::Tensor dst, ::tt::runtime::Tensor src) { From 8a6151bcd60d7262679d02f1b3922e75920166b4 Mon Sep 17 00:00:00 2001 From: Nick Smith <127986401+nsmithtt@users.noreply.github.com> Date: Fri, 6 Dec 2024 09:33:49 -0800 Subject: [PATCH 68/84] Make macOS action nightly (#1516) --- .github/workflows/macos-build.yml | 5 +++-- .github/workflows/on-pr.yml | 3 --- .github/workflows/on-push.yml | 3 --- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/.github/workflows/macos-build.yml b/.github/workflows/macos-build.yml index 774feed21c..367c15787e 100644 --- a/.github/workflows/macos-build.yml +++ b/.github/workflows/macos-build.yml @@ -1,8 +1,9 @@ name: Build on macos-latest on: - workflow_dispatch: - workflow_call: + schedule: + - cron: '0 4 * * *' # Runs at 04:00 UTC every day + workflow_dispatch: # Manual trigger env: SDK_VERSION: "0" diff --git a/.github/workflows/on-pr.yml b/.github/workflows/on-pr.yml index 76999f97df..76a781f886 100644 --- a/.github/workflows/on-pr.yml +++ b/.github/workflows/on-pr.yml @@ -12,9 +12,6 @@ jobs: spdx: uses: ./.github/workflows/spdx.yml secrets: inherit - macos-build: - uses: ./.github/workflows/macos-build.yml - secrets: inherit build-and-test: uses: ./.github/workflows/build-and-test.yml secrets: inherit diff --git a/.github/workflows/on-push.yml b/.github/workflows/on-push.yml index 58dcdc65d6..2d961e2204 100644 --- a/.github/workflows/on-push.yml +++ b/.github/workflows/on-push.yml @@ -12,9 +12,6 @@ jobs: spdx: uses: ./.github/workflows/spdx.yml secrets: inherit - macos-build: - uses: ./.github/workflows/macos-build.yml - secrets: inherit build-and-test: uses: ./.github/workflows/build-and-test.yml secrets: inherit From 8fabbbd015e0d1677d01e0ce5872fe82b0b64cab Mon Sep 17 00:00:00 2001 From: 
Stefan Gligorijevic <189116645+sgligorijevicTT@users.noreply.github.com> Date: Fri, 6 Dec 2024 22:03:14 +0100 Subject: [PATCH 69/84] Add TOSA to TTIR conversions for sum and max reductions (#1480) * Add TOSA to TTIR conversions for sum and max reductions * Add return checks to tests * replace auto with concrete types --- .../TosaToTTIR/TosaToTTIRPatterns.cpp | 37 +++++++++++++++++++ .../Conversion/TosaToTTIR/reductions/max.mlir | 11 ++++++ .../Conversion/TosaToTTIR/reductions/sum.mlir | 11 ++++++ 3 files changed, 59 insertions(+) create mode 100644 test/ttmlir/Conversion/TosaToTTIR/reductions/max.mlir create mode 100644 test/ttmlir/Conversion/TosaToTTIR/reductions/sum.mlir diff --git a/lib/Conversion/TosaToTTIR/TosaToTTIRPatterns.cpp b/lib/Conversion/TosaToTTIR/TosaToTTIRPatterns.cpp index 23726c21b7..5768b840dd 100644 --- a/lib/Conversion/TosaToTTIR/TosaToTTIRPatterns.cpp +++ b/lib/Conversion/TosaToTTIR/TosaToTTIRPatterns.cpp @@ -122,6 +122,32 @@ class TosaToTTIRMatmulOpConversionPattern } }; +template +class TosaToTTIRReduceOpConversionPattern : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + +public: + LogicalResult + matchAndRewrite(SrcOp srcOp, Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + RankedTensorType outputType = + mlir::cast(srcOp.getResult().getType()); + tensor::EmptyOp outputTensor = rewriter.create( + srcOp.getLoc(), outputType.getShape(), outputType.getElementType()); + + rewriter.replaceOpWithNewOp( + srcOp, outputTensor.getType(), adaptor.getInput(), outputTensor, + true /*keepdim*/, + rewriter.getArrayAttr(SmallVector(1, adaptor.getAxisAttr())), + rewriter.getArrayAttr( + SmallVector(adaptor.getOperands().size() + 1, + rewriter.getAttr( + OperandConstraint::AnyDeviceTile)))); + return success(); + } +}; + void addElementwiseUnaryOpsConversionPatterns(MLIRContext *ctx, RewritePatternSet &patterns, TypeConverter &typeConverter) { @@ -209,6 +235,16 @@ void addMatmulOpsConversionPatterns(MLIRContext *ctx, patterns.add(typeConverter, ctx); } +void addReductionOpsConversionPatterns(MLIRContext *ctx, + RewritePatternSet &patterns, + TypeConverter &typeConverter) { + patterns.add>( + typeConverter, ctx); + patterns.add>( + typeConverter, ctx); +} } // namespace namespace mlir::tt { @@ -221,6 +257,7 @@ void populateTosaToTTIRPatterns(MLIRContext *ctx, RewritePatternSet &patterns, addLogicalOpsConversionPatterns(ctx, patterns, typeConverter); addCompareOpsConversionPatterns(ctx, patterns, typeConverter); addMatmulOpsConversionPatterns(ctx, patterns, typeConverter); + addReductionOpsConversionPatterns(ctx, patterns, typeConverter); } } // namespace mlir::tt diff --git a/test/ttmlir/Conversion/TosaToTTIR/reductions/max.mlir b/test/ttmlir/Conversion/TosaToTTIR/reductions/max.mlir new file mode 100644 index 0000000000..021d5fd08f --- /dev/null +++ b/test/ttmlir/Conversion/TosaToTTIR/reductions/max.mlir @@ -0,0 +1,11 @@ +// RUN: ttmlir-opt --convert-tosa-to-ttir %s | FileCheck %s +module attributes {} { + func.func @test_max(%arg0: tensor<13x21x3xf32>) -> tensor<13x1x3xf32> { + // CHECK: func.func {{.+}} [[IN_SIZE:tensor<[0-9]+x[0-9]+x[0-9]+xf32>]]{{.*}} -> + %0 = tosa.reduce_max %arg0 {axis = 1 : i32} : (tensor<13x21x3xf32>) -> tensor<13x1x3xf32> + // CHECK: %[[OP_OUT:[0-9]+]] = tensor.empty() : [[OUT_SIZE:tensor<[0-9]+x[0-9]+x[0-9]+xf32>]] + // CHECK: %[[VAL:[0-9]+]] = "ttir.max"(%arg{{[0-9]+}}, %[[OP_OUT]]){{.+}} ([[IN_SIZE]], [[OUT_SIZE]]) -> [[OUT_SIZE]] + // CHECK: return %[[VAL]] : [[OUT_SIZE]] + return %0 : 
tensor<13x1x3xf32> + } +} diff --git a/test/ttmlir/Conversion/TosaToTTIR/reductions/sum.mlir b/test/ttmlir/Conversion/TosaToTTIR/reductions/sum.mlir new file mode 100644 index 0000000000..80f2045914 --- /dev/null +++ b/test/ttmlir/Conversion/TosaToTTIR/reductions/sum.mlir @@ -0,0 +1,11 @@ +// RUN: ttmlir-opt --convert-tosa-to-ttir %s | FileCheck %s +module attributes {} { + func.func @test_sum(%arg0: tensor<13x21x3xf32>) -> tensor<13x1x3xf32> { + // CHECK: func.func {{.+}} [[IN_SIZE:tensor<[0-9]+x[0-9]+x[0-9]+xf32>]]{{.*}} -> + %0 = tosa.reduce_sum %arg0 {axis = 1 : i32} : (tensor<13x21x3xf32>) -> tensor<13x1x3xf32> + // CHECK: %[[OP_OUT:[0-9]+]] = tensor.empty() : [[OUT_SIZE:tensor<[0-9]+x[0-9]+x[0-9]+xf32>]] + // CHECK: %[[VAL:[0-9]+]] = "ttir.sum"(%arg{{[0-9]+}}, %[[OP_OUT]]){{.+}} ([[IN_SIZE]], [[OUT_SIZE]]) -> [[OUT_SIZE]] + // CHECK: return %[[VAL]] : [[OUT_SIZE]] + return %0 : tensor<13x1x3xf32> + } +} From e9cd390795ddb4c6a5b778c4323804efcf2b0681 Mon Sep 17 00:00:00 2001 From: Vladimir Milosevic <157983820+vmilosevic@users.noreply.github.com> Date: Sat, 7 Dec 2024 19:00:00 +0100 Subject: [PATCH 70/84] Uplift third_party/tt-metal to 10eeea8080971e280fd33ffa0a81d9ec06d49917 2024-12-07 (#1532) Co-authored-by: kmitrovicTT <169657397+kmitrovicTT@users.noreply.github.com> --- third_party/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt index d1b8af59a1..e658bfc98a 100644 --- a/third_party/CMakeLists.txt +++ b/third_party/CMakeLists.txt @@ -1,6 +1,6 @@ include(ExternalProject) -set(TT_METAL_VERSION "4952f01a39068cba4e85c93cf34c4f95f10f5c34") +set(TT_METAL_VERSION "10eeea8080971e280fd33ffa0a81d9ec06d49917") if ("$ENV{ARCH_NAME}" STREQUAL "grayskull") set(ARCH_NAME "grayskull") From c1d9d27b85b864ffd35bc827c4a708b5f88a9f8d Mon Sep 17 00:00:00 2001 From: Vladimir Milosevic <157983820+vmilosevic@users.noreply.github.com> Date: Sun, 8 Dec 2024 16:43:44 +0100 Subject: [PATCH 71/84] Uplift third_party/tt-metal to 8ee1f823ccc6680fd4ba5aa9004487692da1c545 2024-12-08 (#1534) Co-authored-by: kmitrovicTT <169657397+kmitrovicTT@users.noreply.github.com> --- third_party/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt index e658bfc98a..acb8983839 100644 --- a/third_party/CMakeLists.txt +++ b/third_party/CMakeLists.txt @@ -1,6 +1,6 @@ include(ExternalProject) -set(TT_METAL_VERSION "10eeea8080971e280fd33ffa0a81d9ec06d49917") +set(TT_METAL_VERSION "8ee1f823ccc6680fd4ba5aa9004487692da1c545") if ("$ENV{ARCH_NAME}" STREQUAL "grayskull") set(ARCH_NAME "grayskull") From 223d24444c419dec906f87749f161f03b321fce7 Mon Sep 17 00:00:00 2001 From: Vladimir Canic <133228576+vcanicTT@users.noreply.github.com> Date: Mon, 9 Dec 2024 08:40:39 +0100 Subject: [PATCH 72/84] Add Python bindings for override mechanism. (#1526) * Add Python bindings for override mechanism. * Add Python bindings for override mechanism. * Add Python bindings for override mechanism. 
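A minimal usage sketch of the new bindings follows. It is illustrative only: the import path (re-export through python/ttmlir/optimizer_overrides.py) and the "add_0" op name are assumptions, not part of this change; the method and enum names match the pybind11 definitions added below.

    # Hypothetical import path -- assumes the extension symbols are re-exported
    # from the ttmlir.optimizer_overrides Python module added in this patch.
    from ttmlir.optimizer_overrides import (
        OptimizerOverridesHandler,
        MemoryLayoutAnalysisPolicyType,
    )

    handler = OptimizerOverridesHandler()
    handler.set_enable_optimizer(True)
    handler.set_enable_memory_layout_analysis(True)
    handler.set_memory_layout_analysis_policy(
        MemoryLayoutAnalysisPolicyType.DFSharding
    )
    # "add_0" is a placeholder op name; this requests a memory reconfig
    # on operand 0 of that op.
    handler.add_input_layout_override("add_0", [0])
    # Render the accumulated overrides as a pipeline option string,
    # e.g. for ttir-to-ttnn-backend-pipeline.
    print(handler.to_string())
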
--- include/ttmlir/Bindings/Python/TTMLIRModule.h | 1 + .../Dialect/TTNN/Pipelines/TTNNPipelines.h | 19 +-- .../Dialect/TTNN/Utils/OptimizerOverrides.h | 21 ++- .../ttmlir/Dialect/TTNN/Utils/PassOverrides.h | 17 ++ lib/Dialect/TTNN/Utils/OptimizerOverrides.cpp | 82 +++++++--- python/CMakeLists.txt | 7 + python/OptimizerOverrides.cpp | 153 ++++++++++++++++++ python/TTMLIRModule.cpp | 3 + python/test_infra/test_optimizer_overrides.py | 120 ++++++++++++++ python/ttmlir/optimizer_overrides.py | 5 + 10 files changed, 389 insertions(+), 39 deletions(-) create mode 100644 python/OptimizerOverrides.cpp create mode 100644 python/test_infra/test_optimizer_overrides.py create mode 100644 python/ttmlir/optimizer_overrides.py diff --git a/include/ttmlir/Bindings/Python/TTMLIRModule.h b/include/ttmlir/Bindings/Python/TTMLIRModule.h index 5f2d4e134d..d36529e676 100644 --- a/include/ttmlir/Bindings/Python/TTMLIRModule.h +++ b/include/ttmlir/Bindings/Python/TTMLIRModule.h @@ -60,6 +60,7 @@ void populateTTIRModule(py::module &m); void populateTTKernelModule(py::module &m); void populateTTNNModule(py::module &m); void populateOverridesModule(py::module &m); +void populateOptimizerOverridesModule(py::module &m); void populatePassesModule(py::module &m); } // namespace mlir::ttmlir::python diff --git a/include/ttmlir/Dialect/TTNN/Pipelines/TTNNPipelines.h b/include/ttmlir/Dialect/TTNN/Pipelines/TTNNPipelines.h index e9d78b4d3c..27a7a88e6b 100644 --- a/include/ttmlir/Dialect/TTNN/Pipelines/TTNNPipelines.h +++ b/include/ttmlir/Dialect/TTNN/Pipelines/TTNNPipelines.h @@ -20,7 +20,7 @@ struct TTIRToTTNNBackendPipelineOptions // configuration for max performance. If this option is false, skip running // Optimizer pass, thus leaving all ops on default configuration. Option optimizerPassEnabled{ - *this, "enable-optimizer", + *this, OptionNames::optimizerPassEnabled, llvm::cl::desc("Determine and set max valid grid for Op execution."), llvm::cl::init(false)}; @@ -38,7 +38,7 @@ struct TTIRToTTNNBackendPipelineOptions // Option, InputLayoutOverrideParser> overrideInputLayout{ - *this, "insert-memreconfig", + *this, OptionNames::overrideInputLayout, llvm::cl::desc( "Manually insert memory reconfig op for specific op's operand."), llvm::cl::init(llvm::StringMap())}; @@ -66,21 +66,21 @@ struct TTIRToTTNNBackendPipelineOptions Option, OutputLayoutOverrideParser> overrideOutputLayout{ - *this, "override-output-layout", + *this, OptionNames::overrideOutputLayout, llvm::cl::desc("Override output tensor layout for specific ops."), llvm::cl::init(llvm::StringMap())}; // If this option is true, run memory layout analysis. // Option memoryLayoutAnalysisEnabled{ - *this, "memory-layout-analysis-enabled", + *this, OptionNames::memoryLayoutAnalysisEnabled, llvm::cl::desc("Enable memory layout optimization."), llvm::cl::init(false)}; // If this option is true, insert memory reconfiguration ops. // Option memReconfigEnabled{ - *this, "memreconfig-enabled", + *this, OptionNames::memReconfigEnabled, llvm::cl::desc("Memory layout reconfiguration pass."), llvm::cl::init(true)}; @@ -88,7 +88,7 @@ struct TTIRToTTNNBackendPipelineOptions // Option memoryLayoutAnalysisPolicy{ - *this, "memory-layout-analysis-policy", + *this, OptionNames::memoryLayoutAnalysisPolicy, llvm::cl::desc("Specify policy for memory layout analysis."), llvm::cl::init(MemoryLayoutAnalysisPolicyType::DFSharding)}; @@ -96,7 +96,7 @@ struct TTIRToTTNNBackendPipelineOptions // against. 
// Option systemDescPath{ - *this, "system-desc-path", + *this, OptionNames::systemDescPath, llvm::cl::desc( "Pass in a system descriptor flatbuffer to compile against."), llvm::cl::init("")}; @@ -104,13 +104,14 @@ struct TTIRToTTNNBackendPipelineOptions // Option to override maximum number of legal layouts for grid analysis // Option maxLegalLayouts{ - *this, "max-legal-layouts", + *this, OptionNames::maxLegalLayouts, llvm::cl::desc( "Override maximum number of legal layouts for grid analysis."), llvm::cl::init(64)}; ListOption meshShape{ - *this, "mesh-shape", llvm::cl::desc("Set the multi-device mesh shape.")}; + *this, OptionNames::meshShape, + llvm::cl::desc("Set the multi-device mesh shape.")}; // Option to enable/disable the workaround pass. // diff --git a/include/ttmlir/Dialect/TTNN/Utils/OptimizerOverrides.h b/include/ttmlir/Dialect/TTNN/Utils/OptimizerOverrides.h index eccc62f26d..b13d375647 100644 --- a/include/ttmlir/Dialect/TTNN/Utils/OptimizerOverrides.h +++ b/include/ttmlir/Dialect/TTNN/Utils/OptimizerOverrides.h @@ -5,7 +5,10 @@ #ifndef TTMLIR_DIALECT_TTNN_UTILS_OPTIMIZEROVERRIDES_H #define TTMLIR_DIALECT_TTNN_UTILS_OPTIMIZEROVERRIDES_H -#include "ttmlir/Dialect/TTNN/Pipelines/TTNNPipelines.h" +#include +#include +#include + #include "ttmlir/Dialect/TTNN/Utils/MemoryLayoutAnalysisParams.h" #include "ttmlir/Dialect/TTNN/Utils/PassOverrides.h" @@ -65,11 +68,19 @@ class OptimizerOverridesHandler { TensorMemoryLayout, tt::ttnn::Layout, tt::DataType); -private: - // Options for the TTIR to TTNN backend pipeline, - // we use them to extract the names and the deafulat values. - TTIRToTTNNBackendPipelineOptions pipelineOptions; + // Wrapper methods we use to expose the adders to the python bindings + std::unordered_map + getInputLayoutOverridesPybindWrapper() const; + std::unordered_map + getOutputLayoutOverridesPybindWrapper() const; + // Wrapper methods we use to expose the adders to the python bindings + void addInputLayoutOverridePybindWrapper(std::string, std::vector &); + void addOutputLayoutOverridePybindWrapper(std::string, std::vector &, + BufferType, TensorMemoryLayout, + tt::ttnn::Layout, tt::DataType); + +private: // Flags for enabling/disabling the optimizer passes bool enableOptimizer = false; diff --git a/include/ttmlir/Dialect/TTNN/Utils/PassOverrides.h b/include/ttmlir/Dialect/TTNN/Utils/PassOverrides.h index 09e587c9c3..35f93062e8 100644 --- a/include/ttmlir/Dialect/TTNN/Utils/PassOverrides.h +++ b/include/ttmlir/Dialect/TTNN/Utils/PassOverrides.h @@ -5,6 +5,8 @@ #ifndef TTMLIR_DIALECT_TTNN_UTILS_PASSOVERRIDES_H #define TTMLIR_DIALECT_TTNN_UTILS_PASSOVERRIDES_H +#include + #include #include "ttmlir/Dialect/TT/IR/TTOpsTypes.h" @@ -13,6 +15,21 @@ namespace mlir::tt::ttnn { +struct OptionNames { + + static constexpr StringRef optimizerPassEnabled = "enable-optimizer"; + static constexpr StringRef overrideInputLayout = "insert-memreconfig"; + static constexpr StringRef overrideOutputLayout = "override-output-layout"; + static constexpr StringRef memoryLayoutAnalysisEnabled = + "memory-layout-analysis-enabled"; + static constexpr StringRef memReconfigEnabled = "memreconfig-enabled"; + static constexpr StringRef memoryLayoutAnalysisPolicy = + "memory-layout-analysis-policy"; + static constexpr StringRef systemDescPath = "system-desc-path"; + static constexpr StringRef maxLegalLayouts = "max-legal-layouts"; + static constexpr StringRef meshShape = "mesh-shape"; +}; + struct OutputLayoutOverrideParams { SmallVector grid; diff --git 
a/lib/Dialect/TTNN/Utils/OptimizerOverrides.cpp b/lib/Dialect/TTNN/Utils/OptimizerOverrides.cpp index bbc456948e..157c1e50d3 100644 --- a/lib/Dialect/TTNN/Utils/OptimizerOverrides.cpp +++ b/lib/Dialect/TTNN/Utils/OptimizerOverrides.cpp @@ -81,65 +81,79 @@ OptimizerOverridesHandler::getOutputLayoutOverrides() const { return outputLayoutOverrides; } +std::unordered_map +OptimizerOverridesHandler::getInputLayoutOverridesPybindWrapper() const { + std::unordered_map + inputLayoutOverridesWrapper; + for (auto &entry : inputLayoutOverrides) { + inputLayoutOverridesWrapper[entry.getKey().str()] = entry.getValue(); + } + return inputLayoutOverridesWrapper; +} + +std::unordered_map +OptimizerOverridesHandler::getOutputLayoutOverridesPybindWrapper() const { + std::unordered_map + outputLayoutOverridesWrapper; + for (auto &entry : outputLayoutOverrides) { + outputLayoutOverridesWrapper[entry.getKey().str()] = entry.getValue(); + } + return outputLayoutOverridesWrapper; +} + std::string OptimizerOverridesHandler::toString() const { std::string options = ""; if (enableOptimizer) { - options += std::string(pipelineOptions.optimizerPassEnabled.getArgStr()) + - "=true "; + options += OptionNames::optimizerPassEnabled.str() + "=true "; } if (enableMemoryReconfig) { - options += - std::string(pipelineOptions.memReconfigEnabled.getArgStr()) + "=true "; + options += OptionNames::memReconfigEnabled.str() + "=true "; } if (enableMemoryLayoutAnalysis) { - options += - std::string(pipelineOptions.memoryLayoutAnalysisEnabled.getArgStr()) + - "=true "; + options += OptionNames::memoryLayoutAnalysisEnabled.str() + "=true "; } if (enableMemoryLayoutAnalysisPolicy) { - options += - std::string(pipelineOptions.memoryLayoutAnalysisPolicy.getArgStr()) + - MemoryLayoutAnalysisPolicyTypeParser::toString( - memoryLayoutAnalysisPolicy) + - " "; + options += OptionNames::memoryLayoutAnalysisPolicy.str() + "=" + + MemoryLayoutAnalysisPolicyTypeParser::toString( + memoryLayoutAnalysisPolicy) + + " "; } // Create input layout overrides. - // Example: insert-memreconfig=input0=0:1,input1=0,input2=0:1:2 + // Example: + // insert-memreconfig=input0=0:1,input1=0,input2=0:1:2 if (inputLayoutOverrides.size() > 0) { - options += std::string(pipelineOptions.overrideInputLayout.getArgStr()) + - "=" + InputLayoutOverrideParser::toString(inputLayoutOverrides) + - " "; + options += OptionNames::overrideInputLayout.str() + "=" + + InputLayoutOverrideParser::toString(inputLayoutOverrides) + " "; } // Create output layout overrides. 
// Example: - // override-output-layout=op1=2x2:dram:interleaved:tile:fp32,op2=4x4:l1:block_sharded:row_major:fp16 + // override-output-layout=op1=2x2:dram:interleaved:tile:fp32,op2=4x4:l1:block_sharded:row_major:fp16 // Example: - // override-output-layout=add_1_2=1x1:dram:interleaved:row_major:f32" + // override-output-layout=add_1_2=1x1:dram:interleaved:row_major:f32" if (outputLayoutOverrides.size() > 0) { - options += - std::string(pipelineOptions.overrideOutputLayout.getArgStr()) + "=" + - OutputLayoutOverrideParser::toString(outputLayoutOverrides) + " "; + options += OptionNames::overrideOutputLayout.str() + "=" + + OutputLayoutOverrideParser::toString(outputLayoutOverrides) + + " "; } if (systemDescPath.size() > 0) { - options += std::string(pipelineOptions.systemDescPath.getArgStr()) + - systemDescPath + " "; + options += OptionNames::systemDescPath.str() + "=" + systemDescPath + " "; } if (maxLegalLayouts > 0) { - options += std::string(pipelineOptions.maxLegalLayouts.getArgStr()) + + options += OptionNames::maxLegalLayouts.str() + "=" + std::to_string(maxLegalLayouts) + " "; } if (meshShape.size() > 0) { - options += std::string(pipelineOptions.meshShape.getArgStr()) + "="; + options += OptionNames::meshShape.str() + "="; for (int64_t meshShapeValue : meshShape) { options += std::to_string(meshShapeValue) + ","; } @@ -175,4 +189,22 @@ void OptimizerOverridesHandler::addOutputLayoutOverride( std::move(grid), bufferType, tensorMemoryLayout, memoryLayout, dataType}; } +void OptimizerOverridesHandler::addInputLayoutOverridePybindWrapper( + std::string opName, std::vector &operandIdxes) { + StringRef opNameStringRef(opName); + SmallVector operandIdxesSmallVector(operandIdxes.begin(), + operandIdxes.end()); + addInputLayoutOverride(opNameStringRef, operandIdxesSmallVector); +} + +void OptimizerOverridesHandler::addOutputLayoutOverridePybindWrapper( + std::string opName, std::vector &grid, BufferType bufferType, + TensorMemoryLayout tensorMemoryLayout, tt::ttnn::Layout memoryLayout, + tt::DataType dataType) { + StringRef opNameStringRef(opName); + SmallVector gridSmallVector(grid.begin(), grid.end()); + addOutputLayoutOverride(opNameStringRef, gridSmallVector, bufferType, + tensorMemoryLayout, memoryLayout, dataType); +} + } // namespace mlir::tt::ttnn diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index e43cb858d4..cbfc3bf95f 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -63,6 +63,12 @@ declare_mlir_python_sources(TTMLIRPythonSources.Overrides SOURCES overrides.py ) +declare_mlir_python_sources(TTMLIRPythonSources.OptimizerOverrides + ROOT_DIR "${TTMLIR_PYTHON_ROOT_DIR}" + ADD_TO_PARENT TTMLIRPythonSources + SOURCES optimizer_overrides.py +) + declare_mlir_python_sources(TTMLIRPythonSources.Passes ROOT_DIR "${TTMLIR_PYTHON_ROOT_DIR}" ADD_TO_PARENT TTMLIRPythonSources @@ -87,6 +93,7 @@ declare_mlir_python_extension(TTMLIRPythonExtensions.Main TTKernelModule.cpp TTNNModule.cpp Overrides.cpp + OptimizerOverrides.cpp Passes.cpp EMBED_CAPI_LINK_LIBS MLIRCAPITransforms diff --git a/python/OptimizerOverrides.cpp b/python/OptimizerOverrides.cpp new file mode 100644 index 0000000000..b41d2081d2 --- /dev/null +++ b/python/OptimizerOverrides.cpp @@ -0,0 +1,153 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
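
The Pybind wrapper methods exist because pybind11's stl.h casters understand std containers but not llvm::StringMap or llvm::SmallVector, so each boundary crossing copies into a std type. The same conversion in generic form, as a sketch; the helper is illustrative and not an API in the repo:

```
#include <string>
#include <unordered_map>

#include "llvm/ADT/StringMap.h"

// Copy an llvm::StringMap into a std::unordered_map so pybind11 can hand it to
// Python as a plain dict without a custom type caster.
template <typename ValueT>
std::unordered_map<std::string, ValueT>
toStdMap(const llvm::StringMap<ValueT> &map) {
  std::unordered_map<std::string, ValueT> result;
  result.reserve(map.size());
  for (const auto &entry : map) {
    result[entry.getKey().str()] = entry.getValue();
  }
  return result;
}
```
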
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "ttmlir/Dialect/TTNN/Utils/OptimizerOverrides.h" +#include "ttmlir/Bindings/Python/TTMLIRModule.h" + +namespace mlir::ttmlir::python { + +void populateOptimizerOverridesModule(py::module &m) { + + py::class_(m, + "OptimizerOverridesHandler") + .def(py::init<>()) + + .def("set_enable_optimizer", + &tt::ttnn::OptimizerOverridesHandler::setEnableOptimizer) + .def("get_enable_optimizer", + &tt::ttnn::OptimizerOverridesHandler::getEnableOptimizer) + + .def("set_memory_reconfig", + &tt::ttnn::OptimizerOverridesHandler::setMemoryReconfig) + .def("get_memory_reconfig", + &tt::ttnn::OptimizerOverridesHandler::getMemoryReconfig) + + .def("set_enable_memory_layout_analysis", + &tt::ttnn::OptimizerOverridesHandler::setEnableMemoryLayoutAnalysis) + .def("get_enable_memory_layout_analysis", + &tt::ttnn::OptimizerOverridesHandler::getEnableMemoryLayoutAnalysis) + + .def("set_enable_memory_layout_analysis_policy", + &tt::ttnn::OptimizerOverridesHandler:: + setEnableMemoryLayoutAnalysisPolicy) + .def("get_enable_memory_layout_analysis_policy", + &tt::ttnn::OptimizerOverridesHandler:: + getEnableMemoryLayoutAnalysisPolicy) + + .def("set_memory_layout_analysis_policy", + &tt::ttnn::OptimizerOverridesHandler::setMemoryLayoutAnalysisPolicy) + .def("get_memory_layout_analysis_policy", + &tt::ttnn::OptimizerOverridesHandler::getMemoryLayoutAnalysisPolicy) + + .def("set_system_desc_path", + &tt::ttnn::OptimizerOverridesHandler::setSystemDescPath) + .def("get_system_desc_path", + &tt::ttnn::OptimizerOverridesHandler::getSystemDescPath) + + .def("set_max_legal_layouts", + &tt::ttnn::OptimizerOverridesHandler::setMaxLegalLayouts) + .def("get_max_legal_layouts", + &tt::ttnn::OptimizerOverridesHandler::getMaxLegalLayouts) + + .def("set_mesh_shape", &tt::ttnn::OptimizerOverridesHandler::setMeshShape) + .def("get_mesh_shape", &tt::ttnn::OptimizerOverridesHandler::getMeshShape) + + .def("get_input_layout_overrides", + &tt::ttnn::OptimizerOverridesHandler:: + getInputLayoutOverridesPybindWrapper) + .def("get_output_layout_overrides", + &tt::ttnn::OptimizerOverridesHandler:: + getOutputLayoutOverridesPybindWrapper) + + .def("add_input_layout_override", &tt::ttnn::OptimizerOverridesHandler:: + addInputLayoutOverridePybindWrapper) + .def("add_output_layout_override", + &tt::ttnn::OptimizerOverridesHandler:: + addOutputLayoutOverridePybindWrapper) + + .def("to_string", &tt::ttnn::OptimizerOverridesHandler::toString); + + py::enum_( + m, "MemoryLayoutAnalysisPolicyType") + .value("DFSharding", mlir::tt::MemoryLayoutAnalysisPolicyType::DFSharding) + .value("L1Interleaved", + mlir::tt::MemoryLayoutAnalysisPolicyType::L1Interleaved); + + py::enum_(m, "BufferType") + .value("DRAM", mlir::tt::ttnn::BufferType::DRAM) + .value("L1", mlir::tt::ttnn::BufferType::L1) + .value("SystemMemory", mlir::tt::ttnn::BufferType::SystemMemory) + .value("L1Small", mlir::tt::ttnn::BufferType::L1Small) + .value("Trace", mlir::tt::ttnn::BufferType::Trace); + + py::enum_(m, "Layout") + .value("RowMajor", mlir::tt::ttnn::Layout::RowMajor) + .value("Tile", mlir::tt::ttnn::Layout::Tile) + .value("Invalid", mlir::tt::ttnn::Layout::Invalid); + + py::enum_(m, "TensorMemoryLayout") + .value("Interleaved", mlir::tt::ttnn::TensorMemoryLayout::Interleaved) + .value("SingleBank", mlir::tt::ttnn::TensorMemoryLayout::SingleBank) + .value("HeightSharded", mlir::tt::ttnn::TensorMemoryLayout::HeightSharded) + .value("WidthSharded", mlir::tt::ttnn::TensorMemoryLayout::WidthSharded) + .value("BlockSharded", 
mlir::tt::ttnn::TensorMemoryLayout::BlockSharded); + + py::enum_(m, "DataType") + .value("Float32", mlir::tt::DataType::Float32) + .value("Float16", mlir::tt::DataType::Float16) + .value("BFloat16", mlir::tt::DataType::BFloat16) + .value("BFP_Float8", mlir::tt::DataType::BFP_Float8) + .value("BFP_BFloat8", mlir::tt::DataType::BFP_BFloat8) + .value("BFP_Float4", mlir::tt::DataType::BFP_Float4) + .value("BFP_BFloat4", mlir::tt::DataType::BFP_BFloat4) + .value("BFP_Float2", mlir::tt::DataType::BFP_Float2) + .value("BFP_BFloat2", mlir::tt::DataType::BFP_BFloat2) + .value("UInt32", mlir::tt::DataType::UInt32) + .value("UInt16", mlir::tt::DataType::UInt16) + .value("UInt8", mlir::tt::DataType::UInt8); + + py::class_( + m, "InputLayoutOverrideParams") + .def(py::init<>()) + .def_property( + "operand_idxes", + [](const mlir::tt::ttnn::InputLayoutOverrideParams &obj) { + // Getter: Convert SmallVector to std::vector + return std::vector(obj.operandIdxes.begin(), + obj.operandIdxes.end()); + }, + [](mlir::tt::ttnn::InputLayoutOverrideParams &obj, + const std::vector &input) { + // Setter: Convert std::vector to SmallVector + obj.operandIdxes.clear(); + obj.operandIdxes.append(input.begin(), input.end()); + }); + + py::class_( + m, "OutputLayoutOverrideParams") + .def(py::init<>()) + .def_property( + "grid", + [](const mlir::tt::ttnn::OutputLayoutOverrideParams &obj) { + // Getter: Convert SmallVector to std::vector + return std::vector(obj.grid.begin(), obj.grid.end()); + }, + [](mlir::tt::ttnn::OutputLayoutOverrideParams &obj, + const std::vector &input) { + // Setter: Convert std::vector to SmallVector + obj.grid.clear(); + obj.grid.append(input.begin(), input.end()); + }) + .def_readwrite("buffer_type", + &mlir::tt::ttnn::OutputLayoutOverrideParams::bufferType) + .def_readwrite( + "tensor_memory_layout", + &mlir::tt::ttnn::OutputLayoutOverrideParams::tensorMemoryLayout) + .def_readwrite("memory_layout", + &mlir::tt::ttnn::OutputLayoutOverrideParams::memoryLayout) + .def_readwrite("data_type", + &mlir::tt::ttnn::OutputLayoutOverrideParams::dataType); +} + +} // namespace mlir::ttmlir::python diff --git a/python/TTMLIRModule.cpp b/python/TTMLIRModule.cpp index 9c4a4c81b5..0347da75b5 100644 --- a/python/TTMLIRModule.cpp +++ b/python/TTMLIRModule.cpp @@ -40,4 +40,7 @@ PYBIND11_MODULE(_ttmlir, m) { auto passes = m.def_submodule("passes", "Python-Bound Passes & Transformations"); mlir::ttmlir::python::populatePassesModule(passes); + auto optimizer_overrides = m.def_submodule( + "optimizer_overrides", "Python-Bound Optimizer Overrides"); + mlir::ttmlir::python::populateOptimizerOverridesModule(optimizer_overrides); } diff --git a/python/test_infra/test_optimizer_overrides.py b/python/test_infra/test_optimizer_overrides.py new file mode 100644 index 0000000000..68ea33c8e7 --- /dev/null +++ b/python/test_infra/test_optimizer_overrides.py @@ -0,0 +1,120 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + + +import ttmlir.optimizer_overrides as oo + +from ttmlir.optimizer_overrides import MemoryLayoutAnalysisPolicyType +from ttmlir.optimizer_overrides import BufferType +from ttmlir.optimizer_overrides import Layout +from ttmlir.optimizer_overrides import TensorMemoryLayout +from ttmlir.optimizer_overrides import DataType + + +def main(): + + print("\n\n ================ TESTING START ================ \n\n") + + # ----------------------------------------------------------------------------- # + # Instantiate the OptimizerOverridesHandler + # 
----------------------------------------------------------------------------- # + obj = oo.OptimizerOverridesHandler() + + # ----------------------------------------------------------------------------- # + # Test setters and getters + # ----------------------------------------------------------------------------- # + + # Enable Optimizer + obj.set_enable_optimizer(True) + print(f"Enable optimizer: {obj.get_enable_optimizer()}") + obj.set_enable_optimizer(False) + print(f"Enable optimizer: {obj.get_enable_optimizer()}") + + # Memory Reconfig + obj.set_memory_reconfig(True) + print(f"Memory Reconfig: {obj.get_memory_reconfig()}") + obj.set_memory_reconfig(False) + print(f"Memory Reconfig: {obj.get_memory_reconfig()}") + + # Enable Memory Layout Analysis + obj.set_enable_memory_layout_analysis(True) + print(f"Enable Memory Layout Analysis: {obj.get_enable_memory_layout_analysis()}") + obj.set_enable_memory_layout_analysis(False) + print(f"Enable Memory Layout Analysis: {obj.get_enable_memory_layout_analysis()}") + + # Enable Memory Layout Analysis Policy + obj.set_enable_memory_layout_analysis_policy(True) + print( + f"Enable Memory Layout Analysis Policy: {obj.get_enable_memory_layout_analysis_policy()}" + ) + obj.set_enable_memory_layout_analysis_policy(False) + print( + f"Enable Memory Layout Analysis Policy: {obj.get_enable_memory_layout_analysis_policy()}" + ) + + # Memory Layout Analysis Policy + obj.set_memory_layout_analysis_policy(MemoryLayoutAnalysisPolicyType.DFSharding) + print(f"Memory Layout Analysis Policy: {obj.get_memory_layout_analysis_policy()}") + obj.set_memory_layout_analysis_policy(MemoryLayoutAnalysisPolicyType.L1Interleaved) + print(f"Memory Layout Analysis Policy: {obj.get_memory_layout_analysis_policy()}") + + # System Descriptor Path + obj.set_system_desc_path("System Descriptor Path") + print(f"System Descriptor Path: {obj.get_system_desc_path()}") + + # Max Legal Layouts + obj.set_max_legal_layouts(10) + print(f"Max Legal Layouts: {obj.get_max_legal_layouts()}") + + # Mesh Shape + obj.set_mesh_shape([1, 2, 3]) + print(f"Mesh Shape: {obj.get_mesh_shape()}") + + # ----------------------------------------------------------------------------- # + # Test Input Layout and Output Layout + # ----------------------------------------------------------------------------- # + + # Input Layout + obj.add_input_layout_override("add", [1, 2]) + obj.add_input_layout_override("mul", [0, 1]) + obj.add_input_layout_override("sub", [2, 3]) + print(f"Input Layout: {obj.get_input_layout_overrides()}\n") + + # Output Layout + obj.add_output_layout_override( + "add", + [0, 1], + BufferType.DRAM, + TensorMemoryLayout.HeightSharded, + Layout.RowMajor, + DataType.Float16, + ) + obj.add_output_layout_override( + "mul", + [1, 2], + BufferType.L1, + TensorMemoryLayout.WidthSharded, + Layout.RowMajor, + DataType.BFloat16, + ) + obj.add_output_layout_override( + "sub", + [2, 3], + BufferType.SystemMemory, + TensorMemoryLayout.BlockSharded, + Layout.Tile, + DataType.UInt16, + ) + print(f"Output Layout: {obj.get_output_layout_overrides()}\n") + + # ----------------------------------------------------------------------------- # + # Test string method + # ----------------------------------------------------------------------------- # + print(f"Optimizer Override string: {obj.to_string()}") + + print("\n\n ================ TESTING END ================ \n\n") + + +if __name__ == "__main__": + main() diff --git a/python/ttmlir/optimizer_overrides.py b/python/ttmlir/optimizer_overrides.py new file mode 
100644 index 0000000000..880ab85cfc --- /dev/null +++ b/python/ttmlir/optimizer_overrides.py @@ -0,0 +1,5 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +from ._mlir_libs._ttmlir.optimizer_overrides import * From a5bc17a63e2223b67c8e8b24f820128d74ce072d Mon Sep 17 00:00:00 2001 From: Stefan Gligorijevic <189116645+sgligorijevicTT@users.noreply.github.com> Date: Mon, 9 Dec 2024 12:58:51 +0100 Subject: [PATCH 73/84] Add TOSA to TTIR conversion for MaxPool2D (#1477) * Add TOSA to TTIR conversion for MaxPool2D * replace auto with concrete types --- .../TosaToTTIR/TosaToTTIRPatterns.cpp | 43 +++++++++++++++++-- .../Conversion/TosaToTTIR/maxpool2d_op.mlir | 11 +++++ 2 files changed, 51 insertions(+), 3 deletions(-) create mode 100644 test/ttmlir/Conversion/TosaToTTIR/maxpool2d_op.mlir diff --git a/lib/Conversion/TosaToTTIR/TosaToTTIRPatterns.cpp b/lib/Conversion/TosaToTTIR/TosaToTTIRPatterns.cpp index 5768b840dd..607a083310 100644 --- a/lib/Conversion/TosaToTTIR/TosaToTTIRPatterns.cpp +++ b/lib/Conversion/TosaToTTIR/TosaToTTIRPatterns.cpp @@ -95,10 +95,11 @@ class TosaToTTIRMatmulOpConversionPattern if (!legalityResult.succeeded()) { return legalityResult; } - auto outputType = mlir::cast(srcOp.getResult().getType()); - auto outputTensor = rewriter.create( + RankedTensorType outputType = + mlir::cast(srcOp.getResult().getType()); + tensor::EmptyOp outputTensor = rewriter.create( srcOp.getLoc(), outputType.getShape(), outputType.getElementType()); - auto operands = adaptor.getOperands(); + ValueRange operands = adaptor.getOperands(); rewriter.replaceOpWithNewOp( srcOp, TypeRange(outputTensor.getType()), operands[0], operands[1], @@ -148,6 +149,35 @@ class TosaToTTIRReduceOpConversionPattern : public OpConversionPattern { } }; +class TosaToTTIRMaxPool2DOpConversionPattern + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + using Adaptor = tosa::MaxPool2dOp::Adaptor; + +public: + LogicalResult + matchAndRewrite(tosa::MaxPool2dOp srcOp, Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + auto outputType = mlir::cast(srcOp.getResult().getType()); + auto outputTensor = rewriter.create( + srcOp.getLoc(), outputType.getShape(), outputType.getElementType()); + + auto dims = srcOp.getKernelAttr(); + auto strides = srcOp.getStrideAttr(); + auto pad = srcOp.getPadAttr(); + rewriter.replaceOpWithNewOp( + srcOp, TypeRange(outputTensor.getType()), adaptor.getInput(), + outputTensor, dims[0], dims[1], strides[0], strides[1], 1, 1, false, + pad[2], pad[3], pad[0], pad[1], + rewriter.getArrayAttr( + SmallVector(adaptor.getOperands().size() + 1, + rewriter.getAttr( + OperandConstraint::AnyDeviceTile)))); + return success(); + } +}; + void addElementwiseUnaryOpsConversionPatterns(MLIRContext *ctx, RewritePatternSet &patterns, TypeConverter &typeConverter) { @@ -245,6 +275,12 @@ void addReductionOpsConversionPatterns(MLIRContext *ctx, mlir::tt::ttir::SumOp>>( typeConverter, ctx); } + +void addPoolingOpsConversionPatterns(MLIRContext *ctx, + RewritePatternSet &patterns, + TypeConverter &typeConverter) { + patterns.add(typeConverter, ctx); +} } // namespace namespace mlir::tt { @@ -258,6 +294,7 @@ void populateTosaToTTIRPatterns(MLIRContext *ctx, RewritePatternSet &patterns, addCompareOpsConversionPatterns(ctx, patterns, typeConverter); addMatmulOpsConversionPatterns(ctx, patterns, typeConverter); addReductionOpsConversionPatterns(ctx, patterns, typeConverter); + addPoolingOpsConversionPatterns(ctx, 
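
The kernel, stride, and pad attributes forwarded by the MaxPool2D pattern above fix the pooled output extents through the usual pooling arithmetic; a small reference helper, illustrative and not part of the dialect, spells out the numbers used in the test that follows:

```
#include <cstdint>

// out = (in + pad_before + pad_after - kernel) / stride + 1, per spatial dim.
// For the 32x800x600x6 input below with a 2x2 kernel, 2x2 stride and no
// padding: height (800 - 2) / 2 + 1 = 400, width (600 - 2) / 2 + 1 = 300.
int64_t pooledExtent(int64_t in, int64_t kernel, int64_t stride,
                     int64_t padBefore, int64_t padAfter) {
  return (in + padBefore + padAfter - kernel) / stride + 1;
}
```
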
patterns, typeConverter); } } // namespace mlir::tt diff --git a/test/ttmlir/Conversion/TosaToTTIR/maxpool2d_op.mlir b/test/ttmlir/Conversion/TosaToTTIR/maxpool2d_op.mlir new file mode 100644 index 0000000000..ff1ef5b4f6 --- /dev/null +++ b/test/ttmlir/Conversion/TosaToTTIR/maxpool2d_op.mlir @@ -0,0 +1,11 @@ +// RUN: ttmlir-opt --convert-tosa-to-ttir %s | FileCheck %s +module attributes {} { + func.func @test_maxpool(%arg0: tensor<32x800x600x6xf32>) -> tensor<32x400x300x6xf32> { + // CHECK: func.func {{.+}} [[IN_SIZE:tensor<[0-9]+x[0-9]+x[0-9]+x[0-9]+xf32>]]{{.*}} -> + %1 = tosa.max_pool2d %arg0 {kernel = array, pad = array, stride = array} : (tensor<32x800x600x6xf32>) -> tensor<32x400x300x6xf32> + // CHECK: %[[OP_OUT:[0-9]+]] = tensor.empty() : [[OUT_SIZE:tensor<[0-9]+x[0-9]+x[0-9]+x[0-9]+xf32>]] + // CHECK: %[[VAL:[0-9]+]] = "ttir.max_pool2d"(%arg{{[0-9]+}}, %[[OP_OUT]]){{.+}} ([[IN_SIZE]], [[OUT_SIZE]]) -> [[OUT_SIZE]] + // CHECK: return %[[VAL]] : [[OUT_SIZE]] + return %1 : tensor<32x400x300x6xf32> + } +} From 437f52101e4124e8753323c0f06a83c55f7b1e8f Mon Sep 17 00:00:00 2001 From: Andrej Jakovljevic Date: Mon, 9 Dec 2024 13:26:59 +0100 Subject: [PATCH 74/84] Deleting the asserts preventing reshaping of more than 4D tensors (#1524) --- lib/Dialect/TTIR/IR/TTIROps.cpp | 5 ----- lib/Dialect/TTNN/IR/TTNNOps.cpp | 5 ----- 2 files changed, 10 deletions(-) diff --git a/lib/Dialect/TTIR/IR/TTIROps.cpp b/lib/Dialect/TTIR/IR/TTIROps.cpp index 1e9ae04afc..44af2f2c4b 100644 --- a/lib/Dialect/TTIR/IR/TTIROps.cpp +++ b/lib/Dialect/TTIR/IR/TTIROps.cpp @@ -338,11 +338,6 @@ ::mlir::LogicalResult mlir::tt::ttir::ReshapeOp::verify() { return emitOpError("Shape attribute must be non-empty"); } - // Check that the shape attribute has at most 5 elements - if (shape_size > 5) { - return emitOpError("Shape attribute must have at most 5 elements"); - } - // Cardinality of the input and output tensors must be the same if (inputType.getNumElements() != outputType.getNumElements()) { return emitOpError( diff --git a/lib/Dialect/TTNN/IR/TTNNOps.cpp b/lib/Dialect/TTNN/IR/TTNNOps.cpp index 00fa36c278..cca75a7b26 100644 --- a/lib/Dialect/TTNN/IR/TTNNOps.cpp +++ b/lib/Dialect/TTNN/IR/TTNNOps.cpp @@ -303,11 +303,6 @@ ::mlir::LogicalResult mlir::tt::ttnn::ReshapeOp::verify() { return emitOpError("Shape attribute must be non-empty"); } - // Check that the shape attribute has at most 5 elements - if (shape_size > 5) { - return emitOpError("Shape attribute must have at most 5 elements"); - } - // Cardinality of the input and output tensors must be the same if (inputType.getNumElements() != outputType.getNumElements()) { return emitOpError( From 88cf9a326475a9cc530065044778f03a78d4e526 Mon Sep 17 00:00:00 2001 From: Muhammad Asif Manzoor Date: Mon, 9 Dec 2024 09:29:00 -0500 Subject: [PATCH 75/84] Update hard coded path for build process (#1531) * Use TTMLIR_TOOLCHAIN_DIR var instead of '/opt/ttmlir-toolchain' --- CMakeLists.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2927fb5602..bebff7a0fd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,7 +32,11 @@ if (TT_RUNTIME_ENABLE_PERF_TRACE) add_compile_options(-DTRACY_ENABLE=ON) endif() -add_compile_options(-Wall -Wextra -Wpedantic -Werror -Wno-unused-parameter --system-header-prefix=/opt/ttmlir-toolchain) +if (NOT DEFINED ENV{TTMLIR_TOOLCHAIN_DIR}) + message(FATAL_ERROR "TTMLIR_TOOLCHAIN_DIR environment variable not set. 
Please run 'source env/activate'.") +endif() + +add_compile_options(-Wall -Wextra -Wpedantic -Werror -Wno-unused-parameter --system-header-prefix=ENV{TTMLIR_TOOLCHAIN_DIR}) include(TTMLIRBuildTypes) @@ -46,10 +50,6 @@ set(Python3_EXECUTABLE $ENV{TTMLIR_VENV_DIR}/bin/python3) include(FindMLIR) include(TTMLIRVersion) -if (NOT DEFINED ENV{TTMLIR_TOOLCHAIN_DIR}) - message(FATAL_ERROR "TTMLIR_TOOLCHAIN_DIR environment variable not set. Please run 'source env/activate'.") -endif() - set(TTMLIR_TOOLCHAIN_DIR $ENV{TTMLIR_TOOLCHAIN_DIR}) set(TTMLIR_SOURCE_DIR ${PROJECT_SOURCE_DIR}) set(TTMLIR_BINARY_DIR ${PROJECT_BINARY_DIR}) From 9c416941ffab3b099117433215927012c33330c0 Mon Sep 17 00:00:00 2001 From: Vladimir Milosevic <157983820+vmilosevic@users.noreply.github.com> Date: Mon, 9 Dec 2024 15:36:46 +0100 Subject: [PATCH 76/84] Uplift third_party/tt-metal to 7768e89929915f40f7fdb45e352dd4b83f335168 2024-12-09 (#1535) * Uplift third_party/tt-metal to 7768e89929915f40f7fdb45e352dd4b83f335168 2024-12-09 * Remove obsolete comment missed previous uplift in tools/ttnn-standalone/CMakeLists.txt --------- Co-authored-by: kmitrovicTT <169657397+kmitrovicTT@users.noreply.github.com> Co-authored-by: Kyle Mabee --- third_party/CMakeLists.txt | 2 +- tools/ttnn-standalone/CMakeLists.txt | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt index acb8983839..e4c17bae47 100644 --- a/third_party/CMakeLists.txt +++ b/third_party/CMakeLists.txt @@ -1,6 +1,6 @@ include(ExternalProject) -set(TT_METAL_VERSION "8ee1f823ccc6680fd4ba5aa9004487692da1c545") +set(TT_METAL_VERSION "7768e89929915f40f7fdb45e352dd4b83f335168") if ("$ENV{ARCH_NAME}" STREQUAL "grayskull") set(ARCH_NAME "grayskull") diff --git a/tools/ttnn-standalone/CMakeLists.txt b/tools/ttnn-standalone/CMakeLists.txt index bc22737ee9..23c78c7ca9 100644 --- a/tools/ttnn-standalone/CMakeLists.txt +++ b/tools/ttnn-standalone/CMakeLists.txt @@ -91,10 +91,6 @@ set(LINK_LIBS yaml-cpp pthread - # The below libs have been added to tt-metal repo at some point, but are not - # currently needed by the targets here - leaving them commented here for - # reference - # TTNN # _ttnn # Why doesn't this work? 
$ENV{TT_METAL_HOME}-build/lib/_ttnn.so From 32e97e06d9470f982933f7f6092e202357bdfa73 Mon Sep 17 00:00:00 2001 From: Muhammad Asif Manzoor Date: Mon, 9 Dec 2024 10:59:59 -0500 Subject: [PATCH 77/84] Update pytorch version for ttrt (#1518) --- runtime/tools/python/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime/tools/python/requirements.txt b/runtime/tools/python/requirements.txt index 8bfab8347d..b427d78493 100644 --- a/runtime/tools/python/requirements.txt +++ b/runtime/tools/python/requirements.txt @@ -1 +1 @@ -torch==2.3.0 --index-url https://download.pytorch.org/whl/cpu +torch==2.5.0 --index-url https://download.pytorch.org/whl/cpu From d8cc464b6d777ba5bb6f4910bf02ec810e301607 Mon Sep 17 00:00:00 2001 From: Andrej Jakovljevic Date: Mon, 9 Dec 2024 17:11:46 +0100 Subject: [PATCH 78/84] Add support for 1d convolution in ttir and ttnn mlir dialects (#1438) * Added support for 1d convolution * Added tests * Addressed comments * Addressed more comments --- .../TTIRToTTIRDecomposition.cpp | 227 +++++++++++++++--- lib/Dialect/TTNN/Transforms/TTNNLayout.cpp | 20 ++ .../Conversion/StableHLOToTTIR/conv2d_op.mlir | 24 +- .../TTNN/convolution/simple_conv1d.mlir | 17 ++ 4 files changed, 256 insertions(+), 32 deletions(-) create mode 100644 test/ttmlir/Dialect/TTNN/convolution/simple_conv1d.mlir diff --git a/lib/Conversion/TTIRToTTIRDecomposition/TTIRToTTIRDecomposition.cpp b/lib/Conversion/TTIRToTTIRDecomposition/TTIRToTTIRDecomposition.cpp index ed7eb0be82..9ba4257428 100644 --- a/lib/Conversion/TTIRToTTIRDecomposition/TTIRToTTIRDecomposition.cpp +++ b/lib/Conversion/TTIRToTTIRDecomposition/TTIRToTTIRDecomposition.cpp @@ -224,34 +224,22 @@ static std::vector generateConvKernelTransposeIndices( return generateTransposeIndices(kernelLayout, ttnnConvolutionKernelLayout); } -struct ConvolutionToConv2dPattern +struct ConvolutionDecompositionPattern : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; - constexpr static uint32_t numSpatialDims = 2; - constexpr static uint32_t SPATIAL_DIM_HEIGHT = 0; - constexpr static uint32_t SPATIAL_DIM_WIDTH = 1; - - // NHWC - static inline const std::vector conv2dLayout = { - ConvolutionDimension::BATCH, - SPATIAL_DIM_HEIGHT, - SPATIAL_DIM_WIDTH, - ConvolutionDimension::FEATURE, - }; - // OIHW - static inline const std::vector conv2dKernelLayout = { - ConvolutionKernelDimension::OUTPUT_FEATURES, - ConvolutionKernelDimension::INPUT_FEATURES, - SPATIAL_DIM_HEIGHT, - SPATIAL_DIM_WIDTH, - }; - - LogicalResult isConv2d(ttir::ConvolutionOp op) const { + LogicalResult + matchAndRewrite(ttir::ConvolutionOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override = 0; - // Conv2d will have 2 spatial dimensions +protected: + bool isNDimensional(ttir::ConvolutionOp op, uint32_t numSpatialDims) const { + return op.getConvolutionLayout().getInputSpatialDimensions().size() == + numSpatialDims; + } + bool isSupportedConv(ttir::ConvolutionOp op) const { assert(op.getConvolutionLayout().getInputSpatialDimensions().size() == op.getConvolutionLayout().getOutputSpatialDimensions().size() && "Convolution input, output, and kernel must have the same number of " @@ -261,33 +249,211 @@ struct ConvolutionToConv2dPattern "Convolution input, output, and kernel must have the same number of " "spatial dimensions"); - if (op.getConvolutionLayout().getInputSpatialDimensions().size() != - numSpatialDims) { - return failure(); - } - // Not currently supporting window reversal std::vector 
windowReversal(op.getWindowReversal().begin(), op.getWindowReversal().end()); for (bool reversed : windowReversal) { if (reversed) { - return failure(); + return false; } } // Not currently support batch groups if (op.getBatchGroupCount() != 1) { + return false; + } + + return true; + } +}; + +// A decompostion pattern that matches to a ttir.convolution op that does 1D +// convolution. Since that is not supported in ttnn, we reshape the inputs and +// the output to match a 2D ttir.convolution op. The expectation is that the new +// ttir.convolution op will be picked up by the ConvolutionToConv2dPattern and +// translated into ttir.conv2d op. +struct Legalize1DConvolutionPattern : public ConvolutionDecompositionPattern { +public: + using ConvolutionDecompositionPattern::ConvolutionDecompositionPattern; + constexpr static uint32_t numSpatialDims = 1; + + LogicalResult + matchAndRewrite(ttir::ConvolutionOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + if (!(isSupportedConv(op) && isNDimensional(op, numSpatialDims))) { + return failure(); + } + + // Not currently supporting spatial dims other than 2 for the 1D case. + if (op.getConvolutionLayout().getInputSpatialDimensions()[0] != 2) { return failure(); } + // The shapes that the convolution currently operates with have are 3D, and + // we need to add another dimension for it to match the conv2d signature, so + // adding a dimension of size 1 to the end of input and output shapes. + auto outputType = + mlir::cast(adaptor.getOutput().getType()); + llvm::ArrayRef outputShape = outputType.getShape(); + llvm::SmallVector conv2dOutputShape(outputShape.begin(), + outputShape.end()); + conv2dOutputShape.push_back(1); + auto DPSConv2dOutput = rewriter.create( + op->getLoc(), conv2dOutputShape, outputType.getElementType()); + auto conv2dOutputType = + mlir::cast(DPSConv2dOutput.getType()); + + auto inputType = mlir::cast(adaptor.getInput().getType()); + llvm::ArrayRef inputShape = inputType.getShape(); + llvm::SmallVector reshapeInputShape(inputShape.begin(), + inputShape.end()); + reshapeInputShape.push_back(1); + + auto weightType = + mlir::cast(adaptor.getWeight().getType()); + llvm::ArrayRef weightShape = weightType.getShape(); + llvm::SmallVector reshapeWeightShape(weightShape.begin(), + weightShape.end()); + reshapeWeightShape.push_back(1); + + ttir::ReshapeOp reshapeInput = + createReshapeOp(op.getLoc(), adaptor.getInput(), reshapeInputShape, + op.getOperandConstraints(), rewriter); + ttir::ReshapeOp reshapeWeight = + createReshapeOp(op.getLoc(), adaptor.getWeight(), reshapeWeightShape, + op.getOperandConstraints(), rewriter); + + mlir::DenseI64ArrayAttr conv2dOpWindowsStridesAttr = + addIntegerToDenseArrayAttr(rewriter, adaptor.getWindowStridesAttr(), 1); + mlir::DenseI64ArrayAttr conv2dOpPaddingAttr = + addIntegerToDenseArrayAttr(rewriter, adaptor.getPaddingAttr(), 0); + conv2dOpPaddingAttr = + addIntegerToDenseArrayAttr(rewriter, conv2dOpPaddingAttr, 0); + mlir::DenseI64ArrayAttr conv2dOpInputDilationAttr = + addIntegerToDenseArrayAttr(rewriter, adaptor.getInputDilationAttr(), 1); + mlir::DenseI64ArrayAttr conv2dOpWeightDilationAttr = + addIntegerToDenseArrayAttr(rewriter, adaptor.getWeightDilationAttr(), + 1); + mlir::DenseBoolArrayAttr conv2dOpWindowReversalAttr = + addBooleanToDenseArrayAttr(rewriter, adaptor.getWindowReversalAttr(), + false); + + auto convolutionLayout = adaptor.getConvolutionLayoutAttr(); + + // The additional spatial dimension is added at the and (3rd in 0 indexed + // array). 
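
Concretely, for the 1-D convolution exercised by the tests later in this patch, the reshapes and attribute extensions above amount to the following bookkeeping; the shape values are copied from those tests and the helper name is illustrative:

```
#include <cstdint>
#include <vector>

// 1-D convolution promoted to 2-D:
//   input  {1, 256, 512}  -> {1, 256, 512, 1}
//   weight {1024, 256, 1} -> {1024, 256, 1, 1}
//   output {1, 1024, 512} <- reshaped back from {1, 1024, 512, 1}
// window_strides [1] -> [1, 1], padding [0, 0] -> [0, 0, 0, 0],
// dilations [1] -> [1, 1], spatial dimensions [2] -> [2, 3].
std::vector<int64_t> withTrailingUnitDim(std::vector<int64_t> shape) {
  shape.push_back(1); // the extra unit spatial dimension the conv2d path needs
  return shape;
}
```
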
+ llvm::SmallVector conv2dInputSpatialDimensions( + convolutionLayout.getInputSpatialDimensions().begin(), + convolutionLayout.getInputSpatialDimensions().end()); + conv2dInputSpatialDimensions.push_back(3); + + llvm::SmallVector conv2dKernelSpatialDimensions( + convolutionLayout.getKernelSpatialDimensions().begin(), + convolutionLayout.getKernelSpatialDimensions().end()); + conv2dKernelSpatialDimensions.push_back(3); + + llvm::SmallVector conv2dOutputSpatialDimensions( + convolutionLayout.getOutputSpatialDimensions().begin(), + convolutionLayout.getOutputSpatialDimensions().end()); + conv2dOutputSpatialDimensions.push_back(3); + + mlir::tt::ttir::ConvolutionOp new2dConvolutionOp = + rewriter.create( + op.getLoc(), conv2dOutputType, reshapeInput, reshapeWeight, + mlir::Value(nullptr), DPSConv2dOutput, conv2dOpWindowsStridesAttr, + conv2dOpPaddingAttr, conv2dOpInputDilationAttr, + conv2dOpWeightDilationAttr, conv2dOpWindowReversalAttr, + mlir::tt::ttir::ConvolutionLayoutAttr::get( + getContext(), convolutionLayout.getInputBatchDimension(), + convolutionLayout.getInputFeatureDimension(), + conv2dInputSpatialDimensions, + convolutionLayout.getKernelOutputFeatureDimension(), + convolutionLayout.getKernelInputFeatureDimension(), + conv2dKernelSpatialDimensions, + convolutionLayout.getOutputBatchDimension(), + convolutionLayout.getOutputFeatureDimension(), + conv2dOutputSpatialDimensions), + adaptor.getFeatureGroupCountAttr(), + adaptor.getBatchGroupCountAttr(), + rewriter.getArrayAttr( + SmallVector(adaptor.getOperands().size() + 1, + rewriter.getAttr( + OperandConstraint::AnyDeviceTile)))); + ttir::ReshapeOp reshapeOutput = + createReshapeOp(op.getLoc(), new2dConvolutionOp, outputShape, + op.getOperandConstraints(), rewriter); + + rewriter.replaceOp(op, reshapeOutput); + return success(); } +private: + ttir::ReshapeOp createReshapeOp(Location loc, Value tensor, + llvm::ArrayRef target_input_shape, + ::mlir::ArrayAttr constraints, + ConversionPatternRewriter &rewriter) const { + auto inputType = mlir::cast(tensor.getType()); + + auto DPSReshapeOutput = rewriter.create( + loc, llvm::ArrayRef(target_input_shape), + inputType.getElementType()); + llvm::SmallVector shapei32(target_input_shape.begin(), + target_input_shape.end()); + auto shape_attr = rewriter.getI32ArrayAttr(shapei32); + + return rewriter.create( + loc, + mlir::RankedTensorType::get(target_input_shape, + inputType.getElementType()), + tensor, DPSReshapeOutput, shape_attr, constraints); + } + + mlir::DenseI64ArrayAttr + addIntegerToDenseArrayAttr(ConversionPatternRewriter &rewriter, + mlir::DenseI64ArrayAttr denseArrayAttr, + uint64_t integerValue) const { + llvm::SmallVector newDenseArray(denseArrayAttr.asArrayRef()); + newDenseArray.push_back(integerValue); + return rewriter.getDenseI64ArrayAttr(newDenseArray); + } + + mlir::DenseBoolArrayAttr + addBooleanToDenseArrayAttr(ConversionPatternRewriter &rewriter, + mlir::DenseBoolArrayAttr denseArrayAttr, + bool booleanValue) const { + llvm::SmallVector newDenseArray(denseArrayAttr.asArrayRef()); + newDenseArray.push_back(booleanValue); + return rewriter.getDenseBoolArrayAttr(newDenseArray); + } +}; +struct ConvolutionToConv2dPattern : public ConvolutionDecompositionPattern { +public: + using ConvolutionDecompositionPattern::ConvolutionDecompositionPattern; + + constexpr static uint32_t numSpatialDims = 2; + constexpr static uint32_t SPATIAL_DIM_HEIGHT = 0; + constexpr static uint32_t SPATIAL_DIM_WIDTH = 1; + + // NHWC + static inline const std::vector conv2dLayout = { + 
ConvolutionDimension::BATCH, + SPATIAL_DIM_HEIGHT, + SPATIAL_DIM_WIDTH, + ConvolutionDimension::FEATURE, + }; + // OIHW + static inline const std::vector conv2dKernelLayout = { + ConvolutionKernelDimension::OUTPUT_FEATURES, + ConvolutionKernelDimension::INPUT_FEATURES, + SPATIAL_DIM_HEIGHT, + SPATIAL_DIM_WIDTH, + }; + LogicalResult matchAndRewrite(ttir::ConvolutionOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - - if (failed(isConv2d(op))) { + if (!(isSupportedConv(op) && isNDimensional(op, numSpatialDims))) { return failure(); } @@ -1039,6 +1205,7 @@ void populateTTIRToTTIRDecompositionPatterns(MLIRContext *ctx, TypeConverter &typeConverter) { patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); diff --git a/lib/Dialect/TTNN/Transforms/TTNNLayout.cpp b/lib/Dialect/TTNN/Transforms/TTNNLayout.cpp index 9036346a43..712e12ad08 100644 --- a/lib/Dialect/TTNN/Transforms/TTNNLayout.cpp +++ b/lib/Dialect/TTNN/Transforms/TTNNLayout.cpp @@ -279,6 +279,20 @@ createToLayoutOp(PatternRewriter &rewriter, Location loc, Value input, ttnnMemoryLayoutAttr, tiled); } +static bool changeLayoutToHost(DestinationStyleOpInterface &op, + OpOperand &operand, PatternRewriter &rewriter) { + Location newLoc = appendInputSuffix(op.getLoc(), operand.getOperandNumber()); + std::optional layout = + createToLayoutOp(rewriter, newLoc, operand.get(), + BufferType::SystemMemory, nullptr, false /* tiled */); + if (layout.has_value()) { + rewriter.modifyOpInPlace( + op, [&]() { op->setOperand(operand.getOperandNumber(), *layout); }); + return true; + } + return false; +} + // Updates the layout of the operands of a TTIR ops which have DPS operands. // This function rewrites the operands and result to have the correct layout // with respect to operand constraints. @@ -304,6 +318,12 @@ class TTNNLayoutDPSOperandsRewriter // TTNN Conv2d moves input, weight, and bias from host to device // itself. Inserting the ToLayoutOp on these operands is thus problematic. if (mlir::isa(op.getOperation()) && !isResult) { + // For the weight input of the conv2d op, it specifically needs to be on + // host, so we create a host to layout op (issue + // https://github.com/tenstorrent/tt-mlir/issues/1528). 
+ if (operand.getOperandNumber() == 1) { + modified = changeLayoutToHost(op, operand, rewriter); + } continue; } diff --git a/test/ttmlir/Conversion/StableHLOToTTIR/conv2d_op.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/conv2d_op.mlir index ce4a6f6565..0c41398cd1 100644 --- a/test/ttmlir/Conversion/StableHLOToTTIR/conv2d_op.mlir +++ b/test/ttmlir/Conversion/StableHLOToTTIR/conv2d_op.mlir @@ -2,6 +2,8 @@ // RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | FileCheck %s module @jit_convolution attributes {} { func.func public @test_convolution(%arg0: tensor<1x128x128x32xf32>, %arg1: tensor<64x32x3x3xf32>) -> tensor<1x128x128x64xf32> { + // CHECK: %[[C:.*]] = tensor.empty[[C:.*]] + // CHECK: %[[C:.*]] = "ttir.convolution"[[C:.*]] %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[o, i, 0, 1]->[b, 0, 1, f], window = { @@ -12,8 +14,26 @@ module @jit_convolution attributes {} { batch_group_count = 1 : i64, precision_config = [#stablehlo, #stablehlo] } : (tensor<1x128x128x32xf32>, tensor<64x32x3x3xf32>) -> tensor<1x128x128x64xf32> - // CHECK: %[[C:.*]] = tensor.empty[[C:.*]] - // CHECK: %[[C:.*]] = "ttir.convolution"[[C:.*]] return %0 : tensor<1x128x128x64xf32> } + + // Tests 1d convolution that gets translated to 2d. + func.func @test_convolution_1d(%arg0: tensor<1x256x512xf32>, %arg1: tensor<1024x256x1xf32>) -> tensor<1x1024x512xf32> { + // CHECK: [[VAL0:%[0-9]+]] = tensor.empty() : [[TENSOR_SIZE:tensor<[0-9]+x[0-9]+x[0-9]+xf[0-9]+>]] + // CHECK: %1 = "ttir.convolution"(%arg0, %arg1, [[VAL0]]) + // CHECK: batch_group_count = 1 : i64, convolution_layout = #ttir, weight_dilation = array, window_reversal = array, window_strides = array + // CHECK: : (tensor<1x256x512xf32>, tensor<1024x256x1xf32>, [[TENSOR_SIZE]]) -> [[TENSOR_SIZE]] + %0 = stablehlo.convolution(%arg0, %arg1) + dim_numbers = [b, f, 0]x[o, i, 0]->[b, f, 0], + window = { + stride = [1], + pad = [[0, 0]], + rhs_dilate = [1] + } { + batch_group_count = 1 : i64, + feature_group_count = 1 : i64 + } : (tensor<1x256x512xf32>, tensor<1024x256x1xf32>) -> tensor<1x1024x512xf32> + return %0 : tensor<1x1024x512xf32> + } } diff --git a/test/ttmlir/Dialect/TTNN/convolution/simple_conv1d.mlir b/test/ttmlir/Dialect/TTNN/convolution/simple_conv1d.mlir new file mode 100644 index 0000000000..8f75362a02 --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/convolution/simple_conv1d.mlir @@ -0,0 +1,17 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline %s | FileCheck %s +#any_device_tile = #tt.operand_constraint +module { + func.func @main(%arg0: tensor<1x256x512xf32>, %arg1: tensor<1024x256x1xf32>, %arg2: tensor<1024xf32>) -> tensor<1x1024x512xf32> { + %0 = tensor.empty() : tensor<1x1024x512xf32> + // CHECK: [[VAL0:%[0-9]+]] = "ttnn.reshape"(%{{.*}}) <{shape = [1 : i32, 256 : i32, 512 : i32, 1 : i32]}> : (tensor<[[TENSOR_SHAPE0:[0-9]+x[0-9]+x[0-9]+xf32]], #{{.*}}) -> tensor<[[TENSOR_SHAPE1:[0-9]+x[0-9]+x[0-9]+x1xf32]], #{{.*}}> + // CHECK: [[VAL1:%[0-9]+]] = "ttnn.reshape"(%{{.*}}) <{shape = [1024 : i32, 256 : i32, 1 : i32, 1 : i32]}> : (tensor<[[TENSOR_SHAPE2:[0-9]+x[0-9]+x[0-9]+xf32]], #{{.*}}>) -> tensor<[[TENSOR_SHAPE3:[0-9]+x[0-9]+x[0-9]+x1xf32]], #{{.*}}> + // CHECK: [[VAL2:%[0-9]+]] = "ttnn.transpose"([[VAL0]]) <{dim0 = 1 : si32, dim1 = 2 : si32}> : (tensor<[[TENSOR_SHAPE1]], #{{.*}}>) -> tensor<[[TENSOR_SHAPE4:[0-9]+x[0-9]+x[0-9]+x1xf32]], #{{.*}}> + // CHECK: [[VAL3:%[0-9]+]] = "ttnn.transpose"([[VAL2]]) <{dim0 = 2 : si32, dim1 = 3 : si32}> : (tensor<[[TENSOR_SHAPE4]], #{{.*}}>) -> 
tensor<[[TENSOR_SHAPE5:[0-9]+x[0-9]+x[0-9]+x[0-9]+xf32]], #{{.*}}> + // CHECK: [[VAL4:%[0-9]+]] = "ttnn.reshape"([[VAL3]]) <{shape = [1 : i32, 1 : i32, 512 : i32, 256 : i32]}> : (tensor<[[TENSOR_SHAPE5]], #{{.*}}>) -> tensor<[[TENSOR_SHAPE6:[0-9]+x[0-9]+x[0-9]+x[0-9]+xf32]], #{{.*}}> + // CHECK: [[VAL5:%[0-9]+]] = "ttnn.conv2d"([[VAL4]], %10, %{{[0-9]+}}, %{{[0-9]+}}) + // CHECK: (tensor<[[TENSOR_SHAPE6]], #{{.*}}>, tensor<1024x256x1x1xf32, #{{.*}}>, tensor<1x1x512x1024xf32, #{{.*}}>, !tt.device<#device>) -> tensor<1x1x512x1024xf32, #{{.*}}> + %1 = "ttir.convolution"(%arg0, %arg1, %0) <{batch_group_count = 1 : i64, convolution_layout = #ttir, feature_group_count = 1 : i64, input_dilation = array, operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile], padding = array, weight_dilation = array, window_reversal = array, window_strides = array}> : (tensor<1x256x512xf32>, tensor<1024x256x1xf32>, tensor<1x1024x512xf32>) -> tensor<1x1024x512xf32> + // CHECK: return %{{.*}} : tensor<1x1024x512xf32, #ttnn_layout3> + return %1 : tensor<1x1024x512xf32> + } +} From 6c4a4fac829af58fa44316b5dcbadf9ff7c9086c Mon Sep 17 00:00:00 2001 From: Sasa Vuckovic <134393361+svuckovicTT@users.noreply.github.com> Date: Mon, 9 Dec 2024 17:14:20 +0100 Subject: [PATCH 79/84] Add pass to create input tensor generator functions for emitc path (#1523) --- .../ttmlir/Dialect/TTNN/Transforms/Passes.td | 39 ++++ include/ttmlir/Dialect/TTNN/Utils/Utils.h | 4 + lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp | 28 +++ lib/Dialect/TTNN/Transforms/Passes.cpp | 195 +++++++++++++++++- lib/Dialect/TTNN/Utils/Utils.cpp | 12 -- .../Transforms/ttnn_create_input_gens_0.mlir | 36 ++++ 6 files changed, 300 insertions(+), 14 deletions(-) create mode 100644 test/ttmlir/Dialect/TTNN/Transforms/ttnn_create_input_gens_0.mlir diff --git a/include/ttmlir/Dialect/TTNN/Transforms/Passes.td b/include/ttmlir/Dialect/TTNN/Transforms/Passes.td index 444927e348..13253d131d 100644 --- a/include/ttmlir/Dialect/TTNN/Transforms/Passes.td +++ b/include/ttmlir/Dialect/TTNN/Transforms/Passes.td @@ -36,4 +36,43 @@ def TTNNWorkarounds : Pass<"ttnn-workaround", "::mlir::ModuleOp"> { }]; } +def TTNNCreateInputGenerators: Pass<"ttnn-create-input-gens", "::mlir::ModuleOp"> { + let summary = "Create input generators for the forward functions."; + let description = [{ + This pass creates input generators for the "forward" functions. It + additionally creates a main function to run the forward function with the + generated inputs. + + The pass is useful for EmitC path. By creating input generators before + converting to Emitc Dialect, followed by transformation to C++ code, the + resulting code won't require any edits to run. 
+ + Given a forward function like this: + + ``` + func.func @add(%arg0: tensor<32x32xbf16>, %arg1: tensor<32x32xbf16>) -> tensor<32x32xbf16> { + %0 = "ttnn.add"(%arg0, %arg1) : (tensor<32x32xbf16>, tensor<32x32xbf16>) -> tensor<32x32xbf16> + return %0 : tensor<32x32xbf16> + } + ``` + + The pass will create two function like this: + + ``` + func.func @createInputsFor_add() -> (tensor<32x32xbf16>, tensor<32x32xbf16>) { + %0 = "ttnn.empty"() <{shape = #ttnn.shape<32x32>}> : () -> tensor<32x32xbf16> + %1 = "ttnn.empty"() <{shape = #ttnn.shape<32x32>}> : () -> tensor<32x32xbf16> + return %0, %1 : tensor<32x32xbf16>, tensor<32x32xbf16> + } + + func.func @main() -> i32 { + %0:2 = call @createInputsFor_add() : () -> (tensor<32x32xbf16>, tensor<32x32xbf16>) + %1 = call @add(%0#0, %0#1) : (tensor<32x32xbf16>, tensor<32x32xbf16>) -> tensor<32x32xbf16> + %c0_i32 = arith.constant 0 : i32 + return %c0_i32 : i32 + } + ``` + }]; +} + #endif diff --git a/include/ttmlir/Dialect/TTNN/Utils/Utils.h b/include/ttmlir/Dialect/TTNN/Utils/Utils.h index f214fa793d..2c4b7a2508 100644 --- a/include/ttmlir/Dialect/TTNN/Utils/Utils.h +++ b/include/ttmlir/Dialect/TTNN/Utils/Utils.h @@ -35,6 +35,10 @@ mlir::tt::TensorMemoryLayout toTTTensorMemoryLayout( mlir::tt::MemorySpace toTTMemorySpace(const mlir::tt::ttnn::BufferType bufferType); +// Get Layout from MemRefType +// +Layout getLayoutFromMemRef(mlir::MemRefType memref); + mlir::Type createRowMajorTypeFromDtype(::mlir::MLIRContext *context, DataType dtype); diff --git a/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp b/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp index 3986438e64..aedad4d290 100644 --- a/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp +++ b/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp @@ -9,6 +9,7 @@ #include "ttmlir/Dialect/TTNN/IR/TTNNOps.h" #include "ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.h" +#include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/EmitC/IR/EmitC.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/Func/Transforms/FuncConversions.h" @@ -616,6 +617,29 @@ class DeallocateOpConversionPattern } }; +// arith::ConstantOp conversion pattern +// +class ArithConstantOpConversionPattern + : public OpConversionPattern { + +public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(arith::ConstantOp constOp, arith::ConstantOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + Type newTy = this->getTypeConverter()->convertType(constOp.getType()); + if (!newTy) { + return rewriter.notifyMatchFailure(constOp, "type conversion failed"); + } + + rewriter.replaceOpWithNewOp(constOp, newTy, + adaptor.getValue()); + return success(); + } +}; + // Module Op conversion pattern // // This conversion pattern removes attributes from the ModuleOp. 
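
The arith.constant support matters because the input-generator pass added next in this series synthesizes a main that returns a constant 0. A distilled sketch of that builder sequence, using the standard func and arith APIs with illustrative names:

```
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"

using namespace mlir;

// Create an empty function that just returns i32 0, mirroring how the
// generator pass builds its synthesized entry points.
func::FuncOp createReturnZeroFunc(OpBuilder &builder, ModuleOp module,
                                  StringRef name) {
  builder.setInsertionPointToEnd(module.getBody());
  FunctionType fnType = builder.getFunctionType(
      /*inputs=*/{}, /*results=*/{builder.getI32Type()});
  auto fn = builder.create<func::FuncOp>(module.getLoc(), name, fnType);

  Block *entry = fn.addEntryBlock();
  builder.setInsertionPointToStart(entry);
  Value zero = builder.create<arith::ConstantOp>(
      module.getLoc(), builder.getI32Type(), builder.getI32IntegerAttr(0));
  builder.create<func::ReturnOp>(module.getLoc(), zero);
  return fn;
}
```
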
Previously, @@ -762,6 +786,10 @@ void populateTTNNToEmitCPatterns(mlir::MLIRContext *ctx, ctx); patterns.add>(typeConverter, ctx); + + // Arith ops + // + patterns.add(typeConverter, ctx); } } // namespace mlir::tt diff --git a/lib/Dialect/TTNN/Transforms/Passes.cpp b/lib/Dialect/TTNN/Transforms/Passes.cpp index 01971b6c61..c842c4075b 100644 --- a/lib/Dialect/TTNN/Transforms/Passes.cpp +++ b/lib/Dialect/TTNN/Transforms/Passes.cpp @@ -3,15 +3,27 @@ // SPDX-License-Identifier: Apache-2.0 #include "ttmlir/Dialect/TTNN/Transforms/Passes.h" + +#include "ttmlir/Dialect/TTNN/IR/TTNNOps.h" +#include "ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.h" +#include "ttmlir/Dialect/TTNN/IR/TTNNOpsTypes.h" +#include "ttmlir/Dialect/TTNN/Utils/Utils.h" + #include "mlir/Analysis/Liveness.h" +#include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/PatternMatch.h" -#include "ttmlir/Dialect/TTNN/IR/TTNNOpsTypes.h" -#include "ttmlir/Dialect/TTNN/Utils/Utils.h" +#include "mlir/IR/TypeRange.h" +#include "mlir/IR/ValueRange.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" namespace mlir::tt::ttnn { #define GEN_PASS_DEF_TTNNDEALLOCATE #define GEN_PASS_DEF_TTNNDECOMPOSELAYOUTS +#define GEN_PASS_DEF_TTNNCREATEINPUTGENERATORS #include "ttmlir/Dialect/TTNN/Transforms/Passes.h.inc" class TTNNDeallocate : public impl::TTNNDeallocateBase { @@ -873,4 +885,183 @@ class TTNNDecomposeLayouts } }; +class TTNNCreateInputGenerators + : public impl::TTNNCreateInputGeneratorsBase { + +public: + using impl::TTNNCreateInputGeneratorsBase< + TTNNCreateInputGenerators>::TTNNCreateInputGeneratorsBase; + + void runOnOperation() final { + ModuleOp module = getOperation(); + IRRewriter rewriter(&getContext()); + + // Ensure that the module has a single region and a single block within that + // region + assert(module->getRegions().size() == 1); + assert(module->getRegion(0).getBlocks().size() == 1); + + // Get the first block of the region at index 0 + // + Block *firstBlock = module.getBody(0); + + // Find all the func.func ops in the module + // + SmallVector forwardFuncOps; + for (mlir::Operation &op : firstBlock->getOperations()) { + if (mlir::func::FuncOp funcOp = dyn_cast(op)) { + + // Skip functions that are called elsewhere in the IR + // + // This will skip utility functions that are used by other functions, + // only top-level "forward" functions should be considered + // + if (!funcOp->getUses().empty()) { + continue; + } + + forwardFuncOps.push_back(funcOp); + } + } + + // Iterate over all the func ops and add input tensor generator functions + // + for (mlir::func::FuncOp forwardFuncOp : forwardFuncOps) { + // Get all the input tensors for the current forward func + // + llvm::SmallVector inputTensors; + for (auto input : forwardFuncOp.getFunctionType().getInputs()) { + inputTensors.push_back(llvm::cast(input)); + } + + // Create a new function that will generate the input tensors + // + std::string inputGenFuncName = + "createInputsFor_" + forwardFuncOp.getName().str(); + + // Create function type + // + mlir::TypeRange returnTypeRange = + mlir::TypeRange(forwardFuncOp.getFunctionType().getInputs()); + FunctionType functionType = + mlir::FunctionType::get(&getContext(), {}, returnTypeRange); + + // Set insertion point to end of first block + // + rewriter.setInsertionPointToEnd(firstBlock); + + // Create the function + // + func::FuncOp inputGenFuncOp = rewriter.create( + module->getLoc(), 
inputGenFuncName, functionType); + + // Add a Block to func op and set insertion point to the beginning of the + // Block + // + ::mlir::Block *currFnBlock = inputGenFuncOp.addEntryBlock(); + rewriter.setInsertionPointToStart(currFnBlock); + + // Create the input tensors + // + SmallVector generatedTensors; + for (Type tensorType : returnTypeRange) { + assert(llvm::isa(tensorType)); + + RankedTensorType tensor = + llvm::cast(tensorType); + + // Get the layout attribute + // + ttnn::TTNNLayoutAttr layoutAttr = + mlir::cast(tensor.getEncoding()); + + // Get the shape of the tensor, tensor layout, and data type + // + ShapeAttr shapeAttr = + ttnn::ShapeAttr::get(&getContext(), tensor.getShape()); + ttnn::LayoutAttr tensorLayoutAttr = + ttnn::LayoutAttr::get(&getContext(), layoutAttr.getLayout()); + DataTypeAttr dTypeAttr = + DataTypeAttr::get(&getContext(), layoutAttr.getDataType()); + + // Create a new tensor + // + // TODO(svuckovic): Move from ttnn::EmptyOp to ttnn::OnesOp once #1476 + // lands + // + mlir::Value tensorValue = rewriter.create( + forwardFuncOp->getLoc(), tensorType, nullptr, shapeAttr, dTypeAttr, + tensorLayoutAttr, nullptr); + + generatedTensors.push_back(tensorValue); + } + + // Return the generated tensors + // + rewriter.create(forwardFuncOp->getLoc(), + generatedTensors); + } + + // Create a main function to call input generators and forward funcs + // + { + // Create a new function that will generate the input tensors + // + std::string mainFuncName = "main"; + + // Create function type + // + mlir::TypeRange returnTypeRange = mlir::TypeRange(rewriter.getI32Type()); + FunctionType functionType = + mlir::FunctionType::get(&getContext(), {}, returnTypeRange); + + // Set insertion point to end of first block + // + rewriter.setInsertionPointToEnd(firstBlock); + + // Create the function + // + func::FuncOp mainFuncOp = rewriter.create( + module->getLoc(), mainFuncName, functionType); + + ::mlir::Block *currFnBlock = mainFuncOp.addEntryBlock(); + + // Set insertion point to the beginning of the block + // + rewriter.setInsertionPointToStart(currFnBlock); + + // Call the input generators + // + for (mlir::func::FuncOp forwardFuncOp : forwardFuncOps) { + std::string inputGenFuncName = + "createInputsFor_" + forwardFuncOp.getName().str(); + + // Get the input generator function + // + mlir::func::FuncOp inputGenFuncOp = + module.lookupSymbol(inputGenFuncName); + + // Call the input generator function + // + func::CallOp createdTensors = rewriter.create( + forwardFuncOp->getLoc(), inputGenFuncOp, ValueRange()); + + rewriter.create(forwardFuncOp->getLoc(), + forwardFuncOp, + createdTensors->getResults()); + } + + // Return 0 + // + // func::ReturnOp requires a Value to be returned, which means that an SSA + // needs to be returned, hence create a constant 0 via arith::ConstantOp + // + Value constantZero = rewriter.create( + rewriter.getUnknownLoc(), rewriter.getI32Type(), + rewriter.getI32IntegerAttr(0)); + rewriter.create(mainFuncOp->getLoc(), constantZero); + } + } +}; + } // namespace mlir::tt::ttnn diff --git a/lib/Dialect/TTNN/Utils/Utils.cpp b/lib/Dialect/TTNN/Utils/Utils.cpp index 0156299218..514e17e521 100644 --- a/lib/Dialect/TTNN/Utils/Utils.cpp +++ b/lib/Dialect/TTNN/Utils/Utils.cpp @@ -80,18 +80,6 @@ toTTMemorySpace(const mlir::tt::ttnn::BufferType bufferType) { llvm_unreachable("Unknown MemorySpace"); } -DataType getDataTypeFromMemRef(mlir::MemRefType memref) { - Type elementType = memref.getElementType(); - DataType dtype = DataType::Float32; - if 
(llvm::isa(elementType)) { - auto tileType = mlir::cast(elementType); - dtype = tileType.getDataType(); - } else { - dtype = elementTypeToDataType(elementType); - } - return dtype; -} - Layout getLayoutFromMemRef(mlir::MemRefType memref) { ttnn::Layout ttnnLayoutEnum = ttnn::Layout::RowMajor; Type elementType = memref.getElementType(); diff --git a/test/ttmlir/Dialect/TTNN/Transforms/ttnn_create_input_gens_0.mlir b/test/ttmlir/Dialect/TTNN/Transforms/ttnn_create_input_gens_0.mlir new file mode 100644 index 0000000000..8342c4f5a6 --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/Transforms/ttnn_create_input_gens_0.mlir @@ -0,0 +1,36 @@ +// RUN: ttmlir-opt --ttnn-create-input-gens %s | FileCheck %s + +#device = #tt.device (0, d0, d1)>, l1Map = (d0, d1)[s0, s1] -> (0, d0 floordiv s0, d1 floordiv s1, (d0 mod s0) * s1 + d1 mod s1), dramMap = (d0, d1)[s0, s1] -> (0, 0, ((((d0 floordiv s0) * 8 + d1 floordiv s1) * (s1 * s0) + (d0 mod s0) * s1 + d1 mod s1) floordiv 8192) mod 12, (((d0 floordiv s0) * 8 + d1 floordiv s1) * (s1 * s0) + (d0 mod s0) * s1 + d1 mod s1) floordiv 98304 + (((d0 floordiv s0) * 8 + d1 floordiv s1) * (s1 * s0) + (d0 mod s0) * s1 + d1 mod s1) mod 8192), meshShape = , chipIds = [0]> +#dram = #ttnn.buffer_type +#system_desc = #tt.system_desc<[{role = host, target_triple = "x86_64-pc-linux"}], [{arch = , grid = 8x8, l1_size = 1499136, num_dram_channels = 12, dram_channel_size = 1073741824, noc_l1_address_align_bytes = 16, pcie_address_align_bytes = 32, noc_dram_address_align_bytes = 32, l1_unreserved_base = 98816, erisc_l1_unreserved_base = 102624, dram_unreserved_base = 32, dram_unreserved_end = 1073083040, physical_cores = {worker = [ 1x1, 1x2, 1x3, 1x4, 1x6, 1x7, 1x8, 1x9, 2x1, 2x2, 2x3, 2x4, 2x6, 2x7, 2x8, 2x9, 3x1, 3x2, 3x3, 3x4, 3x6, 3x7, 3x8, 3x9, 4x1, 4x2, 4x3, 4x4, 4x6, 4x7, 4x8, 4x9, 5x1, 5x2, 5x3, 5x4, 5x6, 5x7, 5x8, 5x9, 7x1, 7x2, 7x3, 7x4, 7x6, 7x7, 7x8, 7x9, 8x1, 8x2, 8x3, 8x4, 8x6, 8x7, 8x8, 8x9, 9x1, 9x2, 9x3, 9x4, 9x6, 9x7, 9x8, 9x9] dram = [ 1x0, 1x5, 2x5, 3x5, 5x0, 5x5, 7x0, 7x5, 8x5, 9x5, 11x0, 11x5] eth_inactive = [ 0x1, 0x2, 0x3, 0x4, 0x6, 0x7, 0x8, 0x9, 6x2, 6x3, 6x6, 6x7, 6x8]}, supported_data_types = [, , , , , , , , , , , ], supported_tile_sizes = [ 4x16, 16x16, 32x16, 4x32, 16x32, 32x32], num_cbs = 32}], [0], [3 : i32], [ 0x0x0x0]> +#system_memory = #ttnn.buffer_type +#ttnn_layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<32x32xbf16, #system_memory>> +#ttnn_layout1 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!tt.tile<32x32, bf16>, #dram>, > +module attributes {tt.device = #device, tt.system_desc = #system_desc} { + // CHECK: func.func @add(%arg0: [[TENSOR_A:.*]], %arg1: [[TENSOR_B:.*]]) -> [[TENSOR_OUT:.*]] { + func.func @add(%arg0: tensor<32x32xbf16, #ttnn_layout>, %arg1: tensor<32x32xbf16, #ttnn_layout>) -> tensor<32x32xbf16, #ttnn_layout> { + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_device"(%arg0, %0) <{memory_config = #ttnn.memory_config<#dram, <<1x1>>, >}> : (tensor<32x32xbf16, #ttnn_layout>, !tt.device<#device>) -> tensor<32x32xbf16, #ttnn_layout1> + %2 = "ttnn.to_layout"(%1) <{layout = #ttnn.layout}> : (tensor<32x32xbf16, #ttnn_layout1>) -> tensor<32x32xbf16, #ttnn_layout1> + %3 = "ttnn.to_device"(%arg1, %0) <{memory_config = #ttnn.memory_config<#dram, <<1x1>>, >}> : (tensor<32x32xbf16, #ttnn_layout>, !tt.device<#device>) -> tensor<32x32xbf16, #ttnn_layout1> + %4 = "ttnn.to_layout"(%3) <{layout = #ttnn.layout}> : (tensor<32x32xbf16, #ttnn_layout1>) -> tensor<32x32xbf16, 
#ttnn_layout1> + %5 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<1x1>>, >, shape = #ttnn.shape<32x32>}> : (!tt.device<#device>) -> tensor<32x32xbf16, #ttnn_layout1> + %6 = "ttnn.add"(%2, %4, %5) <{operandSegmentSizes = array}> : (tensor<32x32xbf16, #ttnn_layout1>, tensor<32x32xbf16, #ttnn_layout1>, tensor<32x32xbf16, #ttnn_layout1>) -> tensor<32x32xbf16, #ttnn_layout1> + %7 = "ttnn.from_device"(%6) : (tensor<32x32xbf16, #ttnn_layout1>) -> tensor<32x32xbf16, #ttnn_layout> + %8 = "ttnn.to_layout"(%7) <{layout = #ttnn.layout}> : (tensor<32x32xbf16, #ttnn_layout>) -> tensor<32x32xbf16, #ttnn_layout> + return %8 : tensor<32x32xbf16, #ttnn_layout> + } + +// Confirm that the generator func is generated, and that the tensor attrs match: +// +// CHECK: func.func @createInputsFor_add() -> ([[TENSOR_A]], [[TENSOR_B]]) { +// CHECK: {{.*}} -> [[TENSOR_A]] +// CHECK: {{.*}} -> [[TENSOR_B]] +// CHECK: return %0, %1 : [[TENSOR_A]], [[TENSOR_B]] + +// Confirm that the main func is generated, and that the tensor attrs match: +// +// CHECK: func.func @main() -> i32 { +// CHECK: %0:2 = call @createInputsFor_add() : () -> ([[TENSOR_A]], [[TENSOR_B]]) +// CHECK: %1 = call @add(%0#0, %0#1) : ([[TENSOR_A]], [[TENSOR_B]]) -> [[TENSOR_OUT]] +} From 231fa9590628426f432a6de05003112f015e453e Mon Sep 17 00:00:00 2001 From: Andrej Jakovljevic Date: Mon, 9 Dec 2024 17:37:53 +0100 Subject: [PATCH 80/84] Removed dead code check for typecast (#1483) --- lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp index d77d095acc..db4320ff3f 100644 --- a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp +++ b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp @@ -846,10 +846,6 @@ class TypecastOpConversionPattern DataType outputDataType = outputLayoutAttr.getDataType(); - if (op->getUsers().empty()) { - return rewriter.notifyMatchFailure( - op, "ttir.typecast op should have at least one use."); - } rewriter.replaceOpWithNewOp( op, this->getTypeConverter()->convertType(op.getType(0)), input, outputDataType); From fecec9c297a1d46f7f3767737ad9f69422d4a5a4 Mon Sep 17 00:00:00 2001 From: Nick Smith <127986401+nsmithtt@users.noreply.github.com> Date: Mon, 9 Dec 2024 11:13:38 -0800 Subject: [PATCH 81/84] Make clang-tidy its own job (#1527) --- .github/workflows/build-and-test.yml | 58 ++++++++++++++++++++---- cmake/modules/LintTools.cmake | 10 +++- include/ttmlir/Conversion/CMakeLists.txt | 2 + third_party/CMakeLists.txt | 4 ++ 4 files changed, 65 insertions(+), 9 deletions(-) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 62cfdd9455..68db5d1cff 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -40,6 +40,56 @@ jobs: echo "DOCKER_CI_IMAGE $DOCKER_CI_IMAGE" echo "docker-image=$DOCKER_CI_IMAGE" >> "$GITHUB_OUTPUT" + lint: + needs: build-image + timeout-minutes: 120 + strategy: + fail-fast: false + name: Lint (clang-tidy) + runs-on: ubuntu-latest + container: + image: ${{ needs.build-image.outputs.docker-image }} + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set reusable strings + id: strings + shell: bash + run: | + echo "work-dir=$(pwd)" >> "$GITHUB_OUTPUT" + echo "build-output-dir=$(pwd)/build" >> "$GITHUB_OUTPUT" + echo "install-output-dir=$(pwd)/install" >> "$GITHUB_OUTPUT" + + - name: Git safe dir + run: git config --global --add 
safe.directory ${{ steps.strings.outputs.work-dir }} + + - name: Configure CMake + shell: bash + run: | + source env/activate + cmake -G Ninja \ + -B ${{ steps.strings.outputs.build-output-dir }} \ + -DCMAKE_CXX_COMPILER=clang++-17 \ + -DCMAKE_C_COMPILER=clang-17 \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=${{ steps.strings.outputs.install-output-dir }} \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DTTMLIR_ENABLE_RUNTIME=ON \ + -DTTMLIR_ENABLE_RUNTIME_TESTS=ON \ + -DTTMLIR_ENABLE_STABLEHLO=ON \ + -DTTMLIR_ENABLE_OP_MODEL=ON \ + -S ${{ steps.strings.outputs.work-dir }} + + - name: Lint + id: lint + shell: bash + run: | + source env/activate + cmake --build ${{ steps.strings.outputs.build-output-dir }} -- clang-tidy + build-ttmlir: needs: build-image timeout-minutes: 120 @@ -119,14 +169,6 @@ jobs: cmake --build ${{ steps.strings.outputs.build-output-dir }} cmake --install ${{ steps.strings.outputs.build-output-dir }} --component Test - - name: Lint - id: lint - shell: bash - if: matrix.build.enable_perf == 'OFF' - run: | - source env/activate - cmake --build ${{ steps.strings.outputs.build-output-dir }} -- clang-tidy - - name: Unique-ify clang-tidy fixes shell: bash if: failure() && steps.lint.outcome == 'failure' diff --git a/cmake/modules/LintTools.cmake b/cmake/modules/LintTools.cmake index 28b4b28092..7e56040110 100644 --- a/cmake/modules/LintTools.cmake +++ b/cmake/modules/LintTools.cmake @@ -1,4 +1,12 @@ # clang-tidy setup add_custom_target(clang-tidy-filter-out-external-srcs COMMAND python3 ${TTMLIR_SOURCE_DIR}/tools/scripts/filter-compile-commands.py ${TTMLIR_BINARY_DIR}/compile_commands.json "${TTMLIR_SOURCE_DIR}") -add_custom_target(clang-tidy COMMAND run-clang-tidy.py -p ${PROJECT_BINARY_DIR} -export-fixes clang-tidy-fixes.yaml -warnings-as-errors '*' -extra-arg-before=-DDISABLE_STATIC_ASSERT_TESTS -extra-arg-before=-D__cpp_structured_bindings=202400 DEPENDS clang-tidy-filter-out-external-srcs) +add_custom_target(clang-tidy COMMAND run-clang-tidy.py -p ${PROJECT_BINARY_DIR} -export-fixes clang-tidy-fixes.yaml -warnings-as-errors '*' -extra-arg-before=-DDISABLE_STATIC_ASSERT_TESTS -extra-arg-before=-D__cpp_structured_bindings=202400 + DEPENDS + clang-tidy-filter-out-external-srcs + mlir-headers + mlir-generic-headers + tt-metal-download + tt-metal-configure + FBS_GENERATION +) add_custom_target(clang-format COMMAND git-clang-format) diff --git a/include/ttmlir/Conversion/CMakeLists.txt b/include/ttmlir/Conversion/CMakeLists.txt index 891fa56080..ba6b267836 100644 --- a/include/ttmlir/Conversion/CMakeLists.txt +++ b/include/ttmlir/Conversion/CMakeLists.txt @@ -5,7 +5,9 @@ include_directories(${TTMLIR_SOURCE_DIR}/include) set(LLVM_TARGET_DEFINITIONS Passes.td) if (TTMLIR_ENABLE_STABLEHLO) mlir_tablegen(Passes.h.inc -gen-pass-decls -name TTMLIRConversion -DTTMLIR_ENABLE_STABLEHLO) +add_dependencies(mlir-headers PassesIncGen) else() mlir_tablegen(Passes.h.inc -gen-pass-decls -name TTMLIRConversion) endif() add_public_tablegen_target(TTMLIRConversionPassIncGen) +add_dependencies(mlir-headers TTMLIRConversionPassIncGen) diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt index e4c17bae47..8d66b68695 100644 --- a/third_party/CMakeLists.txt +++ b/third_party/CMakeLists.txt @@ -71,6 +71,10 @@ ExternalProject_Add( BUILD_BYPRODUCTS ${TTNN_LIBRARY_PATH} ${TTMETAL_LIBRARY_PATH} ${TRACY_LIBRARY_PATH} ${DEVICE_LIBRARY_PATH} ) +ExternalProject_Add_StepTargets(tt-metal download configure) +set_target_properties(tt-metal-download PROPERTIES EXCLUDE_FROM_ALL TRUE) 
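+# Editor's sketch (assumption, not part of the original patch):
+# ExternalProject_Add_StepTargets exposes tt-metal's download and configure
+# steps as standalone targets (tt-metal-download / tt-metal-configure), so the
+# clang-tidy target — which this patch makes depend on them in
+# cmake/modules/LintTools.cmake — can fetch tt-metal and generate its headers
+# without building the whole project. EXCLUDE_FROM_ALL keeps these step
+# targets out of the default `all` build.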
+set_target_properties(tt-metal-configure PROPERTIES EXCLUDE_FROM_ALL TRUE) + set_target_properties(tt-metal PROPERTIES EXCLUDE_FROM_ALL TRUE) list(APPEND library_names TTNN_LIBRARY TTMETAL_LIBRARY DEVICE_LIBRARY) From 4ed40921c1fe79df932275a8ad3700a02047514a Mon Sep 17 00:00:00 2001 From: Nick Smith <127986401+nsmithtt@users.noreply.github.com> Date: Mon, 9 Dec 2024 11:51:58 -0800 Subject: [PATCH 82/84] Add missing codeowner dirs (#895) --- .github/CODEOWNERS | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 8513285524..9728499a04 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,6 +1,9 @@ *.cmake @nsmithtt @sdjordjevicTT *.fbs @tapspatel @nsmithtt *.txt @nsmithtt @sdjordjevicTT +*.md @nsmithtt +/docs/ @nsmithtt +/env/ @nsmithtt /.github/ @vmilosevic @tapspatel /include/ttmlir/Conversion/TTIRToTTNN/ @sdjordjevicTT @svuckovicTT @mtopalovicTT @rpavlovicTT @jserbedzijaTT @jnie-TT /include/ttmlir/Conversion/TTNNToEmitC/ @svuckovicTT @rpavlovicTT @sdjordjevicTT @mtopalovicTT @jserbedzijaTT @@ -26,3 +29,4 @@ /test/ttmlir/Silicon/TTNN/optimizer/ @nobradovictt @odjuricicTT /test/unittests/Optimizer @nobradovictt @odjuricicTT /tools/explorer/ @odjuricicTT @nobradovictt @vprajapati-tt +/tools/ @svuckovicTT @mtopalovicTT From e2007ef6f64072c2ec2a31d84bfa1376d0bb784f Mon Sep 17 00:00:00 2001 From: Sanja Djukic Date: Mon, 9 Dec 2024 22:21:30 +0100 Subject: [PATCH 83/84] TOSA To TTIR: adding a conversion pattern for ClampOp (#1484) * added a conversion pattern for clampop and the corresponding test * moved patterns that don't use default pattern, fixed blank lines * made comments full sentences * refactor: grouping of operations * fixed mistake i made when resolving conflicts * fixed mistake --- .../TosaToTTIR/TosaToTTIRPatterns.cpp | 29 ++++++++++++++++++- test/ttmlir/Conversion/TosaToTTIR/clamp.mlir | 11 +++++++ 2 files changed, 39 insertions(+), 1 deletion(-) create mode 100644 test/ttmlir/Conversion/TosaToTTIR/clamp.mlir diff --git a/lib/Conversion/TosaToTTIR/TosaToTTIRPatterns.cpp b/lib/Conversion/TosaToTTIR/TosaToTTIRPatterns.cpp index 607a083310..8d4e4caafd 100644 --- a/lib/Conversion/TosaToTTIR/TosaToTTIRPatterns.cpp +++ b/lib/Conversion/TosaToTTIR/TosaToTTIRPatterns.cpp @@ -81,6 +81,31 @@ class TosaToTTIRMultiplyOpConversionPattern } }; +class TosaToTTIRClampOpConversionPattern + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + +public: + LogicalResult + matchAndRewrite(tosa::ClampOp srcOp, tosa::ClampOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + RankedTensorType outputType = + mlir::cast(srcOp.getResult().getType()); + + tensor::EmptyOp outputTensor = rewriter.create( + srcOp.getLoc(), outputType.getShape(), outputType.getElementType()); + + rewriter.replaceOpWithNewOp( + srcOp, TypeRange(outputTensor.getType()), adaptor.getOperands()[0], + outputTensor, adaptor.getMinFp(), adaptor.getMaxFp(), + rewriter.getArrayAttr( + SmallVector(adaptor.getOperands().size() + 1, + rewriter.getAttr( + OperandConstraint::AnyDeviceTile)))); + return success(); + } +}; + class TosaToTTIRMatmulOpConversionPattern : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; @@ -104,6 +129,7 @@ class TosaToTTIRMatmulOpConversionPattern rewriter.replaceOpWithNewOp( srcOp, TypeRange(outputTensor.getType()), operands[0], operands[1], outputTensor, + rewriter.getArrayAttr( SmallVector(adaptor.getOperands().size() + 1, rewriter.getAttr( @@ -181,7 +207,6 @@ class 
TosaToTTIRMaxPool2DOpConversionPattern void addElementwiseUnaryOpsConversionPatterns(MLIRContext *ctx, RewritePatternSet &patterns, TypeConverter &typeConverter) { - patterns.add>( typeConverter, ctx); @@ -295,6 +320,8 @@ void populateTosaToTTIRPatterns(MLIRContext *ctx, RewritePatternSet &patterns, addMatmulOpsConversionPatterns(ctx, patterns, typeConverter); addReductionOpsConversionPatterns(ctx, patterns, typeConverter); addPoolingOpsConversionPatterns(ctx, patterns, typeConverter); + + patterns.add(typeConverter, ctx); } } // namespace mlir::tt diff --git a/test/ttmlir/Conversion/TosaToTTIR/clamp.mlir b/test/ttmlir/Conversion/TosaToTTIR/clamp.mlir new file mode 100644 index 0000000000..0444fbcffa --- /dev/null +++ b/test/ttmlir/Conversion/TosaToTTIR/clamp.mlir @@ -0,0 +1,11 @@ +// RUN: ttmlir-opt --convert-tosa-to-ttir %s | FileCheck %s +module attributes {} { + func.func @test_clamp(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + %0 = tosa.clamp %arg0 { min_int = 2 : i64, max_int = 3 : i64, min_fp = 2.0 : f32, max_fp = 3.0 : f32 } : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + // CHECK: %[[OP_OUT:[0-9]+]] = tensor.empty() : [[TENSOR_SIZE:tensor<[0-9]+x[0-9]+x[0-9]+xf[0-9]+>]] + // CHECK: %[[VAL:[0-9]+]] = "ttir.clamp"(%arg{{[0-9]+}}, %[[OP_OUT]]) + // CHECK-SAME: max = 3.000000e+00 : f32, min = 2.000000e+00 : f32{{.+}}: ([[TENSOR_SIZE]], [[TENSOR_SIZE]]) -> [[TENSOR_SIZE]] + return %0 : tensor<13x21x3xf32> + // CHECK: return %[[VAL]] : [[TENSOR_SIZE]] + } +} From 8abdc3613d012cdeec8139404271f3c166d4cc60 Mon Sep 17 00:00:00 2001 From: Vincent Wells Date: Mon, 9 Dec 2024 17:26:04 -0600 Subject: [PATCH 84/84] Change env/CMakeLists.txt to prevent ttlmir-opt/translate from linking to llvm so (#1525) --- env/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/env/CMakeLists.txt b/env/CMakeLists.txt index 0f3c26736b..1e26e8ca43 100644 --- a/env/CMakeLists.txt +++ b/env/CMakeLists.txt @@ -54,7 +54,7 @@ ExternalProject_Add( -DLLVM_INSTALL_GTEST=ON -DLLVM_LINK_LLVM_DYLIB=ON -DMLIR_BUILD_MLIR_C_DYLIB=ON - -DMLIR_LINK_MLIR_DYLIB=ON + -DMLIR_LINK_MLIR_DYLIB=OFF -DMLIR_BUILD_MLIR_C_DYLIB=ON # ====================== -DCMAKE_BUILD_TYPE=MinSizeRel
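        # Editor's note (assumption, not part of the original patch): turning
        # MLIR_LINK_MLIR_DYLIB off makes the toolchain link the MLIR libraries
        # statically rather than against libMLIR.so, which is what keeps
        # ttmlir-opt / ttmlir-translate from picking up a runtime dependency on
        # the shared object. A quick sanity check after rebuilding is, e.g.,
        # `ldd <build-dir>/bin/ttmlir-opt | grep -i libMLIR` (path illustrative).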