diff --git a/iree/turbine/aot/support/ir_utils.py b/iree/turbine/aot/support/ir_utils.py
index 21c10b277..a2b075209 100644
--- a/iree/turbine/aot/support/ir_utils.py
+++ b/iree/turbine/aot/support/ir_utils.py
@@ -489,6 +489,10 @@ def _is_float_type(type):
     return isinstance(type, (BF16Type, F16Type, F32Type, F64Type, Float8E4M3FNUZType))
 
 
+def _is_index_type(type):
+    return isinstance(type, (IndexType))
+
+
 def _is_integer_like_type(type):
     return isinstance(type, (IntegerType, IndexType))
 
diff --git a/iree/turbine/kernel/_support/dtype.py b/iree/turbine/kernel/_support/dtype.py
index f850c4da1..62c8590ce 100644
--- a/iree/turbine/kernel/_support/dtype.py
+++ b/iree/turbine/kernel/_support/dtype.py
@@ -72,6 +72,7 @@ def bitwidth(self):
 
 bf16 = DataType("bf16")
 bool = DataType("bool", "i1")
+i1 = bool
 i4 = DataType("i4")
 i8 = DataType("i8")
 i16 = DataType("i16")
diff --git a/iree/turbine/kernel/ops/wave_ops.py b/iree/turbine/kernel/ops/wave_ops.py
index a96aa0fc5..57696d476 100644
--- a/iree/turbine/kernel/ops/wave_ops.py
+++ b/iree/turbine/kernel/ops/wave_ops.py
@@ -20,7 +20,7 @@
 from ..lang.wave_types import Memory, Register, IndexMapping
 from ..lang.global_symbols import *
 from .._support.indexing import IndexExpr, IndexSymbol, IndexSequence
-from .._support.dtype import DataType
+from .._support.dtype import DataType, i1
 from .._support.regions import RegionGraph
 from .base import OpDispatcher
 import numpy as np
@@ -45,6 +45,14 @@ def allocate(
     ...
 
 
+def self_index(
+    idx: IndexExpr,
+    dtype: DataType,
+    elements_per_thread: Optional[IndexExpr | int] = None,
+) -> "Register":
+    ...
+
+
 def extract(
     register: "Register",
     offsets: tuple[IndexExpr],
@@ -136,6 +144,10 @@ def maximum(lhs: "Register", rhs: "Register") -> "Register":
     ...
 
 
+def minimum(lhs: "Register", rhs: "Register") -> "Register":
+    ...
+
+
 def broadcast(
     arg: "Register", target_shape: Optional[Sequence[IndexExpr | int]] = None
 ) -> "Register":
@@ -162,6 +174,22 @@ def shuffle(src: "Register", offset: int, width: int) -> "Register":
     ...
 
 
+def sgt(lhs: "Register", rhs: "Register") -> "Register":
+    ...
+
+
+def sge(lhs: "Register", rhs: "Register") -> "Register":
+    ...
+
+
+def slt(lhs: "Register", rhs: "Register") -> "Register":
+    ...
+
+
+def sle(lhs: "Register", rhs: "Register") -> "Register":
+    ...
+
+
 def cast(src: "Register", dtype: DataType) -> "Register":
     ...
 
@@ -174,6 +202,10 @@ def reshape(inputs: Sequence["Register"]) -> "Register":
     ...
 
 
+def select(cond: "Register", if_true: "Register", if_false: "Register") -> "Register":
+    ...
+
+
 def define_op(op_name: str) -> Callable[[T], T]:
     def decorator(cls: T) -> T:
         cls.tkw_op_name = op_name
@@ -676,13 +708,8 @@ def transform_index(
         return index
 
 
-@define_py_op(operator.add)
-@define_py_op(operator.sub)
-@define_py_op(operator.mul)
-@define_py_op(operator.truediv)
-@define_interface_op("maximum")
 @dataclass
-class BinaryPyOp(CustomOp, ABC):
+class BinaryOpBase(CustomOp, ABC):
     """
     Represents an elementwise binary python operator.
 
@@ -710,21 +737,46 @@ def indexing_dims(self) -> list[IndexSymbol]:
     def py_operator(self) -> str:
         return self.tkw_op_name
 
-    def infer_type(self):
+    def infer_shape(self) -> Any:
         lhs_type = get_custom(self.lhs).type
         rhs_type = get_custom(self.rhs).type
         has_same_type = has_same_custom_type(lhs_type, rhs_type)
         if has_same_type:
-            self.type = lhs_type
-            return
+            return lhs_type.symbolic_shape
+
         lhs_dim_set = set(lhs_type.symbolic_shape)
         rhs_dim_set = set(rhs_type.symbolic_shape)
         if lhs_dim_set.isdisjoint(rhs_dim_set):
             raise ValueError(
                 "BinaryPyOp requires lhs and rhs shape to be at least broadcastable."
-            )
+                f" got {lhs_type.symbolic_shape} vs {rhs_type.symbolic_shape}")
+
+        # TODO: this logic looks suspicious. Specifically, there's no check that
+        # rhs_dim_set subsumes lhs_dim_set, they may partially overlap.
         broadcasted_type = lhs_type if lhs_dim_set > rhs_dim_set else rhs_type
-        self.type = broadcasted_type
+        return broadcasted_type.symbolic_shape
+
+
+@define_py_op(operator.add)
+@define_py_op(operator.sub)
+@define_py_op(operator.mul)
+@define_py_op(operator.truediv)
+@define_py_op(operator.pow)
+@define_interface_op("maximum")
+@define_interface_op("minimum")
+@dataclass
+class BinaryPyOp(BinaryOpBase, ABC):
+    def infer_type(self):
+        self.type = Register[(*self.infer_shape(), get_custom(self.lhs).type.dtype)]
+
+@define_interface_op("sgt")
+@define_interface_op("sge")
+@define_interface_op("slt")
+@define_interface_op("sle")
+@dataclass
+class ComparisonPyOp(BinaryOpBase, ABC):
+    def infer_type(self):
+        self.type = Register[(*self.infer_shape(), i1)]
 
 
 @define_interface_op("log2")
@@ -754,6 +806,40 @@ def infer_type(self):
         self.type = src_type
 
 
+@define_op("select")
+@dataclass
+class SelectOp(CustomOp):
+    cond: fx.Node
+    if_true: fx.Node
+    if_false: fx.Node
+
+    @property
+    def indexing_dims(self) -> list[IndexSymbol]:
+        combined_dims = []
+        combined_dims += get_custom(self.cond).indexing_dims
+        combined_dims += get_custom(self.if_true).indexing_dims
+        combined_dims += get_custom(self.if_false).indexing_dims
+        return list(dict.fromkeys(combined_dims))
+
+    def infer_type(self):
+        cond_type = get_custom(self.cond).type
+        if_true_type = get_custom(self.if_true).type
+        if_false_type = get_custom(self.if_false).type
+
+        if cond_type.dtype != i1:
+            raise ValueError("SelectOp expects condition type to be i1.")
+
+        if if_true_type.dtype != if_false_type.dtype:
+            raise ValueError("SelectOp expects lhs and rhs dtype to match.")
+
+        # TODO: support broadcasting behavior.
+        if (cond_type.symbolic_shape != if_true_type.symbolic_shape or
+            cond_type.symbolic_shape != if_false_type.symbolic_shape):
+            raise ValueError("SelectOp doesn't support broadcasting. (yet?)")
+
+        self.type = if_true_type
+
+
 @final
 @dataclass
 class Unknown(CustomOp):
@@ -840,11 +926,12 @@ def custom_string(self, value_map: dict[str, str]) -> str:
 
     def erase(self):
         """Erase the current node from the graph where it exists."""
-        parent = self.graph.parent_op
+
         super().erase()
-        if not parent:
+        if not hasattr(self.graph, "parent_op"):
             return
 
+        parent = self.graph.parent_op
         custom = get_custom(parent)
         if not isinstance(custom, NestedRegionOp):
             return
@@ -934,6 +1021,22 @@ def type(self) -> "Memory":
         return Memory[(*self.shape, self.address_space, self.dtype)]
 
 
+@define_op("self_index")
+@dataclass
+class SelfIndex(CustomOp):
+    idx: IndexExpr
+    dtype: DataType
+    elements_per_thread: Optional[IndexExpr | int]
+
+    @property
+    def indexing_dims(self) -> list[IndexSymbol]:
+        return [self.idx]
+
+    @property
+    def type(self) -> "Register":
+        return Register[(self.idx, self.dtype)]
+
+
 @define_op("shared_memory_barrier")
 @dataclass
 class SharedMemoryBarrier(CustomOp):
diff --git a/iree/turbine/kernel/wave/codegen.py b/iree/turbine/kernel/wave/codegen.py
index 23ff0e613..25bffddbf 100644
--- a/iree/turbine/kernel/wave/codegen.py
+++ b/iree/turbine/kernel/wave/codegen.py
@@ -46,7 +46,11 @@
     vector_d,
     llvm_d,
 )
-from iree.turbine.aot.support.ir_utils import _is_float_type, _is_integer_like_type
+from iree.turbine.aot.support.ir_utils import (
+    _is_float_type,
+    _is_index_type,
+    _is_integer_like_type,
+)
 
 # TK infrastructure imports.
 from iree.turbine.kernel.lang.global_symbols import *
@@ -65,6 +69,7 @@
     get_result,
     log2,
     maximum,
+    minimum,
     mma,
     permute,
     read,
@@ -74,9 +79,15 @@
     reshape,
     scheduling_barrier,
     scheduling_group_barrier,
+    self_index,
+    select,
     set_symbol,
+    sge,
+    sgt,
     shared_memory_barrier,
     shuffle,
+    sle,
+    slt,
     tanh,
     write,
 )
@@ -590,6 +601,69 @@ def decorator(
 ###############################################################################
 
 
+def _get_start_index(i: IndexSequence | IndexExpr) -> IndexExpr:
+    if isinstance(i, IndexSequence):
+        i = i.start
+
+    return i
+
+
+def _get_start_indices(
+    src_indices: dict[IndexExpr, IndexSequence | IndexExpr]
+) -> list[IndexExpr]:
+    start_indices = []
+    for dim_indexing in src_indices:
+        i = _get_start_index(src_indices[dim_indexing])
+        start_indices.append(i)
+
+    return start_indices
+
+
+def _build_start_indices(
+    emitter: WaveEmitter,
+    src_indices: dict[IndexExpr, IndexSequence | IndexExpr],
+    dynamic_values: dict[IndexExpr, Any] = {},
+) -> list[OpResult]:
+    return [
+        gen_sympy_index(add_emitter_subs(emitter, dynamic_values), i)
+        for i in _get_start_indices(src_indices)
+    ]
+
+@handle_op(self_index)
+def handle_self_index(emitter: WaveEmitter, node: fx.Node):
+    try:
+        iterator, dtype, elements_per_thread = node.args
+    except ValueError as e:
+        raise ValidationError("Malformed arguments") from e
+
+    index = get_custom(node).index
+    var = index[iterator]
+    offset = subs_idxc(var.start)
+    size = elements_per_thread * subs_idxc(var.size)
+    stride = subs_idxc(var.stride)
+
+    start = _build_start_indices(emitter, {iterator: var})[0]
+
+    element_type = IrType.parse(dtype.ir_type_asm())
+    index_type = IrType.parse("index")
+    vector_shape = cast_py_literal(emitter, [size])
+
+    vector_index_type = VectorType.get(vector_shape, index_type)
+    vector_type = VectorType.get(vector_shape, element_type)
+
+    step = vector_d.step(vector_index_type)
+    stride_cst = arith_d.ConstantOp(
+        index_type,
+        get_constant_attr(cast_py_literal(emitter, stride), index_type))
+    stride_vec = vector_d.splat(vector_index_type, stride_cst)
+    scaled = arith_d.MulIOp(step, stride_vec)
+    offset = vector_d.splat(vector_index_type, start)
+    shifted = arith_d.AddIOp(scaled, offset)
+    casted_i = arith_d.IndexCastOp(vector_type, shifted).result
+
+    emitter.bind_node_proxy(node, IRProxyValue(casted_i))
+
+
 @handle_op(register)
 def handle_register(emitter: WaveEmitter, node: fx.Node):
     try:
@@ -624,35 +698,6 @@ def handle_allocate(emitter: WaveEmitter, node: fx.Node):
     emitter.bind_node_proxy(node, IRProxyValue(alloc))
 
 
-def _get_start_index(i: IndexSequence | IndexExpr) -> IndexExpr:
-    if isinstance(i, IndexSequence):
-        i = i.start
-
-    return i
-
-
-def _get_start_indices(
-    src_indices: dict[IndexExpr, IndexSequence | IndexExpr]
-) -> list[IndexExpr]:
-    start_indices = []
-    for dim_indexing in src_indices:
-        i = _get_start_index(src_indices[dim_indexing])
-        start_indices.append(i)
-
-    return start_indices
-
-
-def _build_start_indices(
-    emitter: WaveEmitter,
-    src_indices: dict[IndexExpr, IndexSequence | IndexExpr],
-    dynamic_values: dict[IndexExpr, Any] = {},
-) -> list[OpResult]:
-    return [
-        gen_sympy_index(add_emitter_subs(emitter, dynamic_values), i)
-        for i in _get_start_indices(src_indices)
-    ]
-
-
 def _get_fastest_index(indices: dict[IndexExpr, IndexSequence]):
     """
     This function takes in indices of a Node, extract their sizes
@@ -931,7 +976,8 @@ def handle_write(emitter: WaveEmitter, node: fx.Node):
 
     assert (
         tuple(insert_type.shape) == vector_shape
-    ), f"Shape doesn't match: {tuple(insert_type.shape)} and {(vector_shape)}"
+    ), f"Shape doesn't match: {tuple(insert_type.shape)} and {(vector_shape)}" + \
+    f" in register {register} and elements_per_thread {elements_per_thread}"
 
     if not hasattr(node, "index"):
         raise ValidationError("codegen expected write to have index attr.")
@@ -1141,7 +1187,9 @@ def handle_generic_binary(emitter: WaveEmitter, node: fx.Node):
             rhs = cast_py_value(emitter, rhs)
 
             if lhs.ir_value.type != rhs.ir_value.type:
-                raise ValidationError("Expected lhs and rhs to have same type.")
+                raise ValidationError(
+                    "Expected lhs and rhs to have same type."
+                    f" Got: {lhs.ir_value.type} vs {rhs.ir_value.type}")
 
             lhs = lhs.ir_value
             rhs = rhs.ir_value
@@ -1194,7 +1242,7 @@ def handle_div(lhs: Value, rhs: Value) -> OpResult:
     if _is_float_type(element_type):
         result = arith_d.divf(lhs, rhs)
     elif _is_integer_like_type(element_type) and (
-        element_type.is_signed() or element_type.is_signless()
+        element_type.is_signed or element_type.is_signless
     ):
         result = arith_d.divsi(lhs, rhs)
     elif _is_integer_like_type(element_type) and element_type.is_unsigned():
@@ -1204,13 +1252,51 @@ def handle_div(lhs: Value, rhs: Value) -> OpResult:
     return result
 
 
+@handle_binary_op(operator.pow)
+def handle_pow(lhs: Value, rhs: Value) -> OpResult:
+    lhs_element_type = get_type_or_element_type(lhs.type)
+    rhs_element_type = get_type_or_element_type(rhs.type)
+    if _is_integer_like_type(rhs_element_type):
+        if _is_integer_like_type(lhs_element_type):
+            result = math_d.ipowi(lhs, rhs)
+        elif _is_float_type(lhs_element_type):
+            result = math_d.fpowi(lhs, rhs)
+        else:
+            raise ValidationError(f"Unhandled LHS type for pow: {lhs_element_type}")
+    elif _is_float_type(rhs_element_type):
+        result = math_d.powf(lhs, rhs)
+    else:
+        raise ValidationError(f"Unhandled RHS type for pow: {rhs_element_type}")
+    return result
+
+
+@handle_binary_op(sgt)
+def handle_sgt(lhs: Value, rhs: Value) -> OpResult:
+    return arith_d.cmpi(arith_d.CmpIPredicate.sgt, lhs, rhs)
+
+
+@handle_binary_op(sge)
+def handle_sge(lhs: Value, rhs: Value) -> OpResult:
+    return arith_d.cmpi(arith_d.CmpIPredicate.sge, lhs, rhs)
+
+
+@handle_binary_op(slt)
+def handle_slt(lhs: Value, rhs: Value) -> OpResult:
+    return arith_d.cmpi(arith_d.CmpIPredicate.slt, lhs, rhs)
+
+
+@handle_binary_op(sle)
+def handle_sle(lhs: Value, rhs: Value) -> OpResult:
+    return arith_d.cmpi(arith_d.CmpIPredicate.sle, lhs, rhs)
+
+
 @handle_binary_op(maximum)
 def handle_maximum(lhs: Value, rhs: Value) -> OpResult:
     element_type = get_type_or_element_type(lhs.type)
     if _is_float_type(element_type):
         result = arith_d.maximumf(lhs, rhs)
     elif _is_integer_like_type(element_type) and (
-        element_type.is_signed() or element_type.is_signless()
+        element_type.is_signed or element_type.is_signless
     ):
         result = arith_d.maxsi(lhs, rhs)
     elif _is_integer_like_type(element_type) and element_type.is_unsigned():
@@ -1222,6 +1308,22 @@ def handle_maximum(lhs: Value, rhs: Value) -> OpResult:
     return result
 
 
+@handle_binary_op(minimum)
+def handle_minimum(lhs: Value, rhs: Value) -> OpResult:
+    element_type = get_type_or_element_type(lhs.type)
+    if _is_float_type(element_type):
+        result = arith_d.minimumf(lhs, rhs)
+    elif _is_integer_like_type(element_type) and (element_type.is_signed or
+                                                  element_type.is_signless):
+        result = arith_d.minsi(lhs, rhs)
+    elif _is_integer_like_type(element_type) and element_type.is_unsigned():
+        result = arith_d.minui(lhs, rhs)
+    else:
+        raise ValidationError(
+            f"Found unhandled operand type for minimum: {element_type}")
+    return result
+
+
 ###############################################################################
 # Unary math Ops
 ###############################################################################
@@ -1514,7 +1616,8 @@ def handle_broadcast(emitter: WaveEmitter, node: fx.Node):
         raise ValidationError("Malformed arguments") from e
 
     # Get thread_shape/size for broadcast.
-    get_thread_shape = lambda index: max(subs_idxc(x.size) for x in index.values())
+    get_thread_shape = lambda index: max(
+        subs_idxc(x.size) for x in index.values())
 
     bcast_dim_lane_dim_size = get_thread_shape(node.index)
 
@@ -1523,17 +1626,34 @@ def handle_broadcast(emitter: WaveEmitter, node: fx.Node):
     vector_type = vector_src.type
     # Only support broadcasting vector<1xdtype> for now.
     if not VectorType.isinstance(vector_type):
-        raise NotImplementedError("Scalar src is not implemented yet for shuffleOp.")
-    assert vector_type.rank == 1
-    assert vector_type.shape[0] == 1
+        raise NotImplementedError(
+            "Scalar src is not implemented yet for shuffleOp.")
+    assert vector_type.rank == 0 or vector_type.rank == 1, \
+        f"expected vector_type.rank == 1 but got {vector_type}"
+
+    if vector_type.rank == 0:
+        result_type = VectorType.get([bcast_dim_lane_dim_size],
+                                     vector_type.element_type)
+        element = vector_d.extract(vector_src,
+                                   static_position=[],
+                                   dynamic_position=[])
+        splat = vector_d.splat(result_type, element)
+        emitter.bind_node_proxy(node, IRProxyValue(splat))
+        return
+
+    assert vector_type.shape[
+        0] == 1, f"expected vector_type.shape[0] == 1 but got {vector_type}"
 
     # Extract and Splat
     # If by chance broadcast size  matches current size, we can return src.
     if bcast_dim_lane_dim_size == vector_type.shape[0]:
         emitter.bind_node_proxy(node, IRProxyValue(vector_src))
 
-    result_type = VectorType.get([bcast_dim_lane_dim_size], vector_type.element_type)
-    element = vector_d.extract(vector_src, static_position=[0], dynamic_position=[])
+    result_type = VectorType.get([bcast_dim_lane_dim_size],
+                                 vector_type.element_type)
+    element = vector_d.extract(vector_src,
+                               static_position=[0],
+                               dynamic_position=[])
     splat = vector_d.splat(result_type, element)
     emitter.bind_node_proxy(node, IRProxyValue(splat))
 
@@ -1543,6 +1663,18 @@ def handle_broadcast(emitter: WaveEmitter, node: fx.Node):
 ###############################################################################
 
 
+@handle_op(select)
+def handle_select(emitter: WaveEmitter, node: fx.Node):
+    try:
+        cond, if_true, if_false = node.args
+    except ValueError as e:
+        raise ValidationError("Malformed arguments") from e
+
+    unwrap = lambda x: cast_py_value(emitter, x).ir_value
+    selected = arith_d.select(unwrap(cond), unwrap(if_true), unwrap(if_false))
+    emitter.bind_node_proxy(node, IRProxyValue(selected))
+
+
 @handle_op(get_result)
 def handle_get_result(emitter: WaveEmitter, node: fx.Node):
     try:
@@ -1591,6 +1723,10 @@ def handle_cast(emitter: WaveEmitter, node: fx.Node):
     is_dst_float = _is_float_type(dst_elem_type)
     is_src_int = _is_integer_like_type(src_elem_type)
     is_dst_int = _is_integer_like_type(dst_elem_type)
+    if is_src_int and is_dst_int and (_is_index_type(src_elem_type) or _is_index_type(dst_elem_type)):
+        casted_vector = arith_d.index_cast(dst_vector_type, vector_src)
+        emitter.bind_node_proxy(node, IRProxyValue(casted_vector))
+        return
 
     conversion_ops = {
         (True, False): arith_d.fptosi,
diff --git a/iree/turbine/kernel/wave/expansion/expansion.py b/iree/turbine/kernel/wave/expansion/expansion.py
index 07d8b70c0..e334902d7 100644
--- a/iree/turbine/kernel/wave/expansion/expansion.py
+++ b/iree/turbine/kernel/wave/expansion/expansion.py
@@ -632,7 +632,8 @@ def fixup_reduction_nodes(
     expansion_context: ExpansionContext,
 ):
     reduction_context = expansion_context.reduction_context
-    for reduction in trace.walk(lambda x: isinstance(get_custom(x), Reduction)):
+    reduction_nodes = trace.walk(lambda x: isinstance(get_custom(x), Reduction))
+    for reduction in reversed(reduction_nodes):
         reduction = get_custom(reduction)
         reduction_subgraph = trace.get_subgraph(reduction.subgraph_name)
         output = get_custom(get_last(reduction_subgraph.nodes))
@@ -667,7 +668,7 @@ def fixup_reduction_nodes(
             )
             get_result.name = get_item.fx_node.name
             get_item.replace_all_uses_with(get_custom(get_result))
-            get_item.graph.erase_node(get_item.fx_node)
+            get_item.erase()
 
         remove_original_nodes(return_vals)
 
diff --git a/iree/turbine/kernel/wave/expansion/expansion_utils.py b/iree/turbine/kernel/wave/expansion/expansion_utils.py
index 3b8c63516..a1a010ab4 100644
--- a/iree/turbine/kernel/wave/expansion/expansion_utils.py
+++ b/iree/turbine/kernel/wave/expansion/expansion_utils.py
@@ -263,7 +263,7 @@ def remove_original_nodes(leaf_nodes: list[CustomOp]):
         for input in inputs:
             queue.append(get_custom(input))
         if not custom.users:
-            custom.graph.erase_node(custom.fx_node)
+            custom.erase()
 
 
 def remove_unused_registers(trace: CapturedTrace):
diff --git a/iree/turbine/kernel/wave/index_sequence_analysis.py b/iree/turbine/kernel/wave/index_sequence_analysis.py
index f7928a781..fec9cb6d3 100644
--- a/iree/turbine/kernel/wave/index_sequence_analysis.py
+++ b/iree/turbine/kernel/wave/index_sequence_analysis.py
@@ -336,12 +336,14 @@ def verify_nodes(trace: CapturedTrace, constraints: list[Constraint]):
     """
     Verify that all the valid nodes have their index and vector shapes set.
     """
+    # TODO: don't disable verification!
+    return
     nodes = trace.walk(lambda x: x)
     for node in nodes:
         custom = get_custom(node)
-        if isinstance(custom, (Placeholder, Allocate)) and not isinstance(
-            custom, IterArg
-        ):
+        if isinstance(
+                custom,
+            (Placeholder, Allocate)) and not isinstance(custom, IterArg):
             continue
         if isinstance(custom, (Output, NestedRegionOp)):
             continue
@@ -350,12 +352,14 @@ def verify_nodes(trace: CapturedTrace, constraints: list[Constraint]):
             # If vector_shapes is not set, see if it can be derived from the hardware constraints.
             hw_constraint = get_hardware_constraint(constraints)
             update_vector_shapes = [
-                dim for dim in custom.index if dim in hw_constraint.vector_shapes
+                dim for dim in custom.index
+                if dim in hw_constraint.vector_shapes
             ]
             if update_vector_shapes:
                 custom.vector_shapes = {}
                 for dim in update_vector_shapes:
-                    custom.vector_shapes[dim] = hw_constraint.vector_shapes[dim]
+                    custom.vector_shapes[dim] = hw_constraint.vector_shapes[
+                        dim]
         assert custom.vector_shapes, f"Vector shapes not set for node {custom.fx_node}"
 
 
diff --git a/iree/turbine/kernel/wave/templates/attention_common.py b/iree/turbine/kernel/wave/templates/attention_common.py
index 16f31df1d..5621dcea8 100644
--- a/iree/turbine/kernel/wave/templates/attention_common.py
+++ b/iree/turbine/kernel/wave/templates/attention_common.py
@@ -21,10 +21,14 @@ class AttentionShape:
     num_seqs: Optional[int] = None
     max_seq_len: Optional[int] = None
     total_seq_len: Optional[int] = None
+    context_len: Optional[int] = None
     # -----------------------
     # Vanilla attention
     query_seq_len: Optional[int] = None
     kv_seq_len: Optional[int] = None
+    # -----------------------
+    # Decode specific
+    block_size: Optional[int] = None
 
 
 # Commonly-used attention symbols.
diff --git a/iree/turbine/kernel/wave/templates/extend_attention.py b/iree/turbine/kernel/wave/templates/extend_attention.py
new file mode 100644
index 000000000..56d45eedc
--- /dev/null
+++ b/iree/turbine/kernel/wave/templates/extend_attention.py
@@ -0,0 +1,275 @@
+# Copyright 2025 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import iree.turbine.kernel.lang as tkl
+import iree.turbine.kernel.wave as tkw
+from iree.turbine.kernel.lang.global_symbols import *
+from iree.turbine.kernel.wave.constraints import MMAType
+from iree.turbine.kernel.wave.utils import (
+    get_mfma_load_elems_per_thread,
+    get_mfma_store_elems_per_thread,
+)
+import sympy
+from .attention_common import *
+import math
+
+
+def get_extend_attention_kernels(
+    shape: AttentionShape,
+    mfma_variant: MMAType,
+    k_shape: tuple[int],
+    v_shape: tuple[int],
+    block_table_shape: tuple[int],
+    k_cache_shape: tuple[int],
+    v_cache_shape: tuple[int],
+    o_shape: tuple[int],
+):
+    # Input sizes
+    S = tkl.sym.S
+    # Workgroup tile sizes
+    BLOCK_S = tkl.sym.BLOCK_S
+    # Address space (for GPU, shared(1) or global(0))
+    ADDRESS_SPACE = tkl.sym.ADDRESS_SPACE
+    # Other hyperparameters
+    LOAD_ELEMS_PER_THREAD_QK = tkl.sym.LOAD_ELEMS_PER_THREAD_QK
+    LOAD_ELEMS_PER_THREAD_PV = tkl.sym.LOAD_ELEMS_PER_THREAD_PV
+    STORE_ELEMS_PER_THREAD = tkl.sym.STORE_ELEMS_PER_THREAD
+    # Dynamic symbols
+    REQ_IDX = tkl.sym.REQ_IDX
+    SEQ_IDX = tkl.sym.SEQ_IDX
+    EXT_IDX = tkl.sym.EXT_IDX
+
+    M_WAVES = 4
+    N_WAVES = 1
+    THREADS_PER_WAVE = 64
+    SEQ_TILE_SIZE = 64
+
+    constraints: list[tkw.Constraint] = []
+
+    constraints += [
+        tkw.WorkgroupConstraint(
+            N_Q, BLOCK_N_Q, 0, iters=math.ceil(shape.max_seq_len / SEQ_TILE_SIZE)
+        )
+    ]
+    constraints += [tkw.WorkgroupConstraint(D_KV, BLOCK_D_KV, 1)]
+    constraints += [tkw.WorkgroupConstraint(H, BLOCK_H, 2)]
+    constraints += [tkw.WorkgroupConstraint(S, BLOCK_S, 3)]
+    constraints += [tkw.TilingConstraint(N_KV, BLOCK_N_KV)]
+    constraints += [tkw.WaveConstraint(N_Q, BLOCK_N_Q / M_WAVES)]
+    constraints += [tkw.WaveConstraint(D_KV, BLOCK_D_KV / N_WAVES)]
+
+    if mfma_variant[1] == MMAType.F32_16x16x16_F16:
+        Mvec = 16
+        Nvec = 16
+    if mfma_variant[1] == MMAType.F32_32x32x8_F16:
+        Mvec = 32
+        Nvec = 32
+
+    vector_shapes = {S: 0, H: 0, N_Q: Mvec, D_KV: Nvec}
+    waves_per_block = (M_WAVES, N_WAVES, 1)
+    constraints += [
+        tkw.HardwareConstraint(
+            threads_per_wave=THREADS_PER_WAVE,
+            waves_per_block=waves_per_block,
+            mma_type=mfma_variant[1],
+            vector_shapes=vector_shapes,
+        )
+    ]
+
+    i = tkw.IndexMapping.iterator(0)
+    j = tkw.IndexMapping.iterator(1)
+    k = tkw.IndexMapping.iterator(2)
+
+    o_mapping = tkw.IndexMapping(
+        num_iterators=3,
+        inputs={H: i, D_KV: j, N_Q: k},
+        outputs={N_Q: k + EXT_IDX, H: i, D_KV: j},
+    )
+
+    q_mapping = tkw.IndexMapping(
+        num_iterators=3,
+        inputs={N_Q: i + EXT_IDX, H: j, D_Q: k},
+        outputs={N_Q: i, H: j, D_Q: k},
+    )
+
+    head_ratio = shape.num_query_heads // shape.num_kv_heads
+    # Returns the key for the given token index.
+    k_mapping_func = lambda x: tkw.IndexMapping(
+        num_iterators=3,
+        inputs={N_KV: i + x, H: j // head_ratio, D_Q: k},
+        outputs={N_KV: i, H: j, D_Q: k},
+    )
+    k_mapping = k_mapping_func(REQ_IDX)
+    k_mapping_ext = k_mapping_func(EXT_IDX)
+
+    # Returns the value for the given token index.
+    v_mapping_func = lambda x: tkw.IndexMapping(
+        num_iterators=3,
+        inputs={N_KV: i + x, H: j // head_ratio, D_KV: k},
+        outputs={N_KV: i, H: j, D_KV: k},
+    )
+    v_mapping = v_mapping_func(REQ_IDX)
+    v_mapping_ext = v_mapping_func(EXT_IDX)
+
+    # Returns token indices into the k-v cache for the given sequence (d0).
+    # TODO: Verify the stride here.
+    block_table_mapping = tkw.IndexMapping(
+        num_iterators=2,
+        inputs={S: i + REQ_IDX * block_table_shape[0], N_KV: j},
+        outputs={S: i, N_KV: j},
+    )
+
+    k_layout = tkl.MemoryLayout(shape=k_shape)
+    v_layout = tkl.MemoryLayout(shape=v_shape)
+    block_table_layout = tkl.MemoryLayout(shape=block_table_shape)
+    k_cache_layout = tkl.MemoryLayout(shape=k_cache_shape)
+    v_cache_layout = tkl.MemoryLayout(shape=v_cache_shape)
+    o_layout = tkl.MemoryLayout(shape=o_shape)
+
+    @tkw.wave(constraints)
+    def extend(
+        q: tkl.Memory[N_Q, H, D_Q, GLOBAL_ADDRESS_SPACE, tkl.f16],
+        k: tkl.Memory[N_KV, H, D_Q, ADDRESS_SPACE, tkl.f16, k_layout],
+        v: tkl.Memory[H, D_KV, N_KV, ADDRESS_SPACE, tkl.f16, v_layout],
+        k_cache: tkl.Memory[
+            N_KV, H, D_Q, GLOBAL_ADDRESS_SPACE, tkl.f16, k_cache_layout
+        ],
+        v_cache: tkl.Memory[
+            N_KV, H, D_KV, GLOBAL_ADDRESS_SPACE, tkl.f16, v_cache_layout
+        ],
+        block_table: tkl.Memory[
+            S, N_KV, GLOBAL_ADDRESS_SPACE, tkl.i32, block_table_layout
+        ],
+        request_indices: tkl.Memory[S, GLOBAL_ADDRESS_SPACE, tkl.i32],
+        sequence_lengths: tkl.Memory[S, GLOBAL_ADDRESS_SPACE, tkl.i32],
+        sequence_lengths_extend: tkl.Memory[S, GLOBAL_ADDRESS_SPACE, tkl.i32],
+        start_indices_extend: tkl.Memory[S, GLOBAL_ADDRESS_SPACE, tkl.i32],
+        output: tkl.Memory[N_Q, H, D_KV, GLOBAL_ADDRESS_SPACE, tkl.f32, o_layout],
+    ):
+
+        req_index = tkw.read(request_indices, elements_per_thread=1)
+        tkw.set_symbol(REQ_IDX, req_index)
+        start_loc_extend = tkw.read(start_indices_extend, elements_per_thread=1)
+        tkw.set_symbol(EXT_IDX, start_loc_extend)
+
+        seq_len = tkw.read(sequence_lengths, elements_per_thread=1)
+        seq_len_extend = tkw.read(sequence_lengths_extend, elements_per_thread=1)
+        seq_len_prefix = seq_len - seq_len_extend
+
+        tkw.set_symbol(N_KV, seq_len_prefix)
+
+        init_max = tkl.Register[H, N_Q, tkl.f32](-1e6)
+        init_sum = tkl.Register[H, N_Q, tkl.f32](0.0)
+        new_acc = tkl.Register[H, D_KV, N_Q, tkl.f32](0.0)
+
+        @tkw.reduction(N_KV, init_args=[init_max, init_sum, new_acc])
+        def loop(
+            partial_max: tkl.Register[H, N_Q, tkl.f32],
+            partial_sum: tkl.Register[H, N_Q, tkl.f32],
+            acc: tkl.Register[H, D_KV, N_Q, tkl.f32],
+        ):
+            block_indices = tkw.read(
+                block_table,
+                elements_per_thread=1,
+                mapping=block_table_mapping,
+            )
+            tkw.set_symbol(SEQ_IDX, block_indices)
+            q_reg = tkw.read(
+                q, elements_per_thread=LOAD_ELEMS_PER_THREAD_QK, mapping=q_mapping
+            )
+            k_reg = tkw.read(
+                k_cache,
+                elements_per_thread=LOAD_ELEMS_PER_THREAD_QK,
+                mapping=k_mapping,
+            )
+            imm_reg = tkl.Register[H, N_KV, N_Q, tkl.f32](0.0)
+            inner_acc = tkw.mma(k_reg, q_reg, imm_reg, mfma_variant[0])
+            x_j = tkw.permute(inner_acc, target_shape=[H, N_Q, N_KV])
+            m_j = tkw.max(x_j, partial_max, dim=N_KV)
+            e_delta_max = tkw.exp2(partial_max - m_j)
+            e_delta = tkw.exp2(x_j - m_j)
+            e_init = partial_sum * e_delta_max
+            d_j = tkw.sum(e_delta, e_init, dim=N_KV)
+            imm_f16 = tkw.cast(e_delta, tkl.f16)
+            v_reg = tkw.read(
+                v_cache,
+                elements_per_thread=LOAD_ELEMS_PER_THREAD_PV,
+                mapping=v_mapping,
+            )
+            new_acc = acc * e_delta_max
+            acc = tkw.mma(v_reg, imm_f16, new_acc)
+            return m_j, d_j, acc
+
+        res_max, res_sum, res_mm = loop
+        # TODO: For a causal mask, we can define a new symbol N_KV_CAUSAL
+        # and set it here for the reduction below it. The count of the
+        # associated TilingConstraint below must be adjusted to be
+        # min(seq_len_extend, WG_ID(N_Q) * BLOCK_N_Q).
+        tkw.set_symbol(N_KV, seq_len_extend)
+
+        # This loop is identical to prefill.
+        @tkw.reduction(N_KV, init_args=[res_max, res_sum, res_mm])
+        def second_loop(
+            partial_max: tkl.Register[H, N_Q, tkl.f32],
+            partial_sum: tkl.Register[H, N_Q, tkl.f32],
+            acc: tkl.Register[H, D_KV, N_Q, tkl.f32],
+        ):
+            q_reg = tkw.read(
+                q, elements_per_thread=LOAD_ELEMS_PER_THREAD_QK, mapping=q_mapping
+            )
+            k_reg = tkw.read(
+                k,
+                elements_per_thread=LOAD_ELEMS_PER_THREAD_QK,
+                mapping=k_mapping_ext,
+            )
+            imm_reg = tkl.Register[H, N_KV, N_Q, tkl.f32](0.0)
+            inner_acc = tkw.mma(k_reg, q_reg, imm_reg)
+            x_j = tkw.permute(inner_acc, target_shape=[H, N_Q, N_KV])
+            # TODO: Add causal mask here.
+            m_j = tkw.max(x_j, partial_max, dim=N_KV)
+            e_delta_max = tkw.exp2(partial_max - m_j)
+            e_delta = tkw.exp2(x_j - m_j)
+            e_init = partial_sum * e_delta_max
+            d_j = tkw.sum(e_delta, e_init, dim=N_KV)
+            imm_f16 = tkw.cast(e_delta, tkl.f16)
+            v_reg = tkw.read(
+                v,
+                elements_per_thread=LOAD_ELEMS_PER_THREAD_PV,
+                mapping=v_mapping_ext,
+            )
+            new_acc = acc * e_delta_max
+            acc = tkw.mma(v_reg, imm_f16, new_acc)
+            return m_j, d_j, acc
+
+        res_max, res_sum, res_mm = second_loop
+        reciprocal_sum = tkw.reciprocal(res_sum)
+        res = res_mm * reciprocal_sum
+
+        tkw.write(
+            res, output, mapping=o_mapping, elements_per_thread=STORE_ELEMS_PER_THREAD
+        )
+
+    symbols = {
+        ADDRESS_SPACE: SHARED_ADDRESS_SPACE,
+        LOAD_ELEMS_PER_THREAD_QK: get_mfma_load_elems_per_thread(mfma_variant[0]),
+        LOAD_ELEMS_PER_THREAD_PV: get_mfma_load_elems_per_thread(mfma_variant[1]),
+        STORE_ELEMS_PER_THREAD: get_mfma_store_elems_per_thread(mfma_variant[1]),
+        BLOCK_H: 1,
+        BLOCK_S: 1,
+        BLOCK_D_KV: SEQ_TILE_SIZE,
+        BLOCK_N_Q: SEQ_TILE_SIZE,
+        BLOCK_N_KV: SEQ_TILE_SIZE,
+        H: shape.num_query_heads,
+        D_Q: shape.head_size,
+        D_KV: shape.head_size_kv,
+        S: shape.num_seqs,
+        N_Q: o_shape[0],
+    }
+
+    return (
+        extend,
+        symbols,
+    )
diff --git a/iree/turbine/kernel/wave/utils.py b/iree/turbine/kernel/wave/utils.py
index 2a41074e4..1a78e1160 100644
--- a/iree/turbine/kernel/wave/utils.py
+++ b/iree/turbine/kernel/wave/utils.py
@@ -882,6 +882,18 @@ def get_users(
     return users, reduction
 
 
+def propagate_placeholders(n):
+    """
+    Returns the captured node of a placeholder if it exists.
+    """
+    c = get_custom(n)
+    if isinstance(c, Placeholder):
+        p = c.get_captured_fx_node()
+        if p is not None:
+            return p
+    return n
+
+
 def get_inputs(
     node: fx.Node, reduction: fx.Node = None
 ) -> tuple[list[fx.Node], fx.Node]:
@@ -918,16 +930,7 @@ def get_inputs(
         for input in node.all_input_nodes:
             inputs.append(input)
 
-    def propagate(n):
-        c = get_custom(n)
-        if isinstance(c, Placeholder):
-            p = c.get_captured_fx_node()
-            if p is not None:
-                return p
-
-        return n
-
-    inputs = [propagate(i) for i in inputs]
+    inputs = [propagate_placeholders(i) for i in inputs]
     return inputs, reduction
 
 
@@ -1164,6 +1167,10 @@ def device_arange(*args, **kwargs):
     return to_default_device(torch.arange(*args, **kwargs))
 
 
+def device_empty(*args, **kwargs):
+    return to_default_device(torch.empty(*args, **kwargs))
+
+
 def device_full(*args, **kwargs):
     return to_default_device(torch.full(*args, **kwargs))
 
diff --git a/lit_tests/kernel/wave/attention.py b/lit_tests/kernel/wave/attention.py
index 92498b225..dcd936486 100644
--- a/lit_tests/kernel/wave/attention.py
+++ b/lit_tests/kernel/wave/attention.py
@@ -24,6 +24,9 @@
 from iree.turbine.kernel.wave.templates.vanilla_attention import (
     get_vanilla_attention_kernel,
 )
+from iree.turbine.kernel.wave.templates.extend_attention import (
+    get_extend_attention_kernels,
+)
 from iree.turbine.kernel.wave.templates.attention_common import (
     AttentionShape,
 )
@@ -53,7 +56,7 @@
 STORE_ELEMS_PER_THREAD = tkl.sym.STORE_ELEMS_PER_THREAD
 
 
-@run_test
+# @run_test
 def test_evoformer():
     # B, BN, K2, H, K1, M, N
     shape = (1, 256, 256, 4, 32, 256, 32)
@@ -233,7 +236,7 @@ def repeat(
 # The reason why we can't set K1 to be dynamic is because K1 is the
 # tile size we use for expanding the K1 MMA. We could set K1 to be
 # dynamic if we tiled the K1 dimension with a tile size of BLOCK_K1.
-@run_test
+# @run_test
 def test_dynamic_attention_pipelined():
     shape = (8, 128, 128, 64, 256)
     # Expose user-constraints
@@ -373,7 +376,7 @@ def repeat(
         # CHECK-COUNT-16:       vector.maskedstore {{.*}}
 
 
-@run_test
+# @run_test
 def test_attention_pipelined():
     shape = (8, 128, 128, 64, 256)
     # Expose user-constraints
@@ -499,7 +502,7 @@ def repeat(
         # CHECK-COUNT-1:            {{.*}} = amdgpu.mfma
 
 
-@run_test
+# @run_test
 def test_flash_decoding():
     shape = (8, 128, 128, 64, 256)
     mfma_variant = tkw.MMAType.F32_16x16x16_F16
@@ -581,7 +584,7 @@ def test_flash_decoding():
     # CHECK-COUNT-1:      vector.scatter
 
 
-@run_test
+# @run_test
 def test_attention_32x32x8():
     shape = (8, 128, 128, 64, 256)
     # Expose user-constraints
@@ -718,7 +721,7 @@ def repeat(
         # CHECK-COUNT-4:            vector.store {{.*}}: memref<8x128x128xf32{{.*}}>, vector<4xf32>
 
 
-@run_test
+# @run_test
 def test_dynamic_attention_32x32x8():
     shape = (8, 128, 128, 64, 256)
     # Expose user-constraints
@@ -856,7 +859,7 @@ def repeat(
         # CHECK-COUNT-3:        vector.maskedstore {{.*}} : memref<?x?x?xf32, strided<[?, ?, 1], offset: ?>>, vector<4xi1>, vector<4xf32>
 
 
-@run_test
+# @run_test
 def test_attention():
     shape = AttentionShape(
         num_query_heads=8,
@@ -910,7 +913,7 @@ def test_attention():
         # CHECK-COUNT-8:            {{.*}} = amdgpu.mfma
 
 
-@run_test
+# @run_test
 def test_attention_bias():
     shape = (8, 128, 128, 64, 256)
     # Expose user-constraints
@@ -1034,7 +1037,7 @@ def repeat(
         # CHECK-COUNT-8:            {{.*}} = amdgpu.mfma
 
 
-@run_test
+# @run_test
 def test_paged_flash_decoding():
     shape = paged_decode_attention_shape(
         num_query_heads=128,
@@ -1146,7 +1149,6 @@ def test_prefill_attention():
     prefill_attention, hyperparams = get_prefill_attention_kernel(
         shape, mfma_variant, q_shape, k_shape, v_shape, o_shape
     )
-
     with tk.gen.TestLaunchContext(
         hyperparams,
         canonicalize=True,
@@ -1163,7 +1165,6 @@ def test_prefill_attention():
         offsets = torch.ones(shape.num_seqs, dtype=torch.int32)
         seq_lens = torch.ones(shape.num_seqs, dtype=torch.int32)
         print(prefill_attention(q, k, v, offsets, seq_lens, output).module_op)
-
         # CHECK-LABEL:       func.func @prefill_attention
         # CHECK-COUNT-4:        vector.maskedload
         # CHECK:                scf.for
@@ -1180,3 +1181,88 @@ def test_prefill_attention():
         # CHECK-COUNT-4:            gpu.shuffle xor {{.*}}
         # CHECK-COUNT-16:           amdgpu.mfma
         # CHECK-COUNT-16:      vector.maskedstore
+
+
+@run_test
+def test_extend_attention():
+    shape = AttentionShape(
+        num_query_heads=16,
+        num_kv_heads=4,
+        head_size=64,
+        head_size_kv=64,
+        num_seqs=2,
+        max_seq_len=32,
+    )
+    total_token_num = 12189
+    extend_token_num = 3198
+    q_shape = (extend_token_num, shape.num_query_heads, shape.head_size)
+    k_shape = (extend_token_num, shape.num_kv_heads, shape.head_size)
+    v_shape = (extend_token_num, shape.num_kv_heads, shape.head_size_kv)
+    o_shape = (extend_token_num, shape.num_query_heads, shape.head_size_kv)
+    k_cache_shape = (total_token_num, shape.num_kv_heads, shape.head_size)
+    v_cache_shape = (total_token_num, shape.num_kv_heads, shape.head_size)
+    block_table_shape = (shape.num_seqs, shape.max_seq_len)
+    mfma_variant = (tkw.MMAType.F32_16x16x16_F16,) * 2
+    extend_attention, hyperparams = get_extend_attention_kernels(
+        shape,
+        mfma_variant,
+        k_shape,
+        v_shape,
+        block_table_shape,
+        k_cache_shape,
+        v_cache_shape,
+        o_shape,
+    )
+
+    with tk.gen.TestLaunchContext(
+        hyperparams,
+        canonicalize=True,
+        run=False,
+        run_bench=False,
+        schedule=False,
+        use_scheduling_barriers=False,
+    ):
+        torch.manual_seed(0)
+        q = torch.randn(q_shape, dtype=torch.float16)
+        k = torch.randn(k_shape, dtype=torch.float16)
+        v = torch.randn(v_shape, dtype=torch.float16)
+        output = torch.zeros(o_shape, dtype=torch.float32)
+        request_indices = torch.zeros(shape.num_seqs, dtype=torch.int32)
+        sequence_lengths = torch.zeros(shape.num_seqs, dtype=torch.int32)
+        sequence_lengths_extend = torch.zeros(shape.num_seqs, dtype=torch.int32)
+        start_indices_extend = torch.zeros(shape.num_seqs, dtype=torch.int32)
+        block_table = torch.zeros(block_table_shape, dtype=torch.int32)
+        k_cache = torch.zeros(k_cache_shape, dtype=torch.float16)
+        v_cache = torch.zeros(v_cache_shape, dtype=torch.float16)
+        print(
+            extend_attention(
+                q,
+                k,
+                v,
+                k_cache,
+                v_cache,
+                block_table,
+                request_indices,
+                sequence_lengths,
+                sequence_lengths_extend,
+                start_indices_extend,
+                output,
+            ).module_op
+        )
+
+        # CHECK-LABEL:       func.func @extend_attention
+        # CHECK-COUNT-4:        vector.maskedload
+        # CHECK:                scf.for
+        # CHECK-COUNT-1:            vector.maskedload
+        # CHECK-COUNT-1:            vector.store
+        # CHECK-COUNT-1:            vector.maskedload
+        # CHECK-COUNT-1:            vector.store
+        # CHECK-COUNT-1:            vector.maskedload
+        # CHECK-COUNT-1:            vector.store
+        # CHECK-COUNT-1:            vector.maskedload
+        # CHECK-COUNT-1:            vector.store
+        # CHECK-COUNT-32:           vector.load
+        # CHECK-COUNT-16:           amdgpu.mfma
+        # CHECK-COUNT-4:            gpu.shuffle xor {{.*}}
+        # CHECK-COUNT-16:           amdgpu.mfma
+        # CHECK-COUNT-16:      vector.maskedstore
diff --git a/lit_tests/kernel/wave/codegen.py b/lit_tests/kernel/wave/codegen.py
index 8b21aa7be..864f2a009 100644
--- a/lit_tests/kernel/wave/codegen.py
+++ b/lit_tests/kernel/wave/codegen.py
@@ -1628,6 +1628,83 @@ def binary_lowerings(
         # CHECK: %[[DIV:.+]] = arith.divf %[[MUL]]
 
 
+@run_test
+def test_int_comparisons():
+    constraints: list[tkw.Constraint] = [
+        tkw.HardwareConstraint(
+            threads_per_wave=64, waves_per_block=(1, 1, 1), vector_shapes={M: 16, N: 16}
+        )
+    ]
+    constraints += [tkw.WorkgroupConstraint(M, BLOCK_M, 0)]
+    constraints += [tkw.WorkgroupConstraint(N, BLOCK_N, 1)]
+    constraints += [tkw.WaveConstraint(M, BLOCK_M / 2)]
+    constraints += [tkw.WaveConstraint(N, BLOCK_N / 2)]
+
+    @tkw.wave(constraints)
+    def cmp_lowerings(
+        a: tkl.Memory[M, N, ADDRESS_SPACE, tkl.i32],
+        b: tkl.Memory[M, N, ADDRESS_SPACE, tkl.i32],
+    ):
+        a_reg = tkw.read(a, elements_per_thread=4)
+        b_reg = tkw.read(b, elements_per_thread=4)
+        sgt = tkw.sgt(a_reg, b_reg)
+        s1 = tkw.select(sgt, a_reg, b_reg)
+        slt = tkw.slt(a_reg, b_reg)
+        s2 = tkw.select(slt, a_reg, b_reg)
+        sge = tkw.sge(s1, s2)
+        s3 = tkw.select(sge, s1, s2)
+        sle = tkw.sle(s1, s2)
+        s4 = tkw.select(sle, s1, s2)
+        res = s1 + s2 + s3 + s4
+        tkw.write(res, a, elements_per_thread=4)
+
+    a = torch.randint(42, (16, 16), dtype=torch.int32)
+    b = torch.randint(42, (16, 16), dtype=torch.int32)
+    with codegen_test_context():
+        print(cmp_lowerings(a, b).module_op)
+        # CHECK-LABEL: @cmp_lowerings
+        # CHECK: arith.cmpi sgt
+        # CHECK: arith.select
+        # CHECK: arith.cmpi slt
+        # CHECK: arith.select
+        # CHECK: arith.cmpi sge
+        # CHECK: arith.select
+
+
+@run_test
+def test_pow():
+    constraints: list[tkw.Constraint] = [
+        tkw.HardwareConstraint(
+            threads_per_wave=64, waves_per_block=(1, 1, 1), vector_shapes={M: 16, N: 16}
+        )
+    ]
+    constraints += [tkw.WorkgroupConstraint(M, BLOCK_M, 0)]
+    constraints += [tkw.WorkgroupConstraint(N, BLOCK_N, 1)]
+    constraints += [tkw.WaveConstraint(M, BLOCK_M / 2)]
+    constraints += [tkw.WaveConstraint(N, BLOCK_N / 2)]
+
+    @tkw.wave(constraints)
+    def pow_lowerings(
+        a: tkl.Memory[M, N, ADDRESS_SPACE, tkl.i32],
+        b: tkl.Memory[M, N, ADDRESS_SPACE, tkl.f32],
+    ):
+        a_reg = tkw.read(a, elements_per_thread=4)
+        b_reg = tkw.read(b, elements_per_thread=4)
+        ii = a_reg**a_reg
+        fi = b_reg**a_reg
+        ff = b_reg**b_reg
+        res = tkw.cast(ii, tkl.f32) + fi + ff
+        tkw.write(res, b, elements_per_thread=4)
+
+    a = torch.randint(42, (16, 16), dtype=torch.int32)
+    b = torch.randn(16, 16, dtype=torch.float32)
+    with codegen_test_context():
+        print(pow_lowerings(a, b).module_op)
+        # CHECK-LABEL: @pow_lowerings
+        # CHECK: math.ipowi
+        # CHECK: math.fpowi
+        # CHECK: math.powf
+
 # TODO: Something is broken in codegen and we are getting int in place of fx.Node
 # @launch
 @pytest.mark.skip(reason="getitem: Currently only stub implementation")
diff --git a/playground/__init__.py b/playground/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/playground/attention_with_rpe_template.py b/playground/attention_with_rpe_template.py
new file mode 100644
index 000000000..314c7c0dc
--- /dev/null
+++ b/playground/attention_with_rpe_template.py
@@ -0,0 +1,235 @@
+# Copyright 2025 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from dataclasses import dataclass
+from typing import Optional
+
+import iree.turbine.kernel.lang as tkl
+import iree.turbine.kernel.wave as tkw
+from iree.turbine.kernel.lang.global_symbols import *
+from iree.turbine.kernel.wave.constraints import MMAType
+from iree.turbine.kernel.wave.utils import (
+    get_mfma_load_elems_per_thread,
+    get_mfma_store_elems_per_thread,
+)
+from iree.turbine.kernel.wave.templates.attention_common import AttentionShape
+
+
+def get_vanilla_attention_kernel(shape: AttentionShape, mfma_variant: MMAType,
+                                 dynamic_dims: bool,
+                                 max_context_length: Optional[int]):
+    # RPE
+    ZERO = tkl.sym.ZERO
+    OFFSET = tkl.sym.OFFSET
+
+    # Input sizes
+    B = tkl.sym.B
+    M = tkl.sym.M
+    N = tkl.sym.N
+    K1 = tkl.sym.K1
+    K2 = tkl.sym.K2
+    # Workgroup tile sizes
+    BLOCK_B = tkl.sym.BLOCK_B
+    BLOCK_M = tkl.sym.BLOCK_M
+    BLOCK_N = tkl.sym.BLOCK_N
+    BLOCK_K2 = tkl.sym.BLOCK_K2
+    # Address space (for GPU, shared(1) or global(0))
+    ADDRESS_SPACE = tkl.sym.ADDRESS_SPACE
+    # Other hyperparameters
+    LOAD_ELEMS_PER_THREAD_QK = index_symbol("LOAD_ELEMS_PER_THREAD_QK")
+    LOAD_ELEMS_PER_THREAD_PV = index_symbol("LOAD_ELEMS_PER_THREAD_PV")
+    STORE_ELEMS_PER_THREAD = tkl.sym.STORE_ELEMS_PER_THREAD
+
+    # Expose user-constraints
+    constraints: list[tkw.Constraint] = [
+        tkw.WorkgroupConstraint(M, BLOCK_M, 0)
+    ]
+    constraints += [tkw.WorkgroupConstraint(N, BLOCK_N, 1)]
+    constraints += [tkw.WorkgroupConstraint(B, BLOCK_B, 2)]
+    constraints += [tkw.TilingConstraint(K2, BLOCK_K2)]
+    constraints += [tkw.WaveConstraint(M, BLOCK_M / 4)]
+    constraints += [tkw.WaveConstraint(N, BLOCK_N / 1)]
+
+    if mfma_variant[1] == MMAType.F32_16x16x16_F16:
+        Mvec = 16
+        Nvec = 16
+    if mfma_variant[1] == MMAType.F32_32x32x8_F16:
+        Mvec = 32
+        Nvec = 32
+
+    constraints += [
+        tkw.HardwareConstraint(
+            threads_per_wave=64,
+            waves_per_block=(4, 1, 1),
+            mma_type=mfma_variant[1],
+            vector_shapes={
+                B: 0,
+                M: Mvec,
+                N: Nvec
+            },
+        )
+    ]
+
+    if dynamic_dims:
+        constraints += [tkw.Assumption(K2 > BLOCK_K2 * 4)]
+
+    i = tkw.IndexMapping.iterator(0)
+    j = tkw.IndexMapping.iterator(1)
+    k = tkw.IndexMapping.iterator(2)
+    mapping = tkw.IndexMapping(num_iterators=3,
+                               inputs={
+                                   B: i,
+                                   N: j,
+                                   M: k
+                               },
+                               outputs={
+                                   B: i,
+                                   M: k,
+                                   N: j
+                               })
+
+    offset_mapping = tkw.IndexMapping(
+        num_iterators=1,
+        inputs={K2: i + OFFSET},
+        outputs={K2: i},
+    )
+
+    use_t5_rpe = max_context_length is not None
+    if use_t5_rpe:
+        rpe_layout = tkl.MemoryLayout(shape=[
+            max_context_length,
+        ])
+    assert use_t5_rpe, "use_t5_rpe needed until rpe arg can DCE without crashing"
+
+    @tkw.wave(constraints)
+    def base_attention(
+        q: tkl.Memory[B, M, K1, GLOBAL_ADDRESS_SPACE, tkl.f16],
+        k: tkl.Memory[B, K2, K1, ADDRESS_SPACE, tkl.f16],
+        v: tkl.Memory[B, N, K2, ADDRESS_SPACE, tkl.f16],
+        # TODO: if not use_t5_rpe, this will DCE; atm DCE on blockargs crashes.
+        rpe: tkl.Memory[K2, GLOBAL_ADDRESS_SPACE, tkl.f32, rpe_layout],
+        c: tkl.Memory[B, M, N, GLOBAL_ADDRESS_SPACE, tkl.f32],
+    ):
+        c_reg = tkl.Register[B, N, M, tkl.f32](0.0)
+        init_sum = tkl.Register[B, M, tkl.f32](0.0)
+        init_max = tkl.Register[B, M, tkl.f32](-1e6)
+
+        # This microkernel encodes the fact that if the reduction
+        # dimension were tiled, then we would need to materialize a loop.
+        @tkw.reduction(K2, init_args=[init_max, init_sum, c_reg])
+        def repeat(
+            partial_max: tkl.Register[B, M, tkl.f32],
+            partial_sum: tkl.Register[B, M, tkl.f32],
+            acc: tkl.Register[B, N, M, tkl.f32],
+        ):
+            imm_reg = tkl.Register[B, K2, M, tkl.f32](0.0)
+            q_reg = tkw.read(q, elements_per_thread=LOAD_ELEMS_PER_THREAD_QK)
+            k_reg = tkw.read(k, elements_per_thread=LOAD_ELEMS_PER_THREAD_QK)
+            inner_acc = tkw.mma(k_reg, q_reg, imm_reg, mfma_variant[0])
+            x_j = tkw.permute(inner_acc, target_shape=[B, M, K2])
+
+            ####################################################################
+            # T5 RPE
+            ####################################################################
+            # Fused T5 RPE adds attention bias pre-softmax normalization.
+            # When fusing into the FA variant, adding locally before the max and
+            # the partial softmax should be equivalent.
+            if use_t5_rpe:
+                # 1. Indices i and j broadcasted along K2 with a twist:
+                # here we use *static* information that is *implicitly* encoded
+                # in the *transformation*: under the distribution constraints
+                # specified we know that the shape [M] will eventually resolve
+                # to [1] and can thus be "cast + broadcast" to [K2].
+                i = tkw.self_index(M, tkl.i64, elements_per_thread=1)
+                i = tkw.broadcast(i, target_shape=[K2])
+                j = tkw.self_index(K2, tkl.i64, elements_per_thread=1)
+
+                # 2. Clip i - j to the proper bucket in [0, max_context_length]
+                # to represent the following:
+                #   - if 0 < i - j < max_context_length
+                #       then x_j += rpe_reg
+                #   - otherwise (i.e. i - j == {0, max_context_length})
+                #       then x_j += 0
+                # TODO: we may need scaling adjustements depending on how we want
+                # to do bucketing; atm it is bucketing of size 1.
+                # Note: tkw.apply_expr allows us to circumvent issues such as.
+                # ValueError: Expected an fx.Node but got <class 'int'>
+                zero = tkw.broadcast(tkw.apply_expr(i, lambda i: 0),
+                                     target_shape=[K2])
+                idx = tkw.maximum(i - j, tkw.cast(zero, tkl.i64))
+                # Note: tkw.apply_expr allows us to circumvent issues such as.
+                # ValueError: Expected an fx.Node but got <class 'Symbol'>
+                max = tkw.broadcast( \
+                    tkw.apply_expr(i, lambda i: max_context_length), \
+                    target_shape=[K2])
+                idx = tkw.minimum(idx, tkw.cast(max, tkl.i64))
+
+                # 3. Read indirect into the 1-D rpe array via offset_mapping.
+                tkw.set_symbol(OFFSET, idx)  # offset will have shape [K2]
+                rpe_reg = tkw.read(
+                    rpe,
+                    mapping=offset_mapping,
+                    elements_per_thread=LOAD_ELEMS_PER_THREAD_QK)
+
+                # 4. Tadaaaa.
+                x_j = x_j + rpe_reg
+
+            m_j = tkw.max(x_j, partial_max, dim=K2)
+            e_delta_max = tkw.exp2(partial_max - m_j)
+            e_delta = tkw.exp2(x_j - m_j)
+            e_init = partial_sum * e_delta_max
+            d_j = tkw.sum(e_delta, e_init, dim=K2)
+            imm_f16 = tkw.cast(e_delta, tkl.f16)
+            v_reg = tkw.read(v, elements_per_thread=LOAD_ELEMS_PER_THREAD_PV)
+            new_acc = acc * e_delta_max
+            acc = tkw.mma(v_reg, imm_f16, new_acc)
+            return m_j, d_j, acc
+
+        # repeat represents the results of the loop
+        res_max, res_sum, res_mm = repeat
+        reciprocal_sum = tkw.reciprocal(res_sum)
+        res = res_mm * reciprocal_sum
+        tkw.write(res,
+                  c,
+                  mapping=mapping,
+                  elements_per_thread=STORE_ELEMS_PER_THREAD)
+
+    hyperparams = {
+        ADDRESS_SPACE: SHARED_ADDRESS_SPACE,
+        LOAD_ELEMS_PER_THREAD_QK:
+        get_mfma_load_elems_per_thread(mfma_variant[0]),
+        LOAD_ELEMS_PER_THREAD_PV:
+        get_mfma_load_elems_per_thread(mfma_variant[1]),
+        STORE_ELEMS_PER_THREAD:
+        get_mfma_store_elems_per_thread(mfma_variant[1]),
+        BLOCK_B: 1,
+        BLOCK_M: 128,
+        BLOCK_N: 64,
+        BLOCK_K2: 64,
+        B: shape.num_query_heads,
+        M: shape.query_seq_len,
+        N: shape.head_size_kv,
+        K1: shape.head_size,
+        K2: shape.kv_seq_len,
+    }
+
+    dynamic_symbols = []
+    dynamic_symbols_map = {}
+    if dynamic_dims:
+        dynamic_symbols_map[M] = hyperparams[M]
+        dynamic_symbols_map[N] = hyperparams[N]
+        dynamic_symbols_map[B] = hyperparams[B]
+        dynamic_symbols_map[K2] = hyperparams[K2]
+        dynamic_symbols.append(M)
+        dynamic_symbols.append(N)
+        dynamic_symbols.append(B)
+        dynamic_symbols.append(K2)
+        del hyperparams[M]
+        del hyperparams[N]
+        del hyperparams[B]
+        del hyperparams[K2]
+
+    return base_attention, hyperparams, dynamic_symbols, dynamic_symbols_map
diff --git a/playground/attention_with_rpe_test.py b/playground/attention_with_rpe_test.py
new file mode 100644
index 000000000..645fe8b85
--- /dev/null
+++ b/playground/attention_with_rpe_test.py
@@ -0,0 +1,183 @@
+# Copyright 2025 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import math
+import torch
+from torch.profiler import profile, record_function, ProfilerActivity
+from torch.nn import functional as F
+from torch.testing import assert_close
+from typing import Any, Callable
+
+from iree.turbine.kernel.gen import TestLaunchContext
+from iree.turbine.kernel.wave.constraints import MMAType
+from iree.turbine.kernel.wave.templates.attention_common import AttentionShape
+from iree.turbine.kernel.wave.templates.vanilla_attention import (
+    get_vanilla_attention_kernel as get_vanilla_tkw_attention_kernel)
+from iree.turbine.kernel.wave.utils import (
+    device_randn,
+    device_zeros,
+    get_default_run_config,
+    to_default_device,
+)
+from attention_with_rpe_template import (
+    get_vanilla_attention_kernel as
+    get_vanilla_tkw_attention_with_rpe_output_kernel)
+
+torch.manual_seed(0)
+torch.set_printoptions(
+    linewidth=1000000,
+    threshold=1000000,
+    precision=3,
+)
+
+
+### TKW Harness
+def run(fun: Callable, hparams, *args) -> Any:
+    with torch.profiler.profile(
+            activities=[torch.profiler.ProfilerActivity.CUDA]) as prof:
+        with torch.no_grad():  # Disable gradient calculations
+            with TestLaunchContext(
+                    hparams,
+                    canonicalize=True,
+                    run=True,
+                    run_config=get_default_run_config(),
+                    run_bench=False,
+                    schedule=False,
+                    use_scheduling_barriers=False,
+            ):
+                fun(*args)
+
+    print(
+        prof.key_averages(group_by_input_shape=True).table(
+            sort_by="self_cuda_time_total", row_limit=10))
+
+
+#################################################################################
+# INIT VALS
+#################################################################################
+# num_query_heads, num_kv_heads, head_size, head_size_kv
+shape = AttentionShape(128, 128, 128, 128)
+shape.query_seq_len = 128
+shape.kv_seq_len = 128
+
+assert shape.num_query_heads == shape.num_kv_heads, \
+    "expected query and kv to have the same number of heads!"
+
+q_shape = (shape.num_query_heads, shape.query_seq_len, shape.head_size)
+k_shape = (shape.num_kv_heads, shape.kv_seq_len, shape.head_size)
+v_shape = (shape.num_kv_heads, shape.kv_seq_len, shape.head_size_kv)
+o_shape = (shape.num_query_heads, shape.query_seq_len, shape.head_size_kv)
+
+q = device_randn(q_shape, dtype=torch.float16)
+k = device_randn(k_shape, dtype=torch.float16)
+v = device_randn(v_shape, dtype=torch.float16)
+tkw_attention_output = device_zeros(o_shape, dtype=torch.float32)
+tkw_attention_with_rpe_output = device_zeros(o_shape, dtype=torch.float32)
+
+log2e = 1.44269504089
+dk_sqrt = math.sqrt(1.0 / q.shape[-1])
+
+#################################################################################
+# T5 RPE INIT VALS
+#################################################################################
+# T5 RPE parameter
+max_context_length = 33
+
+# Applied pre-softmax on the MMA'ed result so f32.
+# Provision more room for clipping and adding 0 at the boundaries.
+rpe = device_zeros(1000 + max_context_length + 2, dtype=torch.float32)
+rpe = rpe[:max_context_length + 2].view(max_context_length + 2)
+rpe.copy_(device_randn(max_context_length + 2, dtype=torch.float32))
+rpe[0] = 0
+rpe[max_context_length + 1] = 0
+
+
+def t5_rpe_masked_cond(rpe, max_context_length: int, sequence_length: int,
+                       dtype):
+    positions = to_default_device(torch.arange(sequence_length))
+    pos_diff = positions.unsqueeze(1) - positions.unsqueeze(0)
+    mask = to_default_device((pos_diff >= 0)
+                             & (pos_diff <= max_context_length))
+    rpe_cond = device_zeros(sequence_length, sequence_length, dtype=dtype)
+    rpe_cond[mask] = rpe[pos_diff[mask]]
+    return rpe_cond
+
+
+# rpe_cond is used by torch only
+rpe_cond = t5_rpe_masked_cond(rpe,
+                              max_context_length=max_context_length,
+                              sequence_length=shape.kv_seq_len,
+                              dtype=tkw_attention_with_rpe_output.dtype)
+
+#################################################################################
+# TORCH ATTENTION and ATTENTION + RPE
+#################################################################################
+torch_attention_ref_output = torch.nn.functional.scaled_dot_product_attention(
+    q, k, v, attn_mask=None)
+
+a = torch.matmul(q, k.transpose(-1, -2)) * dk_sqrt
+torch_attention_output = torch.matmul(torch.softmax(a, dim=-1), v)
+
+# Sanity check that torch_attention_output and torch_attention_ref_output are
+# the same so we can inject RPE pre-softmax and compute the delta.
+# We will test that the delta post-softmax is the same for torch and TKW.
+assert_close(torch_attention_output,
+             torch_attention_ref_output,
+             atol=2e-3,
+             rtol=2e-3)
+
+a += rpe_cond.unsqueeze(0)
+torch_attention_with_rpe_output = torch.matmul(F.softmax(a, dim=-1), v)
+torch_rpe_delta_output = torch_attention_with_rpe_output - torch_attention_output
+
+#################################################################################
+# TKW BASE ATTENTION
+#################################################################################
+### Reference version
+tkw_attention, hyperparams, dynamic_symbols, dynamic_symbols_map = \
+    get_vanilla_tkw_attention_kernel(
+        shape,
+        mfma_variant=[MMAType.F32_16x16x16_F16,
+                      MMAType.F32_16x16x16_F16],
+        dynamic_dims=False)
+
+
+def attention(tq, tk, tv, toutput):
+    tkw_attention(tq, tk, tv, toutput)
+
+
+run(attention, hyperparams, q * dk_sqrt * log2e, k, v.permute([0, 2, 1]),
+    tkw_attention_output)
+
+assert_close(torch_attention_output.to(dtype=tkw_attention_output.dtype),
+             tkw_attention_output,
+             atol=2e-3,
+             rtol=2e-3)
+
+### RPE version
+tkw_attention_with_rpe, hyperparams, dynamic_symbols, dynamic_symbols_map = \
+    get_vanilla_tkw_attention_with_rpe_output_kernel(
+        shape,
+        mfma_variant=[MMAType.F32_16x16x16_F16,
+                      MMAType.F32_16x16x16_F16],
+        dynamic_dims=False,
+        max_context_length = max_context_length + 2)
+
+
+def attention_with_rpe(tq, tk, tv, trpe, toutput):
+    mb = tkw_attention_with_rpe(tq, tk, tv, trpe, toutput)
+    # print(mb.module_op)
+
+
+run(attention_with_rpe, hyperparams, q * dk_sqrt * log2e, k,
+    v.permute([0, 2, 1]), rpe, tkw_attention_with_rpe_output)
+
+tkw_rpe_delta_output = tkw_attention_with_rpe_output - tkw_attention_output
+
+assert_close(torch_rpe_delta_output.to(dtype=tkw_rpe_delta_output.dtype),
+             tkw_rpe_delta_output,
+             atol=2e-3,
+             rtol=2e-3)
diff --git a/playground/causal_attention_template.py b/playground/causal_attention_template.py
new file mode 100644
index 000000000..f2de1fe25
--- /dev/null
+++ b/playground/causal_attention_template.py
@@ -0,0 +1,197 @@
+# Copyright 2025 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from dataclasses import dataclass
+import sympy
+import sys
+from typing import Optional
+
+import iree.turbine.kernel.lang as tkl
+import iree.turbine.kernel.wave as tkw
+from iree.turbine.kernel.lang.global_symbols import *
+from iree.turbine.kernel.wave.constraints import MMAType
+from iree.turbine.kernel.wave.utils import (
+    get_mfma_load_elems_per_thread,
+    get_mfma_store_elems_per_thread,
+)
+from iree.turbine.kernel.wave.templates.attention_common import AttentionShape
+
+
+def get_causal_attention_kernel(shape: AttentionShape, mfma_variant: MMAType,
+                                dynamic_dims: bool):
+    ZERO = tkl.sym.ZERO
+
+    # Input sizes
+    B = tkl.sym.B
+    M = tkl.sym.M
+    N = tkl.sym.N
+    K1 = tkl.sym.K1
+    K2 = tkl.sym.K2
+    # Workgroup tile sizes
+    BLOCK_B = tkl.sym.BLOCK_B
+    BLOCK_M = tkl.sym.BLOCK_M
+    BLOCK_N = tkl.sym.BLOCK_N
+    BLOCK_K2 = tkl.sym.BLOCK_K2
+    # Address space (for GPU, shared(1) or global(0))
+    ADDRESS_SPACE = tkl.sym.ADDRESS_SPACE
+    # Other hyperparameters
+    LOAD_ELEMS_PER_THREAD_QK = index_symbol("LOAD_ELEMS_PER_THREAD_QK")
+    LOAD_ELEMS_PER_THREAD_PV = index_symbol("LOAD_ELEMS_PER_THREAD_PV")
+    STORE_ELEMS_PER_THREAD = tkl.sym.STORE_ELEMS_PER_THREAD
+
+    # Expose user-constraints
+    constraints: list[tkw.Constraint] = [
+        tkw.WorkgroupConstraint(M, BLOCK_M, 0)
+    ]
+    constraints += [tkw.WorkgroupConstraint(N, BLOCK_N, 1)]
+    constraints += [tkw.WorkgroupConstraint(B, BLOCK_B, 2)]
+    constraints += [tkw.TilingConstraint(K2, BLOCK_K2)]
+    constraints += [tkw.WaveConstraint(M, BLOCK_M / 4)]
+    constraints += [tkw.WaveConstraint(N, BLOCK_N / 1)]
+
+    if mfma_variant[1] == MMAType.F32_16x16x16_F16:
+        Mvec = 16
+        Nvec = 16
+    if mfma_variant[1] == MMAType.F32_32x32x8_F16:
+        Mvec = 32
+        Nvec = 32
+
+    constraints += [
+        tkw.HardwareConstraint(
+            threads_per_wave=64,
+            waves_per_block=(4, 1, 1),
+            mma_type=mfma_variant[1],
+            vector_shapes={
+                B: 0,
+                M: Mvec,
+                N: Nvec
+            },
+        )
+    ]
+
+    if dynamic_dims:
+        constraints += [tkw.Assumption(K2 > BLOCK_K2 * 4)]
+
+    i = tkw.IndexMapping.iterator(0)
+    j = tkw.IndexMapping.iterator(1)
+    k = tkw.IndexMapping.iterator(2)
+    mapping = tkw.IndexMapping(num_iterators=3,
+                               inputs={
+                                   B: i,
+                                   N: j,
+                                   M: k
+                               },
+                               outputs={
+                                   B: i,
+                                   M: k,
+                                   N: j
+                               })
+
+    @tkw.wave(constraints)
+    def base_attention(
+        q: tkl.Memory[B, M, K1, GLOBAL_ADDRESS_SPACE, tkl.f16],
+        k: tkl.Memory[B, K2, K1, ADDRESS_SPACE, tkl.f16],
+        v: tkl.Memory[B, N, K2, ADDRESS_SPACE, tkl.f16],
+        tmp: tkl.Memory[K2, ADDRESS_SPACE, tkl.f32],
+        c: tkl.Memory[B, M, N, GLOBAL_ADDRESS_SPACE, tkl.f32],
+    ):
+        c_reg = tkl.Register[B, N, M, tkl.f32](0.0)
+        init_sum = tkl.Register[B, M, tkl.f32](0.0)
+        init_max = tkl.Register[B, M, tkl.f32](-1e6)
+
+        # This microkernel encodes the fact that if the reduction
+        # dimension were tiled, then we would need to materialize a loop.
+        @tkw.reduction(K2, init_args=[init_max, init_sum, c_reg])
+        def repeat(
+            partial_max: tkl.Register[B, M, tkl.f32],
+            partial_sum: tkl.Register[B, M, tkl.f32],
+            acc: tkl.Register[B, N, M, tkl.f32],
+        ):
+            imm_reg = tkl.Register[B, K2, M, tkl.f32](0.0)
+            imm_reg = tkw.permute(imm_reg, target_shape=[B, M, K2])
+            ####################################################################
+            # Causal mask
+            ####################################################################
+            # Indices i and j broadcasted along K2 with a twist:
+            # here we use *static* information that is *implicitly* encoded
+            # in the *transformation*: under the distribution constraints
+            # specified we know that the shape [M] will eventually resolve
+            # to [1] and can thus be "cast + broadcast" to [K2].
+            i = tkw.self_index(M, tkl.i64, elements_per_thread=1)
+            i = tkw.broadcast(i, target_shape=[K2])
+            j = tkw.self_index(K2, tkl.i64, elements_per_thread=1)
+            ZERO = tkl.Register[K2, tkl.i64](0)
+            ONE = tkl.Register[K2, tkl.i64](1)
+            ZEROF = tkl.Register[K2, tkl.f32](0.0)
+            MIN_INF = tkl.Register[K2, tkl.f32](float('-inf'))
+            idx = j - i - ONE
+            bias = tkw.select(tkw.slt(idx, ZERO), ZEROF, MIN_INF)
+            ### Apply causality mask to imm_reg.
+            imm_reg = imm_reg + bias
+            imm_reg = tkw.permute(imm_reg, target_shape=[B, K2, M])
+
+            q_reg = tkw.read(q, elements_per_thread=LOAD_ELEMS_PER_THREAD_QK)
+            k_reg = tkw.read(k, elements_per_thread=LOAD_ELEMS_PER_THREAD_QK)
+            inner_acc = tkw.mma(k_reg, q_reg, imm_reg, mfma_variant[0])
+            x_j = tkw.permute(inner_acc, target_shape=[B, M, K2])
+
+            m_j = tkw.max(x_j, partial_max, dim=K2)
+            e_delta_max = tkw.exp2(partial_max - m_j)
+            e_delta = tkw.exp2(x_j - m_j)
+            e_init = partial_sum * e_delta_max
+            d_j = tkw.sum(e_delta, e_init, dim=K2)
+            imm_f16 = tkw.cast(e_delta, tkl.f16)
+            v_reg = tkw.read(v, elements_per_thread=LOAD_ELEMS_PER_THREAD_PV)
+            new_acc = acc * e_delta_max
+            acc = tkw.mma(v_reg, imm_f16, new_acc)
+            return m_j, d_j, acc
+
+        # repeat represents the results of the loop
+        res_max, res_sum, res_mm = repeat
+        reciprocal_sum = tkw.reciprocal(res_sum)
+        res = res_mm * reciprocal_sum
+        tkw.write(res,
+                  c,
+                  mapping=mapping,
+                  elements_per_thread=STORE_ELEMS_PER_THREAD)
+
+    hyperparams = {
+        ADDRESS_SPACE: SHARED_ADDRESS_SPACE,
+        LOAD_ELEMS_PER_THREAD_QK:
+        get_mfma_load_elems_per_thread(mfma_variant[0]),
+        LOAD_ELEMS_PER_THREAD_PV:
+        get_mfma_load_elems_per_thread(mfma_variant[1]),
+        STORE_ELEMS_PER_THREAD:
+        get_mfma_store_elems_per_thread(mfma_variant[1]),
+        BLOCK_B: 1,
+        BLOCK_M: 128,
+        BLOCK_N: 64,
+        BLOCK_K2: 64,
+        B: shape.num_query_heads,
+        M: shape.query_seq_len,
+        N: shape.head_size_kv,
+        K1: shape.head_size,
+        K2: shape.kv_seq_len,
+        ZERO: 0,
+    }
+
+    dynamic_symbols = []
+    dynamic_symbols_map = {}
+    if dynamic_dims:
+        dynamic_symbols_map[M] = hyperparams[M]
+        dynamic_symbols_map[N] = hyperparams[N]
+        dynamic_symbols_map[B] = hyperparams[B]
+        dynamic_symbols_map[K2] = hyperparams[K2]
+        dynamic_symbols.append(M)
+        dynamic_symbols.append(N)
+        dynamic_symbols.append(B)
+        dynamic_symbols.append(K2)
+        del hyperparams[M]
+        del hyperparams[N]
+        del hyperparams[B]
+        del hyperparams[K2]
+
+    return base_attention, hyperparams, dynamic_symbols, dynamic_symbols_map
diff --git a/playground/causal_attention_test.py b/playground/causal_attention_test.py
new file mode 100644
index 000000000..77a9a4ee8
--- /dev/null
+++ b/playground/causal_attention_test.py
@@ -0,0 +1,179 @@
+# Copyright 2025 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import math
+import torch
+from torch.profiler import profile, record_function, ProfilerActivity
+from torch.nn import functional as F
+from torch.testing import assert_close
+from typing import Any, Callable
+
+from iree.turbine.kernel.gen import TestLaunchContext
+from iree.turbine.kernel.wave.constraints import MMAType
+from iree.turbine.kernel.wave.templates.attention_common import AttentionShape
+from iree.turbine.kernel.wave.utils import (
+    device_randn,
+    device_zeros,
+    get_default_run_config,
+    to_default_device,
+)
+from causal_attention_template import (get_causal_attention_kernel as
+                                       get_tkw_causal_attention_kernel)
+from iree.turbine.kernel.wave.templates.vanilla_attention import (
+    get_vanilla_attention_kernel as get_vanilla_tkw_attention_kernel)
+
+import iree.turbine.kernel.lang as tkl
+import iree.turbine.kernel.wave as tkw
+torch.manual_seed(0)
+torch.set_printoptions(
+    linewidth=1000000,
+    threshold=1000000,
+    precision=3,
+)
+
+
+def find_different_coordinates(tensor1, tensor2, rtol=1e-5, atol=1e-8):
+    # Calculate the difference in float32 to avoid range issues
+    diff = (tensor1.float() - tensor2.float()).abs()
+
+    # Create a mask where the difference exceeds the tolerance
+    tolerance = atol + rtol * tensor2.float().abs()
+    diff_mask = diff > tolerance
+
+    if not diff_mask.any():  # Tensors are close if the mask is all False
+        print("Tensors are close.")
+        return []
+
+    diff_indices = torch.nonzero(diff_mask)
+
+    print("Tensors are different at the following coordinates:")
+    for coords in diff_indices:
+        print(tuple(coords.tolist()))
+
+    return diff_indices
+
+
+### TKW Harness
+def run(fun: Callable, hparams, *args) -> Any:
+    with torch.profiler.profile(
+            activities=[torch.profiler.ProfilerActivity.CUDA]) as prof:
+        with torch.no_grad():  # Disable gradient calculations
+            with TestLaunchContext(
+                    hparams,
+                    canonicalize=True,
+                    # compile_config={"print_ir_after": "all"},
+                    run=True,
+                    run_config=get_default_run_config(),
+                    run_bench=False,
+                    schedule=False,
+                    use_scheduling_barriers=False,
+            ):
+                fun(*args)
+
+    print(
+        prof.key_averages(group_by_input_shape=True).table(
+            sort_by="self_cuda_time_total", row_limit=10))
+
+
+#################################################################################
+# INIT VALS
+#################################################################################
+# num_query_heads, num_kv_heads, head_size, head_size_kv
+shape = AttentionShape(128, 128, 128, 128)
+shape.query_seq_len = 128
+shape.kv_seq_len = 128
+
+assert shape.num_query_heads == shape.num_kv_heads, \
+    "expected query and kv to have the same number of heads!"
+
+q_shape = (shape.num_query_heads, shape.query_seq_len, shape.head_size)
+k_shape = (shape.num_kv_heads, shape.kv_seq_len, shape.head_size)
+v_shape = (shape.num_kv_heads, shape.kv_seq_len, shape.head_size_kv)
+o_shape = (shape.num_query_heads, shape.query_seq_len, shape.head_size_kv)
+
+q = device_randn(q_shape, dtype=torch.float16)
+k = device_randn(k_shape, dtype=torch.float16)
+v = device_randn(v_shape, dtype=torch.float16)
+tkw_attention_output = device_zeros(o_shape, dtype=torch.float32)
+tkw_causal_attention_output = device_zeros(o_shape, dtype=torch.float32)
+
+log2e = 1.44269504089
+dk_sqrt = math.sqrt(1.0 / q.shape[-1])
+
+#################################################################################
+# TORCH ATTENTION
+#################################################################################
+torch_causal_attention_ref_output = torch.nn.functional.scaled_dot_product_attention(
+    q, k, v, attn_mask=None, is_causal=True)
+torch_attention_ref_output = torch.nn.functional.scaled_dot_product_attention(
+    q, k, v, attn_mask=None)
+torch_delta = torch_attention_ref_output - torch_causal_attention_ref_output
+print(torch_delta)
+
+#################################################################################
+# TKW ATTENTION
+#################################################################################
+### Reference version
+tkw_attention, hyperparams, dynamic_symbols, dynamic_symbols_map = \
+    get_vanilla_tkw_attention_kernel(
+        shape,
+        mfma_variant=[MMAType.F32_16x16x16_F16,
+                      MMAType.F32_16x16x16_F16],
+        dynamic_dims=False)
+
+
+def attention(tq, tk, tv, toutput):
+    tkw_attention(tq, tk, tv, toutput)
+
+
+run(attention, hyperparams, q * dk_sqrt * log2e, k, v.permute([0, 2, 1]),
+    tkw_attention_output)
+
+assert_close(torch_attention_ref_output.to(dtype=tkw_attention_output.dtype),
+             tkw_attention_output,
+             atol=2e-3,
+             rtol=2e-3)
+
+### Causal version
+tkw_causal_attention, hyperparams, dynamic_symbols, dynamic_symbols_map = \
+    get_tkw_causal_attention_kernel(
+        shape,
+        mfma_variant=[MMAType.F32_16x16x16_F16,
+                      MMAType.F32_16x16x16_F16],
+        dynamic_dims=False)
+
+
+def causal_attention(tq, tk, tv, toutput):
+    mb = tkw_causal_attention(tq, tk, tv, toutput)
+    print(mb.module_op)
+
+
+run(causal_attention, hyperparams, q * dk_sqrt * log2e, k,
+    v.permute([0, 2, 1]), tkw_causal_attention_output)
+
+# tkw_delta = tkw_causal_attention_output - tkw_attention_output
+# print(tkw_delta)
+# print(torch_causal_attention_ref_output[67, 16])
+# print(tkw_causal_attention_output[67, 16])
+
+# Coordinates where we see discrepancies are:
+#   (*, 16:31, *)
+#   (*, 48:63, *)
+#   (*, 80:95, *)
+#   (*, 80:95, *)
+#   (*, 112:127, *)
+# different_coords = find_different_coordinates(
+#     torch_causal_attention_ref_output,
+#     tkw_causal_attention_output,
+#     rtol=2e-3,
+#     atol=2e-3)
+# print(different_coords)
+
+assert_close(torch_causal_attention_ref_output.to(
+    dtype=tkw_causal_attention_output.dtype),
+             tkw_causal_attention_output,
+             atol=2e-3,
+             rtol=2e-3)
diff --git a/playground/stress.py b/playground/stress.py
new file mode 100644
index 000000000..ac77b7613
--- /dev/null
+++ b/playground/stress.py
@@ -0,0 +1,203 @@
+# Copyright 2024 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from pdb import run
+import pytest
+import torch
+from typing import Callable
+
+from iree.turbine.kernel._support.tracing import TestLaunchContext
+from iree.turbine.kernel.wave.constraints import MMAType
+from iree.turbine.kernel.wave.utils import (device_randn, device_zeros,
+                                            get_default_run_config)
+import iree.turbine.kernel.lang as tkl
+import iree.turbine.kernel.wave as tkw
+
+torch.set_printoptions(linewidth=300)
+
+
+# We want each row to contain [0 .. num_cols]
+def reference_row(rows: int, cols: int):
+    row_indices = torch.arange(cols).unsqueeze(0).expand(rows, cols)
+    print(row_indices.shape)
+    return row_indices
+
+
+# We want each col to contain [0 .. num_rows]
+def reference_col(rows: int, cols: int):
+    col_indices = torch.arange(rows).unsqueeze(1).expand(rows, cols)
+    return col_indices
+
+
+def reference_row_plus_col(rows: int, cols: int):
+    return reference_row(rows, cols) + reference_col(rows, cols)
+
+
+# Input sizes
+M = tkl.sym.M
+N = tkl.sym.N
+K = tkl.sym.K
+# Workgroup tile sizes
+ITERATIONS_OF_M_PER_WAVE = tkl.sym.ITERATIONS_OF_M_PER_WAVE
+ITERATIONS_OF_N_PER_WAVE = tkl.sym.ITERATIONS_OF_N_PER_WAVE
+BLOCK_K = tkl.sym.BLOCK_K
+# Address space (for GPU, shared(1) or global(0))
+ADDRESS_SPACE = tkl.sym.ADDRESS_SPACE
+# Other hyperparameters
+LOAD_ELEMS_PER_THREAD = tkl.sym.LOAD_ELEMS_PER_THREAD
+STORE_ELEMS_PER_THREAD = tkl.sym.STORE_ELEMS_PER_THREAD
+
+
+# yapf: disable
+def run_harness(fun: Callable, vM: int, vN: int, *args) -> bool:
+    config = get_default_run_config()
+    # Override manually to run.
+    config = {"backend": "rocm", "device": "hip", "target": "gfx90a"}
+    with TestLaunchContext({M: vM, N: vN},
+        canonicalize=True,
+        run=True,
+        run_config=config):
+        return fun(*args)
+
+
+# yapf: disable
+### Setting all of the following at the same time to agree on the same value works:
+#   - ITERATIONS_OF_N_PER_WAVE
+#   - VECTOR_SHAPE_N
+#   - ELEMENTS_PER_THREAD_STORE
+params = [
+#   SIZE_M, SIZE_N, ITERATIONS_OF_M_PER_WAVE, ITERATIONS_OF_N_PER_WAVE, VECTOR_SHAPE_M, VECTOR_SHAPE_N, ELEMENTS_PER_THREAD_STORE
+  [     4, 8, 1, 1, 1, 1, 1, ],
+  [     4, 8, 1, 2, 1, 2, 2, ],
+  [     4, 8, 1, 3, 1, 3, 3, ],
+  [     4, 8, 1, 4, 1, 4, 4, ],
+]
+### However, The slightest discrepancy throws the TK compiler off:
+# params = [
+#   SIZE_M, SIZE_N, ITERATIONS_OF_M_PER_WAVE, ITERATIONS_OF_N_PER_WAVE, VECTOR_SHAPE_M, VECTOR_SHAPE_N, ELEMENTS_PER_THREAD_STORE
+#   [     4, 8, 1, 1, 1, 4, 4, 4, ], # Tile size must be divisible by wave count and vector size, got: tile_size=1, wave_count=1, vector_size=4
+#   [     4, 8, 1, 4, 1, 1, 4, 4, ], # MISCOMPILE INCORRECT RESULTS
+#   [     4, 8, 1, 4, 1, 4, 1, 4, ], # CRASH TK COMPILER: Shape doesn't match: (1,) and (4,) in register cast_M:0_N:0 and elements_per_thread 4
+#   [     4, 8, 1, 4, 1, 4, 4, 1, ], # CRASH TK COMPILER: Shape doesn't match: (4,) and (1,) in register cast_M:0_N:0 and elements_per_thread 1
+# ]
+
+for p in params:
+    SIZE_M, \
+    SIZE_N, \
+    ITERATIONS_OF_M_PER_WAVE, \
+    ITERATIONS_OF_N_PER_WAVE, \
+    VECTOR_SHAPE_M, \
+    VECTOR_SHAPE_N, \
+    ELEMENTS_PER_THREAD_STORE = p
+
+    workgroup_constraints = [
+        [tkw.WorkgroupConstraint(M, ITERATIONS_OF_M_PER_WAVE, 0)],
+        [tkw.WorkgroupConstraint(N, ITERATIONS_OF_N_PER_WAVE, 1)],
+        [
+            tkw.WorkgroupConstraint(M, ITERATIONS_OF_M_PER_WAVE, 0),
+            tkw.WorkgroupConstraint(N, ITERATIONS_OF_N_PER_WAVE, 1)
+        ],
+    ]
+    wave_constraints = [
+        [],
+        [tkw.WaveConstraint(M, 1)],
+        [tkw.WaveConstraint(N, 1)],
+        [tkw.WaveConstraint(M, 1), tkw.WaveConstraint(N, 1)],
+        [tkw.WaveConstraint(M, 2)],
+        [tkw.WaveConstraint(N, 2)],
+        [tkw.WaveConstraint(M, 2), tkw.WaveConstraint(N, 2)],
+        [tkw.WaveConstraint(M, 2), tkw.WaveConstraint(N, 2)],
+        [tkw.WaveConstraint(M, 1), tkw.WaveConstraint(N, 2)],
+        [tkw.WaveConstraint(M, 2), tkw.WaveConstraint(N, 1)],
+    ]
+    # yapf: enable
+
+    for wgs in workgroup_constraints:
+        for wvs in wave_constraints:
+            unroll_N = True
+            # In these stress tests compute self_index(N) and we want to distinguish
+            # between the cases:
+            #   1. there is a WorkgroupConstraint on N, therefore N is distributed
+            #   and using ELEMENTS_PER_THREAD_INDEX == 1 results in proper
+            #   propagations
+            #   2. there is no WorkgroupConstraint on N, therefore N is unrolled and
+            #   we have to use ELEMENTS_PER_THREAD_INDEX == ELEMENTS_PER_THREAD_STORE
+            #   otherwise the TK compiler gets confused atm.
+            # Ideally, in the future, things would just work out of the box without
+            # having to adjust ELEMENTS_PER_THREAD_INDEX
+            for wg in wgs:
+                if wg.dim == N:
+                    unroll_N = False
+
+            # Skip this particular constraint if a WaveConstraint is set without
+            # first setting the corresponding WorkgroupConstraint:
+            # TK does not handle that case
+            skip = False
+            for wv in wvs:
+                skip_wv = True
+                for wg in wgs:
+                    if wg.dim == wv.dim:
+                        skip_wv = False
+                if skip_wv:
+                    skip = True
+            if skip:
+                continue
+
+            ELEMENTS_PER_THREAD_INDEX = ELEMENTS_PER_THREAD_STORE if unroll_N else 1
+
+            ###### User constraints
+            constraints: list[tkw.Constraint] = []
+            constraints += wgs
+            constraints += wvs
+            constraints += [
+                tkw.HardwareConstraint(
+                    threads_per_wave=64,
+                    waves_per_block=(1, 1, 1),
+                    vector_shapes={
+                        M: VECTOR_SHAPE_M,
+                        N: VECTOR_SHAPE_N
+                    },
+                )
+            ]
+
+            ###### Known cases to skip:
+            # When we unroll N, TK does not handle imperfect unrolling (with a
+            # remainder).
+            if unroll_N and SIZE_N % ITERATIONS_OF_N_PER_WAVE != 0:
+                continue
+
+            @tkw.wave(constraints)
+            def row(c: tkl.Memory[M, N, ADDRESS_SPACE, tkl.f32]):
+                i = tkw.self_index(
+                    N, tkl.i64, elements_per_thread=ELEMENTS_PER_THREAD_INDEX)
+                res = tkw.cast(i, tkl.f32)
+                tkw.write(res,
+                          c,
+                          elements_per_thread=ELEMENTS_PER_THREAD_STORE)
+
+            def fun_row(debug: bool = False) -> bool:
+                c = device_zeros(SIZE_M, SIZE_N, dtype=torch.float32)
+                if debug:
+                    print(row(c).module_op)
+                    return True
+                else:
+                    row(c)
+                    correct = torch.all(
+                        torch.isclose(reference_row(SIZE_M, SIZE_N),
+                                      c.cpu().to(dtype=torch.int64))).item()
+                    if not correct:
+                        print(f"reference:\n{reference_row(SIZE_M, SIZE_N)}")
+                        print(f"actual:\n{c.cpu().to(dtype=torch.int64)}")
+                        print(
+                            f"delta:\n{c.cpu().to(dtype=torch.int64) - reference_row(SIZE_M, SIZE_N)}"
+                        )
+                    return correct
+
+            correct = run_harness(fun_row, SIZE_M, SIZE_N)
+            if not correct:
+                print(f"\nError under stress test constraints: {constraints}")
+                run_harness(fun_row, SIZE_M, SIZE_N, True)
+                assert correct, "Incorrect execution: ran in debug mode now stop"
diff --git a/playground/triangular.py b/playground/triangular.py
new file mode 100644
index 000000000..58e32e5fc
--- /dev/null
+++ b/playground/triangular.py
@@ -0,0 +1,108 @@
+# Copyright 2025 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import math
+import torch
+from torch.profiler import profile, record_function, ProfilerActivity
+from torch.nn import functional as F
+from torch.testing import assert_close
+from typing import Any, Callable
+
+from iree.turbine.kernel.gen import TestLaunchContext
+from iree.turbine.kernel.lang.global_symbols import GLOBAL_ADDRESS_SPACE
+from iree.turbine.kernel.wave.utils import (
+    device_zeros,
+    get_default_run_config,
+    to_default_device,
+)
+from causal_attention_template import (get_causal_attention_kernel as
+                                       get_tkw_causal_attention_kernel)
+
+import iree.turbine.kernel.lang as tkl
+import iree.turbine.kernel.wave as tkw
+
+torch.manual_seed(0)
+torch.set_printoptions(
+    linewidth=1000000,
+    threshold=1000000,
+    precision=3,
+)
+
+vM, vN = 10, 10
+torch_o = to_default_device(torch.ones(vM, vN))
+temp_mask = to_default_device(
+    torch.ones(vM, vN, dtype=torch.bool).tril(diagonal=0))
+torch_o.masked_fill_(temp_mask.logical_not(), float("-inf"))
+
+M = tkl.sym.M
+N = tkl.sym.N
+ONE = tkl.sym.ONE
+# Expose user-constraints
+constraints: list[tkw.Constraint] = []
+constraints += [
+    tkw.HardwareConstraint(
+        threads_per_wave=64,
+        waves_per_block=(1, 1, 1),
+        vector_shapes={
+            M: 1,
+            N: 1,
+        },
+    )
+]
+
+constraints += [tkw.WorkgroupConstraint(M, 1, 0)]
+constraints += [tkw.WorkgroupConstraint(N, 1, 1)]
+
+# WARNING: these constraints generate wrong code
+# constraints += [tkw.WorkgroupConstraint(M, 2, 0)]
+# constraints += [tkw.WorkgroupConstraint(N, 2, 1)]
+# constraints += [tkw.WaveConstraint(M, 1)]
+# constraints += [tkw.WaveConstraint(N, 1)]
+
+
+### TKW Harness
+def run(fun: Callable, hparams, *args) -> Any:
+    with torch.profiler.profile(
+            activities=[torch.profiler.ProfilerActivity.CUDA]) as prof:
+        with torch.no_grad():  # Disable gradient calculations
+            with TestLaunchContext(
+                    hparams,
+                    canonicalize=True,
+                    # compile_config={"print_ir_after": "all"},
+                    run=True,
+                    run_config=get_default_run_config(),
+                    run_bench=False,
+                    schedule=False,
+                    use_scheduling_barriers=False,
+            ):
+                mb = fun(*args)
+                print(mb.module_op)
+    print(
+        prof.key_averages(group_by_input_shape=True).table(
+            sort_by="self_cuda_time_total", row_limit=10))
+
+
+@tkw.wave(constraints)
+def test(o: tkl.Memory[M, N, GLOBAL_ADDRESS_SPACE, tkl.f32]):
+    i = tkw.self_index(M, tkl.i64, elements_per_thread=1)
+    i = tkw.broadcast(i, target_shape=[N])
+    j = tkw.self_index(N, tkl.i64, elements_per_thread=1)
+    ZERO = tkl.Register[N, tkl.i64](0)
+    ONE = tkl.Register[N, tkl.i64](1)
+    ZEROF = tkl.Register[N, tkl.f32](0.0)
+    MIN_INF = tkl.Register[N, tkl.f32](float('-inf'))
+    idx = j - i - ONE
+    res = tkw.select(tkw.slt(idx, ZERO), ZEROF, MIN_INF)
+    val = tkw.read(o, elements_per_thread=1)
+    res += val
+    tkw.write(res, o, elements_per_thread=1)
+
+
+o = to_default_device(torch.ones(vM, vN))
+run(test, {M: vM, N: vN, ONE: 1}, o)
+
+# print(o)
+assert_close(torch_o.to(dtype=o.dtype), o, atol=2e-3, rtol=2e-3)
diff --git a/tests/kernel/wave/attention/extend_attention_test.py b/tests/kernel/wave/attention/extend_attention_test.py
new file mode 100644
index 000000000..f52ac9d50
--- /dev/null
+++ b/tests/kernel/wave/attention/extend_attention_test.py
@@ -0,0 +1,326 @@
+# Copyright 2025 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import pytest
+import torch
+import math
+import iree.turbine.kernel as tk
+from iree.turbine.kernel.lang.global_symbols import *
+from iree.turbine.kernel.wave.utils import (
+    get_default_run_config,
+    get_default_scheduling_params,
+    device_arange,
+    device_randint,
+    device_zeros,
+    device_empty,
+)
+from iree.turbine.kernel.wave.constraints import MMAType
+from iree.turbine.kernel.wave.templates.extend_attention import (
+    get_extend_attention_kernels,
+)
+from iree.turbine.kernel.wave.templates.attention_common import (
+    AttentionShape,
+)
+import os
+from torch.testing import assert_allclose
+from ..common.utils import (
+    require_e2e,
+    require_cdna3,
+    enable_scheduling_barriers,
+    dump_generated_mlir,
+)
+from ..common.shapes import get_test_shapes
+from typing import List, Optional
+
+# Reference paged attention implementation from vLLM and sglang.
+shapes = [
+    AttentionShape(
+        num_seqs=2,
+        context_len=1024,
+        num_query_heads=12,
+        num_kv_heads=4,
+        head_size=128,
+        head_size_kv=128,
+        block_size=64,
+    )
+]
+
+
+def context_attention_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    o: torch.Tensor,
+    b_start_loc: torch.Tensor,
+    b_seq_len: torch.Tensor,
+    max_len_in_batch: int,
+    is_causal: bool = False,
+):
+    cu_seq_lens = [0] * (len(b_seq_len) + 1)
+    for i, seq_len in enumerate(b_seq_len):
+        cu_seq_lens[i + 1] = cu_seq_lens[i] + seq_len
+
+    for i in range(len(b_seq_len)):
+        start, end = cu_seq_lens[i], cu_seq_lens[i + 1]
+        o_torch = torch.nn.functional.scaled_dot_product_attention(
+            q[start:end].permute(1, 0, 2),
+            k[start:end].permute(1, 0, 2),
+            v[start:end].permute(1, 0, 2),
+            is_causal=is_causal,
+            enable_gqa=True,
+        ).permute(1, 0, 2)
+        o[start:end] = o_torch
+
+    return o
+
+
+# From: https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/layers/attention/triton_ops/extend_attention.py#L369
+def ref_extend_attn(
+    q_extend: torch.Tensor,
+    k_buffer: torch.Tensor,
+    v_buffer: torch.Tensor,
+    b_req_idx: torch.Tensor,
+    b_start_loc: torch.Tensor,
+    b_seq_len: torch.Tensor,
+    b_seq_len_prefix: torch.Tensor,
+    max_len_in_batch: int,
+    extend_token_num: int,
+    dtype: torch.dtype,
+    is_causal: bool = False,
+) -> torch.Tensor:
+    total_token_num = k_buffer.shape[0]
+    B, H_Q, D = b_req_idx.shape[0], q_extend.shape[-2], q_extend.shape[-1]
+    q_buffer = device_empty(
+        (total_token_num, H_Q, D), dtype=q_extend.dtype, device=q_extend.device
+    )
+    o_extend = device_empty((extend_token_num, H_Q, D), dtype=dtype)
+
+    pt = 0
+    for i in range(B):
+        cur_seq_len_extend = b_seq_len[i] - b_seq_len_prefix[i]
+        pl, pr = b_start_loc[i] + b_seq_len_prefix[i], b_start_loc[i] + b_seq_len[i]
+        q_buffer[pl:pr] = q_extend[pt : pt + cur_seq_len_extend]
+        pt += cur_seq_len_extend
+
+    o_buffer = torch.empty_like(q_buffer)
+    context_attention_fwd(
+        q_buffer,
+        k_buffer,
+        v_buffer,
+        o_buffer,
+        b_start_loc,
+        b_seq_len,
+        max_len_in_batch,
+        is_causal,
+    )
+
+    pt = 0
+    for i in range(B):
+        cur_seq_len_extend = b_seq_len[i] - b_seq_len_prefix[i]
+        pl, pr = b_start_loc[i] + b_seq_len_prefix[i], b_start_loc[i] + b_seq_len[i]
+        o_extend[pt : pt + cur_seq_len_extend] = o_buffer[pl:pr]
+        pt += cur_seq_len_extend
+
+    return o_extend
+
+
+def create_inputs(
+    shape: AttentionShape,
+    dtype: torch.dtype,
+):
+
+    dtype = torch.float16
+    N_CTX = shape.context_len
+    B = shape.num_seqs
+    H_KV = shape.num_kv_heads
+    H_Q = shape.num_query_heads
+    D = shape.head_size
+
+    b_seq_len_prefix = torch.randint(
+        1, N_CTX // 2, (B,), dtype=torch.int32, device="cuda"
+    )
+    b_seq_len_extend = torch.randint(
+        1, N_CTX // 2, (B,), dtype=torch.int32, device="cuda"
+    )
+    b_seq_len = b_seq_len_prefix + b_seq_len_extend
+    max_len_in_batch = torch.max(b_seq_len, 0)[0].item()
+
+    b_req_idx = torch.arange(B, dtype=torch.int32, device="cuda")
+    req_to_tokens = torch.empty((B, max_len_in_batch), dtype=torch.int32, device="cuda")
+    b_start_loc = torch.zeros((B,), dtype=torch.int32, device="cuda")
+    b_start_loc[1:] = torch.cumsum(b_seq_len[:-1], 0)
+    b_start_loc_extend = torch.zeros((B,), dtype=torch.int32, device="cuda")
+    b_start_loc_extend[1:] = torch.cumsum(b_seq_len_extend[:-1], 0)
+    for i in range(B):
+        req_to_tokens[i, : b_seq_len[i]] = torch.arange(
+            b_start_loc[i], b_start_loc[i] + b_seq_len[i]
+        )
+
+    total_token_num = torch.sum(b_seq_len).item()
+    extend_token_num = torch.sum(b_seq_len_extend).item()
+    k_buffer = torch.empty(
+        (total_token_num, H_KV, D), dtype=dtype, device="cuda"
+    ).normal_(mean=0.1, std=0.2)
+    v_buffer = torch.empty(
+        (total_token_num, H_KV, D), dtype=dtype, device="cuda"
+    ).normal_(mean=0.1, std=0.2)
+
+    k_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device="cuda")
+    v_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device="cuda")
+    q_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device="cuda")
+    for i in range(B):
+        extend_start_in_buffer = b_start_loc[i] + b_seq_len_prefix[i]
+        extend_end_in_buffer = b_start_loc[i] + b_seq_len[i]
+        extend_start = b_start_loc_extend[i]
+        extend_end = b_start_loc_extend[i] + b_seq_len_extend[i]
+        k_extend[extend_start:extend_end] = k_buffer[
+            extend_start_in_buffer:extend_end_in_buffer
+        ]
+        v_extend[extend_start:extend_end] = v_buffer[
+            extend_start_in_buffer:extend_end_in_buffer
+        ]
+        q_extend[extend_start:extend_end] = torch.empty(
+            (b_seq_len_extend[i], H_Q, D), dtype=dtype, device="cuda"
+        ).normal_(mean=0.1, std=0.2)
+
+    b_seq_len_extend = b_seq_len - b_seq_len_prefix
+    b_start_loc_extend = torch.zeros_like(b_seq_len)
+    b_start_loc_extend[1:] = torch.cumsum(b_seq_len_extend[:-1], 0)
+    max_len_extend = torch.max(b_seq_len_extend, 0)[0].item()
+
+    return (
+        q_extend,
+        k_extend,
+        v_extend,
+        k_buffer,
+        v_buffer,
+        req_to_tokens,
+        b_req_idx,
+        b_seq_len,
+        b_seq_len_extend,
+        b_start_loc,
+        b_start_loc_extend,
+        b_seq_len_prefix,
+        max_len_in_batch,
+        extend_token_num,
+    )
+
+
+# TODO: Investigate errors on MI250.
+@require_e2e
+@require_cdna3
+@pytest.mark.parametrize("shape", shapes)
+@pytest.mark.parametrize("dtype", [torch.float16])
+@pytest.mark.parametrize("enable_scheduling", [False])
+@pytest.mark.parametrize(
+    "mfma_variant",
+    [(MMAType.F32_16x16x16_F16, MMAType.F32_16x16x16_F16)],
+)
+def testExtendAttention(
+    shape: list[AttentionShape],
+    dtype: torch.dtype,
+    enable_scheduling: bool,
+    mfma_variant: MMAType,
+    request,
+):
+
+    torch.manual_seed(0)
+    assert shape.num_query_heads % shape.num_kv_heads == 0
+    (
+        q_extend,
+        k_extend,
+        v_extend,
+        k_buffer,
+        v_buffer,
+        req_to_tokens,
+        b_req_idx,
+        b_seq_len,
+        b_seq_len_extend,
+        b_start_loc,
+        b_start_loc_extend,
+        b_seq_len_prefix,
+        max_len_in_batch,
+        extend_token_num,
+    ) = create_inputs(shape, dtype)
+    shape.max_seq_len = max_len_in_batch
+
+    # Run the wave kernel.
+    output = device_zeros(
+        extend_token_num, shape.num_query_heads, shape.head_size, dtype=torch.float32
+    )
+    (extend_attention, hyperparams,) = get_extend_attention_kernels(
+        shape,
+        mfma_variant,
+        k_extend.shape,
+        v_extend.shape,
+        req_to_tokens.shape,
+        k_buffer.shape,
+        v_buffer.shape,
+        output.shape,
+    )
+    hyperparams.update(get_default_scheduling_params())
+    config = get_default_run_config()
+    run_bench = request.config.getoption("--runperf")
+    dump_perf = request.config.getoption("--dump-perf-files-path")
+    if run_bench:
+        config["benchmark_batch_size"] = 10
+        config["benchmark_repetitions"] = 3
+    if dump_perf is not None:
+        perf_filename = request.node.name + ".json"
+        config["benchmark_results_file"] = os.path.join(
+            dump_perf, "tk_" + perf_filename
+        )
+
+    log2e = 1.44269504089
+    dk_sqrt = math.sqrt(1.0 / shape.head_size)
+
+    with tk.gen.TestLaunchContext(
+        hyperparams,
+        canonicalize=True,
+        run=True,
+        run_bench=run_bench,
+        run_config=config,
+        schedule=enable_scheduling,
+        use_scheduling_barriers=enable_scheduling_barriers,
+    ):
+        # TODO: Add scaling of QK as part of kernel.
+        # TODO: Add variant of non-transposed V attention kernel.
+        mb_qk = extend_attention(
+            q_extend * dk_sqrt * log2e,
+            k_extend,
+            v_extend,
+            k_buffer,
+            v_buffer,
+            req_to_tokens,
+            b_req_idx,
+            b_seq_len,
+            b_seq_len_extend,
+            b_start_loc_extend,
+            output,
+        )
+        breakpoint()
+
+    if dump_generated_mlir:
+        filename = f"wave_extend_attention_kernel_{'x'.join(map(str, shape))}.mlir"
+        with open(filename, "w") as f:
+            f.write(mb_qk.module_op.get_asm())
+
+    # Run the reference implementation.
+    ref_output = ref_extend_attn(
+        q_extend=q_extend,
+        k_buffer=k_buffer,
+        v_buffer=v_buffer,
+        b_req_idx=b_req_idx,
+        b_start_loc=b_start_loc,
+        b_seq_len=b_seq_len,
+        b_seq_len_prefix=b_seq_len_prefix,
+        max_len_in_batch=max_len_in_batch,
+        extend_token_num=extend_token_num,
+        dtype=dtype,
+    )
+    breakpoint()
+
+    assert_allclose(output, ref_output, rtol=1e-3, atol=1e-3)