diff --git a/piton/design/chip/tile/l2/rtl/l2_pipe1_ctrl.v.pyv b/piton/design/chip/tile/l2/rtl/l2_pipe1_ctrl.v.pyv
index eb5692e33..67567b454 100644
--- a/piton/design/chip/tile/l2/rtl/l2_pipe1_ctrl.v.pyv
+++ b/piton/design/chip/tile/l2/rtl/l2_pipe1_ctrl.v.pyv
@@ -345,6 +345,8 @@ reg [`MSG_TYPE_WIDTH-1:0] msg_type_S2_f;
 reg msg_from_mshr_S2_f;
 reg [`MSG_TYPE_WIDTH-1:0] msg_type_S4_f;
 
+reg msg_data_pending, msg_data_pending_f;
+
 //============================
 // Stage 1
 //============================
@@ -932,15 +934,40 @@ begin
 end
 
 reg stall_msg_S1;
+reg stall_msg_data_S1;
 always @ *
 begin
     stall_msg_S1 = msg_data_rd_S1 && ~msg_data_valid_S1;
 end
 
+wire msg_carrying_data_S1 = (msg_type_trans_S1 == `MSG_TYPE_CAS_P2Y_REQ ) ||
+                            (msg_type_trans_S1 == `MSG_TYPE_SWAP_P2_REQ ) ||
+                            (msg_type_trans_S1 == `MSG_TYPE_AMO_ADD_P2_REQ ) ||
+                            (msg_type_trans_S1 == `MSG_TYPE_AMO_AND_P2_REQ ) ||
+                            (msg_type_trans_S1 == `MSG_TYPE_AMO_OR_P2_REQ ) ||
+                            (msg_type_trans_S1 == `MSG_TYPE_AMO_XOR_P2_REQ ) ||
+                            (msg_type_trans_S1 == `MSG_TYPE_AMO_MAX_P2_REQ ) ||
+                            (msg_type_trans_S1 == `MSG_TYPE_AMO_MAXU_P2_REQ) ||
+                            (msg_type_trans_S1 == `MSG_TYPE_AMO_MIN_P2_REQ ) ||
+                            (msg_type_trans_S1 == `MSG_TYPE_AMO_MINU_P2_REQ) ||
+                            (msg_type_trans_S1 == `MSG_TYPE_NC_STORE_REQ) ||
+                            (msg_type_trans_S1 == `MSG_TYPE_CAS_P1_REQ) ||
+                            (msg_type_trans_S1 == `MSG_TYPE_CAS_P2N_REQ) || (msg_type_trans_S1 == `MSG_TYPE_INTERRUPT_FWD);
+
+always @(*) begin
+    // We only need to worry about the case where an nc_store gets into the MSHR.
+    // In that case, msg_data is not consumed immediately and msg_data_val
+    // stays high for a long time, until that request comes back from the MSHR
+    // and raises msg_data_ready. During this window, if another request carrying
+    // data (e.g. another nc_store, an atomic, or an interrupt_fwd) arrives, it
+    // would read the wrong data. To prevent this, we simply stop accepting new
+    // data-carrying requests until msg_data_pending is resolved.
+    stall_msg_data_S1 = (msg_data_pending || msg_data_pending_f) && msg_carrying_data_S1 && !msg_from_mshr_S1;
+end
+
 always @ *
 begin
-    stall_S1 = valid_S1 && (stall_pre_S1 || stall_hazard_S1 || stall_mshr_S1 || stall_msg_S1);
+    stall_S1 = valid_S1 && (stall_pre_S1 || stall_hazard_S1 || stall_mshr_S1 || stall_msg_S1 || stall_msg_data_S1);
 end
 
@@ -1029,6 +1056,7 @@ reg mshr_smc_miss_S2_f;
 reg [`L2_MSHR_INDEX_WIDTH-1:0] mshr_pending_index_S2_f;
 reg special_addr_type_S2_f;
 reg msg_data_rd_S2_f;
+reg msg_carrying_data_S2_f;
 
 always @ (posedge clk)
 begin
@@ -1047,6 +1075,7 @@ begin
         special_addr_type_S2_f <= 0;
         msg_data_rd_S2_f <= 0;
         amo_alu_op_S2_f <= `L2_AMO_ALU_OP_WIDTH'b0;
+        msg_carrying_data_S2_f <= 1'b0;
     end
     else if (!stall_S2)
     begin
@@ -1067,6 +1096,7 @@ begin
         special_addr_type_S2_f <= special_addr_type_S1;
         msg_data_rd_S2_f <= msg_data_rd_S1;
         amo_alu_op_S2_f <= amo_alu_op_S1;
+        msg_carrying_data_S2_f <= msg_carrying_data_S1;
     end
 end
 
@@ -2182,6 +2212,36 @@ begin
     msg_data_ready_S2 = valid_S2 && !stall_S2 && (cs_S2[`CS_STATE_DATA_RDY_P1S2] || msg_data_rd_S2_f);
 end
 
+// Note that using msg_carrying_data_S2_f as the condition to raise the
+// msg_data_pending flag is conservative. For atomic operations, it is already
+// guaranteed that no other NoC request can be consumed by the L2 until the
+// operation reaches phase 2. But being conservative does no harm to
+// performance; it is just a double check.
+
+always @(*) begin
+    msg_data_pending = 1'b0;
+    if (valid_S2 && msg_carrying_data_S2_f && !msg_data_ready_S2) begin
+        if (cs_S2[`CS_MSHR_WR_EN_P1S2]) begin
+            // CAUTION: we assume that once a request has read msg_data, it does not go into the MSHR again.
+            msg_data_pending = 1'b1;
+        end
+    end
+end
+
+always @(posedge clk) begin
+    if (~rst_n) begin
+        msg_data_pending_f <= 1'b0;
+    end
+    else if (valid_S2 && msg_carrying_data_S2_f && !msg_data_ready_S2) begin
+        if (cs_S2[`CS_MSHR_WR_EN_P1S2]) begin
+            msg_data_pending_f <= 1'b1;
+        end
+    end
+    else if (msg_data_ready_S2 && msg_data_valid_S2) begin
+        msg_data_pending_f <= 1'b0;
+    end
+end
+
 always @ *
 begin
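
Reviewer-side sketch (not part of the patch): one way to state the invariant the new stall is meant to enforce is a SystemVerilog assertion. It assumes only the signal names visible in the diff (clk, rst_n, valid_S1, msg_carrying_data_S1, msg_from_mshr_S1, msg_data_pending_f, stall_msg_data_S1); the property name and its placement are hypothetical.

// Sketch only: while a data-carrying request is parked in the MSHR
// (msg_data_pending_f set), any new data-carrying request that did not
// come back from the MSHR must be stalled in S1.
property p_stall_data_reqs_while_pending;
    @(posedge clk) disable iff (!rst_n)
    (valid_S1 && msg_carrying_data_S1 && !msg_from_mshr_S1 && msg_data_pending_f)
        |-> stall_msg_data_S1;
endproperty
// Hypothetical binding inside l2_pipe1_ctrl:
// assert property (p_stall_data_reqs_while_pending);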