codership · plampio · Dec 28, 2023 · sjaakola · Jul 3, 2024 · sjaakola
diff --git a/mysql-test/suite/galera/r/galera_defaults.result b/mysql-test/suite/galera/r/galera_defaults.result
@@ -18,6 +18,7 @@ AND VARIABLE_NAME NOT IN (
 ORDER BY VARIABLE_NAME;
 VARIABLE_NAME	VARIABLE_VALUE
 WSREP_ALLOWLIST	
+WSREP_APPLIER_RETRY_COUNT	0
 WSREP_AUTO_INCREMENT_CONTROL	ON
 WSREP_CERTIFICATION_RULES	strict
 WSREP_CERTIFY_NONPK	ON

diff --git a/mysql-test/suite/galera/r/galera_retry_applying.result b/mysql-test/suite/galera/r/galera_retry_applying.result
@@ -0,0 +1,73 @@
+connection node_2;
+connection node_1;
+CALL mtr.add_suppression("Event .* Update_rows.* apply failed");
+CALL mtr.add_suppression("Inconsistency detected");
+CALL mtr.add_suppression("Failed to apply write set:.*");
+CALL mtr.add_suppression("Event .* Write_rows.* apply failed");
+CREATE TABLE t1 (f1 INTEGER PRIMARY KEY DEFAULT 0, f2 char(12));
+CREATE TABLE t3 (f1 INTEGER PRIMARY KEY DEFAULT 0, f2 char(12));
+START TRANSACTION;
+INSERT INTO t3 (f1, f2) VALUES (1, 'a');
+INSERT INTO t3 (f1, f2) VALUES (2, 'b');
+INSERT INTO t3 (f1, f2) VALUES (3, 'c'), (4, 'd'), (5, 'e');
+COMMIT;
+connection node_2;
+SET GLOBAL wsrep_applier_retry_count = 2;
+SET GLOBAL debug_dbug = "d,apply_event_fail_once:o,/dev/null";
+connection node_1;
+START TRANSACTION;
+UPDATE t3 SET f2 = 'ax' WHERE f1 = 1;
+UPDATE t3 SET f2 = 'bx' WHERE f1 = 2;
+INSERT INTO t1 (f1, f2) VALUES (3, 'c'), (4, 'd'), (5, 'e');
+UPDATE t3 SET f2 = 'cx' WHERE f1 = 3;
+UPDATE t3 SET f2 = 'dx' WHERE f1 = 4;
+DELETE FROM t3 WHERE f1 = 5;
+COMMIT;
+connection node_2;
+connection node_1;
+SELECT COUNT(*) AS expect_3 FROM t1;
+expect_3
+3
+SELECT COUNT(*) AS expect_4 FROM t3;
+expect_4
+4
+connection node_2;
+SELECT COUNT(*) AS expect_3 FROM t1;
+expect_3
+3
+SELECT COUNT(*) AS expect_4 FROM t3;
+expect_4
+4
+connection node_1;
+DROP TABLE t1;
+DROP TABLE t3;
+connection node_2;
+Shutting down server ...
+SET wsrep_on=OFF;
+Restarting server ...
+connection node_1;
+SET wsrep_sync_wait=0;
+CREATE TABLE t2 (f1 INTEGER PRIMARY KEY DEFAULT 0, f2 char(12));
+connection node_2;
+SET GLOBAL wsrep_applier_retry_count = 2;
+SET GLOBAL debug_dbug = '';
+SET GLOBAL debug_dbug = "d,apply_event_fail_always:o,/dev/null";
+connection node_1;
+START TRANSACTION;
+INSERT INTO t2 (f1, f2) VALUES (1, 'a'), (2, 'b');
+COMMIT;
+connection node_2;
+Shutting down server ...
+SET wsrep_on=OFF;
+Restarting server ...
+connection node_1;
+SET wsrep_sync_wait=0;
+connection node_2;
+SELECT COUNT(*) AS expect_2 FROM t2;
+expect_2
+2
+SET GLOBAL debug_dbug = DEFAULT;
+connection node_1;
+SET GLOBAL wsrep_applier_retry_count = 0;
+SET DEBUG_SYNC = 'RESET';
+DROP TABLE t2;
diff --git a/mysql-test/suite/galera/t/galera_retry_applying.test b/mysql-test/suite/galera/t/galera_retry_applying.test
@@ -0,0 +1,136 @@
+# Test that applying of a replicated transaction is retried when
+# wsrep_applier_retry_count > 0. Apply failures are injected on node_2 with
+# the debug_dbug keywords apply_event_fail_once / apply_event_fail_always.
+
+--source include/galera_cluster.inc
+--source include/have_debug_sync.inc
+
+CALL mtr.add_suppression("Event .* Update_rows.* apply failed");
+CALL mtr.add_suppression("Inconsistency detected");
+CALL mtr.add_suppression("Failed to apply write set:.*");
+CALL mtr.add_suppression("Event .* Write_rows.* apply failed");
+
+#
+# Case 1: applying fails once on node_2; the retry succeeds, no error is raised.
+#
+CREATE TABLE t1 (f1 INTEGER PRIMARY KEY DEFAULT 0, f2 char(12));
+CREATE TABLE t3 (f1 INTEGER PRIMARY KEY DEFAULT 0, f2 char(12));
+
+START TRANSACTION;
+INSERT INTO t3 (f1, f2) VALUES (1, 'a');
+INSERT INTO t3 (f1, f2) VALUES (2, 'b');
+INSERT INTO t3 (f1, f2) VALUES (3, 'c'), (4, 'd'), (5, 'e');
+COMMIT;
+
+# wait until table t3 and its rows have been replicated to node_2
+--connection node_2
+--let $wait_condition = SELECT COUNT(*) > 0 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't3';
+--source include/wait_condition.inc
+--let $wait_condition = SELECT COUNT(*) > 0 FROM t3;
+--source include/wait_condition.inc
+
+SET GLOBAL wsrep_applier_retry_count = 2;
+SET GLOBAL debug_dbug = "d,apply_event_fail_once:o,/dev/null";
+
+--connection node_1
+START TRANSACTION;
+UPDATE t3 SET f2 = 'ax' WHERE f1 = 1;
+UPDATE t3 SET f2 = 'bx' WHERE f1 = 2;
+INSERT INTO t1 (f1, f2) VALUES (3, 'c'), (4, 'd'), (5, 'e');
+UPDATE t3 SET f2 = 'cx' WHERE f1 = 3;
+UPDATE t3 SET f2 = 'dx' WHERE f1 = 4;
+DELETE FROM t3 WHERE f1 = 5;
+COMMIT;
+
+# wait until node_2 has applied the transaction (the DELETE leaves 4 rows in t3)
+--connection node_2
+--let $wait_condition = SELECT COUNT(*) = 4 FROM t3;
+--source include/wait_condition.inc
+
+--connection node_1
+SELECT COUNT(*) AS expect_3 FROM t1;
+SELECT COUNT(*) AS expect_4 FROM t3;
+
+--connection node_2
+SELECT COUNT(*) AS expect_3 FROM t1;
+SELECT COUNT(*) AS expect_4 FROM t3;
+
+#
+# Cleanup after Case 1.
+#
+
+--connection node_1
+DROP TABLE t1;
+DROP TABLE t3;
+
+# once the DROP of t3 has reached node_2: shut it down, remove grastate.dat, restart
+--connection node_2
+--let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't3';
+--source include/wait_condition.inc
+--echo Shutting down server ...
+SET wsrep_on=OFF;
+--source include/shutdown_mysqld.inc
+--remove_file $MYSQLTEST_VARDIR/mysqld.2/data/grastate.dat
+--echo Restarting server ...
+--source include/start_mysqld.inc
+
+# wait until node_1 sees a cluster of two nodes again
+--connection node_1
+SET wsrep_sync_wait=0;
+--let $wait_condition = SELECT VARIABLE_VALUE = 2 FROM performance_schema.global_status WHERE VARIABLE_NAME = 'wsrep_cluster_size'
+--source include/wait_condition.inc
+
+
+#
+# Case 2: node_2 retries applying of a transaction multiple times. All
+# retry attempts fail, so applying finally fails and node_2 is expected
+# to drop out of the cluster.
+#
+
+CREATE TABLE t2 (f1 INTEGER PRIMARY KEY DEFAULT 0, f2 char(12));
+
+--connection node_2
+
+SET GLOBAL wsrep_applier_retry_count = 2;
+SET GLOBAL debug_dbug = '';
+SET GLOBAL debug_dbug = "d,apply_event_fail_always:o,/dev/null";
+
+--connection node_1
+START TRANSACTION;
+INSERT INTO t2 (f1, f2) VALUES (1, 'a'), (2, 'b');
+COMMIT;
+
+# node_2 should leave the cluster now; wait until node_1 sees a cluster size of 1
+--let $wait_condition = SELECT VARIABLE_VALUE = 1 FROM performance_schema.global_status WHERE VARIABLE_NAME = 'wsrep_cluster_size'
+--source include/wait_condition.inc
+
+# the node_2 server is still reachable here: shut it down and restart it
+--connection node_2
+--echo Shutting down server ...
+SET wsrep_on=OFF;
+--source include/shutdown_mysqld.inc
+--source include/wait_until_disconnected.inc
+--remove_file $MYSQLTEST_VARDIR/mysqld.2/data/grastate.dat
+--echo Restarting server ...
+--source include/start_mysqld.inc
+
+# wait until node_1 sees a cluster of two nodes again
+--connection node_1
+SET wsrep_sync_wait=0;
+--let $wait_condition = SELECT VARIABLE_VALUE = 2 FROM performance_schema.global_status WHERE VARIABLE_NAME = 'wsrep_cluster_size'
+--source include/wait_condition.inc
+
+--connection node_2
+--let $wait_condition = SELECT COUNT(*) = 2 FROM t2;
+--source include/wait_condition.inc
+SELECT COUNT(*) AS expect_2 FROM t2;
+SET GLOBAL debug_dbug = DEFAULT;
+
+# Cleanup. NOTE(review): wsrep_applier_retry_count was only ever changed on
+# node_2 (restarted since), so the reset on node_1 looks redundant - confirm.
+#
+
+--connection node_1
+SET GLOBAL wsrep_applier_retry_count = 0;
+SET DEBUG_SYNC = 'RESET';
+DROP TABLE t2;
diff --git a/mysql-test/suite/sys_vars/r/sysvars_wsrep.result b/mysql-test/suite/sys_vars/r/sysvars_wsrep.result
@@ -16,6 +16,21 @@ ENUM_VALUE_LIST	NULL
 READ_ONLY	YES
 COMMAND_LINE_ARGUMENT	REQUIRED
 GLOBAL_VALUE_PATH	NULL
+VARIABLE_NAME	WSREP_APPLIER_RETRY_COUNT
+SESSION_VALUE	NULL
+GLOBAL_VALUE	0
+GLOBAL_VALUE_ORIGIN	COMPILE-TIME
+DEFAULT_VALUE	0
+VARIABLE_SCOPE	GLOBAL
+VARIABLE_TYPE	INT UNSIGNED
+VARIABLE_COMMENT	Maximum number of applier retry attempts
+NUMERIC_MIN_VALUE	0
+NUMERIC_MAX_VALUE	4294967295
+NUMERIC_BLOCK_SIZE	1
+ENUM_VALUE_LIST	NULL
+READ_ONLY	NO
+COMMAND_LINE_ARGUMENT	OPTIONAL
+GLOBAL_VALUE_PATH	NULL
 VARIABLE_NAME	WSREP_AUTO_INCREMENT_CONTROL
 SESSION_VALUE	NULL
 GLOBAL_VALUE	ON

diff --git a/sql/log_event_server.cc b/sql/log_event_server.cc
@@ -4925,7 +4925,33 @@ int Rows_log_event::do_apply_event(rpl_group_info *rgi)
       /* remove trigger's tables */
       goto err;
     }
-
+#ifdef WITH_WSREP
+    DBUG_EXECUTE_IF("apply_event_fail_once", { /* debug-only fault injection */
+        if (WSREP(thd)) {
+	  RPL_TABLE_LIST *ptr= static_cast<RPL_TABLE_LIST*>(rgi->tables_to_lock);
+	  error= HA_ERR_LOCK_WAIT_TIMEOUT; /* simulated failure code */
+          slave_rows_error_report(
+            INFORMATION_LEVEL, error, rgi, thd, ptr->table,
+	    get_type_str(), RPL_LOG_NAME, log_pos);
+	  my_error(error, MYF(0));
+	  thd->is_slave_error= 1;
+	  DBUG_SET("-d,apply_event_fail_once"); /* keyword off again: fail only once */
+	  goto err;
+        }
+      };);
+    DBUG_EXECUTE_IF("apply_event_fail_always", { /* same, but keyword stays set */
+        if (WSREP(thd)) {
+	  RPL_TABLE_LIST *ptr= static_cast<RPL_TABLE_LIST*>(rgi->tables_to_lock);
+	  error= HA_ERR_LOCK_WAIT_TIMEOUT; /* simulated failure code */
+          slave_rows_error_report(
+            INFORMATION_LEVEL, error, rgi, thd, ptr->table,
+	    get_type_str(), RPL_LOG_NAME, log_pos);
+	  my_error(error, MYF(0));
+	  thd->is_slave_error= 1;
+	  goto err;
+        }
+      };);
+#endif /* WITH_WSREP */
     /*
       When the open and locking succeeded, we check all tables to
       ensure that they still have the correct type.

diff --git a/sql/sys_vars.cc b/sql/sys_vars.cc
@@ -6493,6 +6493,12 @@ static Sys_var_charptr Sys_wsrep_allowlist(
        READ_ONLY GLOBAL_VAR(wsrep_allowlist), CMD_LINE(REQUIRED_ARG),
        DEFAULT(""));
 
+static Sys_var_uint Sys_wsrep_applier_retry_count(
+       "wsrep_applier_retry_count", "Maximum number of applier retry attempts",
+       GLOBAL_VAR(wsrep_applier_retry_count), /* global scope only */
+       CMD_LINE(OPT_ARG), VALID_RANGE(0, UINT_MAX), DEFAULT(0),
+       BLOCK_SIZE(1), NO_MUTEX_GUARD, NOT_IN_BINLOG);
+
 #endif /* WITH_WSREP */
 
 static bool fix_host_cache_size(sys_var *, THD *, enum_var_type)

diff --git a/sql/transaction.cc b/sql/transaction.cc
@@ -783,3 +783,14 @@ bool trans_release_savepoint(THD *thd, LEX_CSTRING name)
 
   DBUG_RETURN(MY_TEST(res));
 }
+
+#ifdef WITH_WSREP
+/* Tell whether the current transaction has a savepoint with the given name. */
+bool trans_savepoint_exists(THD *thd, LEX_CSTRING name)
+{
+  /* the savepoint exists when the slot returned by the lookup is non-NULL */
+  SAVEPOINT *found= *find_savepoint(thd, name);
+  return found != NULL;
+}
+#endif /* WITH_WSREP */
+
diff --git a/sql/transaction.h b/sql/transaction.h
@@ -38,6 +38,9 @@ bool trans_rollback_stmt(THD *thd);
 bool trans_savepoint(THD *thd, LEX_CSTRING name);
 bool trans_rollback_to_savepoint(THD *thd, LEX_CSTRING name);
 bool trans_release_savepoint(THD *thd, LEX_CSTRING name);
+#ifdef WITH_WSREP
+bool trans_savepoint_exists(THD *thd, LEX_CSTRING name);
+#endif /* WITH_WSREP */
 
 void trans_reset_one_shot_chistics(THD *thd);