Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Galera feature: retry applying of write sets at slave nodes #387

Open
wants to merge 1 commit into
base: 11.4
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions mysql-test/suite/galera/r/galera_defaults.result
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ AND VARIABLE_NAME NOT IN (
ORDER BY VARIABLE_NAME;
VARIABLE_NAME VARIABLE_VALUE
WSREP_ALLOWLIST
WSREP_APPLIER_RETRY_COUNT 0
WSREP_AUTO_INCREMENT_CONTROL ON
WSREP_CERTIFICATION_RULES strict
WSREP_CERTIFY_NONPK ON
Expand Down
73 changes: 73 additions & 0 deletions mysql-test/suite/galera/r/galera_retry_applying.result
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
connection node_2;
connection node_1;
CALL mtr.add_suppression("Event .* Update_rows.* apply failed");
CALL mtr.add_suppression("Inconsistency detected");
CALL mtr.add_suppression("Failed to apply write set:.*");
CALL mtr.add_suppression("Event .* Write_rows.* apply failed");
CREATE TABLE t1 (f1 INTEGER PRIMARY KEY DEFAULT 0, f2 char(12));
CREATE TABLE t3 (f1 INTEGER PRIMARY KEY DEFAULT 0, f2 char(12));
START TRANSACTION;
INSERT INTO t3 (f1, f2) VALUES (1, 'a');
INSERT INTO t3 (f1, f2) VALUES (2, 'b');
INSERT INTO t3 (f1, f2) VALUES (3, 'c'), (4, 'd'), (5, 'e');
COMMIT;
connection node_2;
SET GLOBAL wsrep_applier_retry_count = 2;
SET GLOBAL debug_dbug = "d,apply_event_fail_once:o,/dev/null";
connection node_1;
START TRANSACTION;
UPDATE t3 SET f2 = 'ax' WHERE f1 = 1;
UPDATE t3 SET f2 = 'bx' WHERE f1 = 2;
INSERT INTO t1 (f1, f2) VALUES (3, 'c'), (4, 'd'), (5, 'e');
UPDATE t3 SET f2 = 'cx' WHERE f1 = 3;
UPDATE t3 SET f2 = 'dx' WHERE f1 = 4;
DELETE FROM t3 WHERE f1 = 5;
COMMIT;
connection node_2;
connection node_1;
SELECT COUNT(*) AS expect_3 FROM t1;
expect_3
3
SELECT COUNT(*) AS expect_4 FROM t3;
expect_4
4
connection node_2;
SELECT COUNT(*) AS expect_3 FROM t1;
expect_3
3
SELECT COUNT(*) AS expect_4 FROM t3;
expect_4
4
connection node_1;
DROP TABLE t1;
DROP TABLE t3;
connection node_2;
Shutting down server ...
SET wsrep_on=OFF;
Restarting server ...
connection node_1;
SET wsrep_sync_wait=0;
CREATE TABLE t2 (f1 INTEGER PRIMARY KEY DEFAULT 0, f2 char(12));
connection node_2;
SET GLOBAL wsrep_applier_retry_count = 2;
SET GLOBAL debug_dbug = '';
SET GLOBAL debug_dbug = "d,apply_event_fail_always:o,/dev/null";
connection node_1;
START TRANSACTION;
INSERT INTO t2 (f1, f2) VALUES (1, 'a'), (2, 'b');
COMMIT;
connection node_2;
Shutting down server ...
SET wsrep_on=OFF;
Restarting server ...
connection node_1;
SET wsrep_sync_wait=0;
connection node_2;
SELECT COUNT(*) AS expect_2 FROM t2;
expect_2
2
SET GLOBAL debug_dbug = DEFAULT;
connection node_1;
SET GLOBAL wsrep_applier_retry_count = 0;
SET DEBUG_SYNC = 'RESET';
DROP TABLE t2;
136 changes: 136 additions & 0 deletions mysql-test/suite/galera/t/galera_retry_applying.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
#
# Test retrying applying of a transaction
#

--source include/galera_cluster.inc
--source include/have_debug_sync.inc

CALL mtr.add_suppression("Event .* Update_rows.* apply failed");
CALL mtr.add_suppression("Inconsistency detected");
CALL mtr.add_suppression("Failed to apply write set:.*");
CALL mtr.add_suppression("Event .* Write_rows.* apply failed");

#
# Case 1: Retrying succeeds after one retry event, no error is raised.
#
CREATE TABLE t1 (f1 INTEGER PRIMARY KEY DEFAULT 0, f2 char(12));
CREATE TABLE t3 (f1 INTEGER PRIMARY KEY DEFAULT 0, f2 char(12));

START TRANSACTION;
INSERT INTO t3 (f1, f2) VALUES (1, 'a');
INSERT INTO t3 (f1, f2) VALUES (2, 'b');
INSERT INTO t3 (f1, f2) VALUES (3, 'c'), (4, 'd'), (5, 'e');
COMMIT;

# wait till the insert transaction has been replicated and committed in node_2
--connection node_2
--let $wait_condition = SELECT COUNT(*) > 0 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't3';
--source include/wait_condition.inc
--let $wait_condition = SELECT COUNT(*) > 0 FROM t3;
--source include/wait_condition.inc

SET GLOBAL wsrep_applier_retry_count = 2;
SET GLOBAL debug_dbug = "d,apply_event_fail_once:o,/dev/null";
temeo marked this conversation as resolved.
Show resolved Hide resolved

--connection node_1
START TRANSACTION;
UPDATE t3 SET f2 = 'ax' WHERE f1 = 1;
UPDATE t3 SET f2 = 'bx' WHERE f1 = 2;
INSERT INTO t1 (f1, f2) VALUES (3, 'c'), (4, 'd'), (5, 'e');
UPDATE t3 SET f2 = 'cx' WHERE f1 = 3;
UPDATE t3 SET f2 = 'dx' WHERE f1 = 4;
DELETE FROM t3 WHERE f1 = 5;
COMMIT;

# wait till the transaction has been replicated and committed in node_2
--connection node_2
--let $wait_condition = SELECT COUNT(*) = 4 FROM t3;
--source include/wait_condition.inc

--connection node_1
SELECT COUNT(*) AS expect_3 FROM t1;
SELECT COUNT(*) AS expect_4 FROM t3;

--connection node_2
SELECT COUNT(*) AS expect_3 FROM t1;
SELECT COUNT(*) AS expect_4 FROM t3;

#
# Cleanup after Case 1.
#

--connection node_1
DROP TABLE t1;
DROP TABLE t3;

# shut down node 2 and restart it
--connection node_2
--let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't3';
--source include/wait_condition.inc
--echo Shutting down server ...
SET wsrep_on=OFF;
temeo marked this conversation as resolved.
Show resolved Hide resolved
--source include/shutdown_mysqld.inc
--remove_file $MYSQLTEST_VARDIR/mysqld.2/data/grastate.dat
--echo Restarting server ...
--source include/start_mysqld.inc

# wait till node 2 is back in the cluster
--connection node_1
SET wsrep_sync_wait=0;
--let $wait_condition = SELECT VARIABLE_VALUE = 2 FROM performance_schema.global_status WHERE VARIABLE_NAME = 'wsrep_cluster_size'
--source include/wait_condition.inc


#
# Case 2: The slave retries applying a transaction multiple times. All
# retry attempts fail, so applying the transaction fails with the
# expected error.
#

CREATE TABLE t2 (f1 INTEGER PRIMARY KEY DEFAULT 0, f2 char(12));

--connection node_2

SET GLOBAL wsrep_applier_retry_count = 2;
SET GLOBAL debug_dbug = '';
SET GLOBAL debug_dbug = "d,apply_event_fail_always:o,/dev/null";

--connection node_1
START TRANSACTION;
INSERT INTO t2 (f1, f2) VALUES (1, 'a'), (2, 'b');
COMMIT;

# node 2 should crash now; wait until the cluster size drops to 1
--let $wait_condition = SELECT VARIABLE_VALUE = 1 FROM performance_schema.global_status WHERE VARIABLE_NAME = 'wsrep_cluster_size'
--source include/wait_condition.inc

# restart node 2
--connection node_2
--echo Shutting down server ...
SET wsrep_on=OFF;
temeo marked this conversation as resolved.
Show resolved Hide resolved
--source include/shutdown_mysqld.inc
--source include/wait_until_disconnected.inc
--remove_file $MYSQLTEST_VARDIR/mysqld.2/data/grastate.dat
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is a race condition here: a transaction was just committed on node_1, and node_2 is now shut down, but nothing checks the fate of the replicated INSERT from node_1. It may still be replicating, may currently be applying, or may already have been committed on node_2. For deterministic test behavior, the state of the INSERT transaction should be synchronized here, or, if it does not matter for the test result and can safely be ignored, that should be documented.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, as this test phase is supposed to cause a certain applier failure, the node should crash, so there is no need to shut it down anymore.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed table names in row0ins.c, but DBUG_EXECUTE_IF macros still
mention the test database test. There are now two separate DBUG_EXECUTE_IF labels:
innodb_insert_fail_once - for failing INSERT once inside InnoDB, and
innodb_insert_fail_always - for failing INSERT always inside InnoDB.

Synchronization.
There are now 4 synchronization points in the MTR test, two in each of
the 2 test cases:

  1. Test case 1: wait till the insert transaction has been replicated and committed in node_2 (line 25)

  2. Test case 1: wait till the transaction has been replicated and committed in node_2 (line 44)

  3. Test case 2: wait for node_2 to crash (line 54)

  4. Test case 2: wait till node 2 is back in the cluster (line 120)

The test does not work without shutting down the server on node 2 after applier failure.

--echo Restarting server ...
--source include/start_mysqld.inc

# wait till node 2 is back in the cluster
--connection node_1
SET wsrep_sync_wait=0;
--let $wait_condition = SELECT VARIABLE_VALUE = 2 FROM performance_schema.global_status WHERE VARIABLE_NAME = 'wsrep_cluster_size'
--source include/wait_condition.inc

--connection node_2
--let $wait_condition = SELECT COUNT(*) = 2 FROM t2;
--source include/wait_condition.inc
SELECT COUNT(*) AS expect_2 FROM t2;
SET GLOBAL debug_dbug = DEFAULT;

#
# Cleanup
#

--connection node_1
SET GLOBAL wsrep_applier_retry_count = 0;
SET DEBUG_SYNC = 'RESET';
DROP TABLE t2;
15 changes: 15 additions & 0 deletions mysql-test/suite/sys_vars/r/sysvars_wsrep.result
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,21 @@ ENUM_VALUE_LIST NULL
READ_ONLY YES
COMMAND_LINE_ARGUMENT REQUIRED
GLOBAL_VALUE_PATH NULL
VARIABLE_NAME WSREP_APPLIER_RETRY_COUNT
SESSION_VALUE NULL
GLOBAL_VALUE 0
GLOBAL_VALUE_ORIGIN COMPILE-TIME
DEFAULT_VALUE 0
VARIABLE_SCOPE GLOBAL
VARIABLE_TYPE INT UNSIGNED
VARIABLE_COMMENT Maximum number of applier retry attempts
NUMERIC_MIN_VALUE 0
NUMERIC_MAX_VALUE 4294967295
NUMERIC_BLOCK_SIZE 1
ENUM_VALUE_LIST NULL
READ_ONLY NO
COMMAND_LINE_ARGUMENT OPTIONAL
GLOBAL_VALUE_PATH NULL
VARIABLE_NAME WSREP_AUTO_INCREMENT_CONTROL
SESSION_VALUE NULL
GLOBAL_VALUE ON
Expand Down
28 changes: 27 additions & 1 deletion sql/log_event_server.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4925,7 +4925,33 @@ int Rows_log_event::do_apply_event(rpl_group_info *rgi)
/* remove trigger's tables */
goto err;
}

#ifdef WITH_WSREP
/*
  Debug-only fault injection, active when the debug keyword
  "apply_event_fail_once" is set and WSREP(thd) is true.

  It makes this call fail as if a lock wait timeout had happened: error is
  set to HA_ERR_LOCK_WAIT_TIMEOUT, the failure is reported through
  slave_rows_error_report() at INFORMATION_LEVEL, my_error() is raised,
  thd->is_slave_error is set, and control jumps to the err label.

  Before jumping, DBUG_SET("-d,apply_event_fail_once") switches the keyword
  off again so that this injection fires only once.
  NOTE(review): whether DBUG_SET affects only the current thread or the
  global debug state is not visible here -- confirm.
  NOTE(review): ptr is dereferenced without a NULL check; presumably
  rgi->tables_to_lock is non-empty at this point -- confirm.
*/
DBUG_EXECUTE_IF("apply_event_fail_once", {
if (WSREP(thd)) {
RPL_TABLE_LIST *ptr= static_cast<RPL_TABLE_LIST*>(rgi->tables_to_lock);
error= HA_ERR_LOCK_WAIT_TIMEOUT;
slave_rows_error_report(
INFORMATION_LEVEL, error, rgi, thd, ptr->table,
get_type_str(), RPL_LOG_NAME, log_pos);
my_error(error, MYF(0));
thd->is_slave_error= 1;
DBUG_SET("-d,apply_event_fail_once");
goto err;
}
};);
/*
  Same injection as above, keyed on "apply_event_fail_always". The keyword
  is NOT cleared here, so the failure is injected every time this point is
  reached while the keyword remains set.
  NOTE(review): the body duplicates the block above except for the missing
  DBUG_SET; a shared helper would avoid the two copies drifting apart.
*/
DBUG_EXECUTE_IF("apply_event_fail_always", {
if (WSREP(thd)) {
RPL_TABLE_LIST *ptr= static_cast<RPL_TABLE_LIST*>(rgi->tables_to_lock);
error= HA_ERR_LOCK_WAIT_TIMEOUT;
slave_rows_error_report(
INFORMATION_LEVEL, error, rgi, thd, ptr->table,
get_type_str(), RPL_LOG_NAME, log_pos);
my_error(error, MYF(0));
thd->is_slave_error= 1;
goto err;
}
};);
#endif /* WITH_WSREP */
/*
When the open and locking succeeded, we check all tables to
ensure that they still have the correct type.
Expand Down
6 changes: 6 additions & 0 deletions sql/sys_vars.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6493,6 +6493,12 @@ static Sys_var_charptr Sys_wsrep_allowlist(
READ_ONLY GLOBAL_VAR(wsrep_allowlist), CMD_LINE(REQUIRED_ARG),
DEFAULT(""));

/*
  wsrep_applier_retry_count: global-scope unsigned integer system variable
  bound to the wsrep_applier_retry_count global. Its help text is "Maximum
  number of applier retry attempts". It accepts 0..UINT_MAX in steps of 1,
  defaults to 0, and takes an optional argument on the command line.
  NOTE(review): 0 presumably means "never retry" -- confirm against the
  applier code, which is not part of this hunk.
*/
static Sys_var_uint Sys_wsrep_applier_retry_count (
"wsrep_applier_retry_count", "Maximum number of applier retry attempts",
GLOBAL_VAR(wsrep_applier_retry_count), CMD_LINE(OPT_ARG),
VALID_RANGE(0, UINT_MAX), DEFAULT(0), BLOCK_SIZE(1),
NO_MUTEX_GUARD, NOT_IN_BINLOG);

#endif /* WITH_WSREP */

static bool fix_host_cache_size(sys_var *, THD *, enum_var_type)
Expand Down
11 changes: 11 additions & 0 deletions sql/transaction.cc
Original file line number Diff line number Diff line change
Expand Up @@ -783,3 +783,14 @@ bool trans_release_savepoint(THD *thd, LEX_CSTRING name)

DBUG_RETURN(MY_TEST(res));
}

#ifdef WITH_WSREP
/**
  Report whether a savepoint called @p name exists for the current
  transaction of @p thd.

  The lookup is delegated to find_savepoint(); the slot it returns is
  dereferenced unconditionally, so this relies on find_savepoint() never
  returning a NULL slot pointer (TODO confirm against its definition).

  @param thd   session whose savepoints are searched
  @param name  name of the savepoint to look for

  @return true when the slot returned by find_savepoint() holds a
          savepoint, false when it is empty
*/
bool trans_savepoint_exists(THD *thd, LEX_CSTRING name)
{
  SAVEPOINT *const found= *find_savepoint(thd, name);
  return found != NULL;
}
#endif /* WITH_WSREP */

3 changes: 3 additions & 0 deletions sql/transaction.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ bool trans_rollback_stmt(THD *thd);
bool trans_savepoint(THD *thd, LEX_CSTRING name);
bool trans_rollback_to_savepoint(THD *thd, LEX_CSTRING name);
bool trans_release_savepoint(THD *thd, LEX_CSTRING name);
#ifdef WITH_WSREP
bool trans_savepoint_exists(THD *thd, LEX_CSTRING name);
#endif /* WITH_WSREP */

void trans_reset_one_shot_chistics(THD *thd);

Expand Down
Loading