
Commit 4cab8da

Merge pull request #183 from keynslug/fix/race-system-limit
fix(bootstrap): wait core tables are ready before copying
2 parents: ad1cbd7 + 6084346

3 files changed (+58 −6 lines)


src/mria_rlog.hrl

+3 −3
@@ -25,9 +25,9 @@
 -define(unexpected_event_kind, "Mria worker received unexpected event").
 -define(unexpected_event_tp(Params),
         ?tp(warning, ?unexpected_event_kind,
-            Params#{ process => ?MODULE
-                   , callback => ?FUNCTION_NAME
-                   })).
+            (begin Params end)#{ process => ?MODULE
+                               , callback => ?FUNCTION_NAME
+                               })).
 
 -define(terminate_tp,
         ?tp(debug, mria_worker_terminate, #{process => ?MODULE, callback => terminate})).
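
The header change above wraps the macro argument in begin ... end before the map update. A plausible reading (not spelled out in the commit) is that Erlang only allows the map-update syntax Expr#{...} on a maximal expression, so the old Params#{...} stops compiling as soon as a caller passes something like a function call for Params. The sketch below is a hypothetical, self-contained module (names are illustrative, not from mria) showing the pattern:

%% Hypothetical demo, not part of mria: why a macro that updates a map wraps
%% its argument in `begin ... end`.
-module(macro_map_update_demo).
-export([tag/1]).

%% Without the wrapper this would expand to `maps:from_list(Pairs)#{tag => demo}`,
%% which does not parse, because a function call is not a maximal expression:
-define(with_tag(Params), (begin Params end)#{tag => demo}).

tag(Pairs) when is_list(Pairs) ->
    ?with_tag(maps:from_list(Pairs)).

With the wrapper in place, ?unexpected_event_tp/1 should accept any expression that evaluates to a map, not only a bound variable.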

src/mria_schema.erl

+8 −3
@@ -317,19 +317,20 @@ converge_schema(Entries, InitialState) ->
 %% is needed so we can replicate schema updates just like regular
 %% transactions.
 bootstrap() ->
-    Storage = ram_copies,
     Opts = [{type, ordered_set},
             {record_name, ?schema},
             {attributes, record_info(fields, ?schema)}
            ],
     MetaSpec = #?schema{ mnesia_table = ?schema
                        , shard = ?mria_meta_shard
-                       , storage = Storage
+                       , storage = ram_copies
                        , config = Opts
                        },
     %% Create (or copy) the mnesia table and wait for it:
     ok = create_table(MetaSpec),
-    ok = mria_mnesia:copy_table(?schema, Storage),
+    %% Ensure replicas are available before starting copy:
+    ok = mria_mnesia:wait_for_tables([?schema]),
+    ok = mria_mnesia:copy_table(?schema, ram_copies),
     RlogSyncOpts = [{record_name, ?rlog_sync},
                     {attributes, record_info(fields, ?rlog_sync)}
                    ],
@@ -339,6 +340,10 @@
                        , config = RlogSyncOpts
                        },
     ok = create_table(RlogSyncSpec),
+    %% Ensure replicas are available before starting copy:
+    %% If we've managed to sync only mnesia schema up to this point, `copy_table/2` may
+    %% fail if other nodes suddenly become unavailable.
+    ok = mria_mnesia:wait_for_tables([?rlog_sync]),
     ok = mria_mnesia:copy_table(?rlog_sync, null_copies),
     mria_mnesia:wait_for_tables([?schema, ?rlog_sync]),
     %% Seed the table with the metadata:
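
The schema change above is the heart of the fix: bootstrap now calls mria_mnesia:wait_for_tables/1 before each copy_table/2, so copying no longer races with tables that are still being loaded from remote nodes. The sketch below shows the same wait-then-copy idea expressed with the stock mnesia API only; the module, helper name, and timeout are assumptions for illustration, not mria's implementation:

%% Hypothetical helper, not the mria code: wait until a table is loaded before
%% adding a local replica, and tolerate a replica that already exists.
-module(wait_then_copy_demo).
-export([ensure_local_copy/2]).

ensure_local_copy(Table, Storage) ->
    %% Block until the table is loaded locally, or give up after 30 seconds:
    case mnesia:wait_for_tables([Table], 30_000) of
        ok ->
            case mnesia:add_table_copy(Table, node(), Storage) of
                {atomic, ok}                      -> ok;
                {aborted, {already_exists, _, _}} -> ok;
                {aborted, Reason}                 -> {error, Reason}
            end;
        {timeout, BadTables} ->
            {error, {timeout, BadTables}};
        {error, Reason} ->
            {error, Reason}
    end.

As the added comment in the diff notes, skipping the wait means the copy can fail when only the mnesia schema has been synced and the other nodes suddenly become unavailable; that is the race the new test case below reproduces.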

test/mria_mnesia_SUITE.erl

+47
@@ -99,6 +99,53 @@ t_join_after_node_down(_) ->
        end,
        []).
 
+%% Start a cluster of two nodes, then join the third, and simulate two nodes went down
+%% right after the third one joined. Restore them and verify the third one is healthy.
+t_cluster_down_after_join(_) ->
+    ClusterEnv = mria_mnesia_test_util:common_env(),
+    Cluster = [C1, C2, C3] = mria_ct:cluster([core, core, core], ClusterEnv),
+    ?check_trace(
+       #{timetrap => 10_000},
+       try
+           %% Prepare cluster with 3 nodes:
+           Ns = [N1, N2, N3] = mria_ct:start_cluster(node, Cluster),
+           ?assertEqual([{ok, ok} || _ <- Ns], erpc:multicall(Ns, mria, start, [])),
+           %% Join together first 2 nodes:
+           ?assertEqual(ok, erpc:call(N1, mria, join, [N2])),
+           ?assertEqual([N1, N2], lists:sort(erpc:call(N1, mria_mnesia, running_nodes, []))),
+           ?assertEqual(ok, erpc:call(N1, mria_transaction_gen, init, [])),
+           %% Tell N3 to join but simulate it goes down after joining but before bootstrap:
+           ?assertEqual(ok, erpc:call(N3, meck, new, [mria_app, [no_link, passthrough]])),
+           ?assertEqual(ok, erpc:call(N3, meck, expect, [mria_app, start, fun ?MODULE:suicide/2])),
+           %% Node N3 expectedly dies:
+           ?assertError({erpc, _}, erpc:call(N3, mria, join, [N1])),
+           ?assertError({erpc, _}, erpc:call(N3, mria_mnesia, running_nodes, [])),
+           %% Tell N1 and N2 to stop:
+           ?assertEqual(ok, erpc:call(N1, mria, stop, [])),
+           ?assertEqual(ok, erpc:call(N2, mria, stop, [])),
+           ?assertEqual([ok, ok], [slave:stop(N) || N <- [N1, N2]]),
+           %% Restart N3 and tell mria to start:
+           N3 = mria_ct:start_slave(node, C3),
+           %% This will hang waiting for N1 or N2 to go online, thus `cast/4`:
+           ?assertEqual(ok, erpc:cast(N3, mria, start, [])),
+           %% Tell N1 and N2 to get back up:
+           [N1, N2] = [mria_ct:start_slave(node, C) || C <- [C1, C2]],
+           %% Again, use `cast/4` to avoid hanging waiting for another node:
+           ?assertEqual(ok, erpc:cast(N1, mria, start, [])),
+           ?assertEqual(ok, erpc:cast(N2, mria, start, [])),
+           ?assertEqual(ok, erpc:call(N3, mria, start, [])),
+           %% Verify that bootstrap process has finished and the node is alive:
+           _ = erpc:call(N3, sys, get_state, [mria_schema]),
+           ?assertEqual([N1, N2, N3], lists:sort(erpc:call(N3, mria_mnesia, running_nodes, []))),
+           ok
+       after
+           ok = mria_ct:teardown_cluster(Cluster)
+       end,
+       []).
+
+suicide(_Type, _Args) ->
+    erlang:halt().
+
 t_diagnosis_tab(_)->
     TestTab = test_tab_1,
     Cluster = [NS1, NS2] = mria_ct:cluster([core, core], []),
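
For reference, the new regression test can presumably be run on its own with Common Test from an Erlang shell, assuming the suite and its helpers (mria_ct, mria_mnesia_test_util, meck) are compiled and on the code path; the repository's own make or rebar3 targets may differ:

%% Assumed invocation, not taken from the repository's build files:
1> ct:run_test([{suite, mria_mnesia_SUITE}, {testcase, t_cluster_down_after_join}]).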
