Skip to content

Commit ede5154

Browse files
committed
Avoid disconnect on message timeout while waiting on monitor/signer
When sending/receiving `commitment_signed`/`revoke_and_ack`, we may expect the counterparty to follow up with one of their own in response. In some cases, they are not allowed to send it because they are actually waiting for one from us. Async monitor updates and signing requests may result in the message we need to send to the counterparty being delayed, and our disconnect logic previously did not consider that. It doesn't make sense to disconnect our counterparty when we're the ones seemingly blocking progress. This commit ensures we no longer disconnect when we're waiting on an async monitor update or signing request, unless we're negotiating quiescence. Note that while our counterparty is still able to enforce a similar disconnect logic on their side, as they have no insight into why we're not able to make progress, this commit at least helps prevent reconnection cycles with those that don't enforce one.
1 parent c4d23bc commit ede5154

File tree

3 files changed

+174
-41
lines changed

3 files changed

+174
-41
lines changed

Diff for: lightning/src/ln/async_signer_tests.rs

+128
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ use crate::chain::ChannelMonitorUpdateStatus;
2121
use crate::events::bump_transaction::WalletSource;
2222
use crate::events::{ClosureReason, Event};
2323
use crate::ln::chan_utils::ClosingTransaction;
24+
use crate::ln::channel::DISCONNECT_PEER_AWAITING_RESPONSE_TICKS;
2425
use crate::ln::channel_state::{ChannelDetails, ChannelShutdownState};
2526
use crate::ln::channelmanager::{PaymentId, RAACommitmentOrder, RecipientOnionFields};
2627
use crate::ln::msgs::{BaseMessageHandler, ChannelMessageHandler, MessageSendEvent};
@@ -1091,3 +1092,130 @@ fn do_test_closing_signed(extra_closing_signed: bool, reconnect: bool) {
10911092
check_closed_event!(nodes[0], 1, ClosureReason::LocallyInitiatedCooperativeClosure, [nodes[1].node.get_our_node_id()], 100000);
10921093
check_closed_event!(nodes[1], 1, ClosureReason::CounterpartyInitiatedCooperativeClosure, [nodes[0].node.get_our_node_id()], 100000);
10931094
}
1095+
1096+
#[test]
1097+
fn test_no_disconnect_while_async_revoke_and_ack_expecting_remote_commitment_signed() {
1098+
// Nodes with async signers may be expecting to receive a `commitment_signed` from the
1099+
// counterparty even if a `revoke_and_ack` has yet to be sent due to an async signer. Test that
1100+
// we don't disconnect the async signer node due to not receiving the `commitment_signed` within
1101+
// the timeout while the `revoke_and_ack` is not ready.
1102+
let chanmon_cfgs = create_chanmon_cfgs(2);
1103+
let node_cfgs = create_node_cfgs(2, &chanmon_cfgs);
1104+
let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]);
1105+
let nodes = create_network(2, &node_cfgs, &node_chanmgrs);
1106+
let chan_id = create_announced_chan_between_nodes(&nodes, 0, 1).2;
1107+
1108+
let node_id_0 = nodes[0].node.get_our_node_id();
1109+
let node_id_1 = nodes[1].node.get_our_node_id();
1110+
1111+
let payment_amount = 1_000_000;
1112+
send_payment(&nodes[0], &[&nodes[1]], payment_amount * 4);
1113+
1114+
nodes[1].disable_channel_signer_op(&node_id_0, &chan_id, SignerOp::ReleaseCommitmentSecret);
1115+
1116+
// We'll send a payment from both nodes to each other.
1117+
let (route1, payment_hash1, _, payment_secret1) =
1118+
get_route_and_payment_hash!(&nodes[0], &nodes[1], payment_amount);
1119+
let onion1 = RecipientOnionFields::secret_only(payment_secret1);
1120+
let payment_id1 = PaymentId(payment_hash1.0);
1121+
nodes[0].node.send_payment_with_route(route1, payment_hash1, onion1, payment_id1).unwrap();
1122+
check_added_monitors(&nodes[0], 1);
1123+
1124+
let (route2, payment_hash2, _, payment_secret2) =
1125+
get_route_and_payment_hash!(&nodes[1], &nodes[0], payment_amount);
1126+
let onion2 = RecipientOnionFields::secret_only(payment_secret2);
1127+
let payment_id2 = PaymentId(payment_hash2.0);
1128+
nodes[1].node.send_payment_with_route(route2, payment_hash2, onion2, payment_id2).unwrap();
1129+
check_added_monitors(&nodes[1], 1);
1130+
1131+
let update = get_htlc_update_msgs!(&nodes[0], node_id_1);
1132+
nodes[1].node.handle_update_add_htlc(node_id_0, &update.update_add_htlcs[0]);
1133+
nodes[1].node.handle_commitment_signed_batch_test(node_id_0, &update.commitment_signed);
1134+
check_added_monitors(&nodes[1], 1);
1135+
1136+
let update = get_htlc_update_msgs!(&nodes[1], node_id_0);
1137+
nodes[0].node.handle_update_add_htlc(node_id_1, &update.update_add_htlcs[0]);
1138+
nodes[0].node.handle_commitment_signed_batch_test(node_id_1, &update.commitment_signed);
1139+
check_added_monitors(&nodes[0], 1);
1140+
1141+
// nodes[0] can only respond with a `revoke_and_ack`. The `commitment_signed` that would follow
1142+
// is blocked on receiving a counterparty `revoke_and_ack`, which nodes[1] is still pending on.
1143+
let revoke_and_ack = get_event_msg!(&nodes[0], MessageSendEvent::SendRevokeAndACK, node_id_1);
1144+
nodes[1].node.handle_revoke_and_ack(node_id_0, &revoke_and_ack);
1145+
check_added_monitors(&nodes[1], 1);
1146+
1147+
assert!(nodes[0].node.get_and_clear_pending_msg_events().is_empty());
1148+
assert!(nodes[1].node.get_and_clear_pending_msg_events().is_empty());
1149+
1150+
// nodes[0] will disconnect the counterparty as it's waiting on a `revoke_and_ack`.
1151+
// nodes[1] is waiting on a `commitment_signed`, but since it hasn't yet sent its own
1152+
// `revoke_and_ack`, it shouldn't disconnect yet.
1153+
for _ in 0..DISCONNECT_PEER_AWAITING_RESPONSE_TICKS {
1154+
nodes[0].node.timer_tick_occurred();
1155+
nodes[1].node.timer_tick_occurred();
1156+
}
1157+
let has_disconnect_event = |event| {
1158+
matches!(
1159+
event, MessageSendEvent::HandleError { action , .. }
1160+
if matches!(action, msgs::ErrorAction::DisconnectPeerWithWarning { .. })
1161+
)
1162+
};
1163+
assert!(nodes[0].node.get_and_clear_pending_msg_events().into_iter().any(has_disconnect_event));
1164+
assert!(nodes[1].node.get_and_clear_pending_msg_events().is_empty());
1165+
}
1166+
1167+
#[test]
1168+
fn test_no_disconnect_while_async_commitment_signed_expecting_remote_revoke_and_ack() {
1169+
// Nodes with async signers may be expecting to receive a `revoke_and_ack` from the
1170+
// counterparty even if a `commitment_signed` has yet to be sent due to an async signer. Test
1171+
// that we don't disconnect the async signer node due to not receiving the `revoke_and_ack`
1172+
// within the timeout while the `commitment_signed` is not ready.
1173+
let chanmon_cfgs = create_chanmon_cfgs(2);
1174+
let node_cfgs = create_node_cfgs(2, &chanmon_cfgs);
1175+
let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]);
1176+
let nodes = create_network(2, &node_cfgs, &node_chanmgrs);
1177+
let chan_id = create_announced_chan_between_nodes(&nodes, 0, 1).2;
1178+
1179+
let node_id_0 = nodes[0].node.get_our_node_id();
1180+
let node_id_1 = nodes[1].node.get_our_node_id();
1181+
1182+
// Route a payment and attempt to claim it.
1183+
let payment_amount = 1_000_000;
1184+
let (preimage, payment_hash, ..) = route_payment(&nodes[0], &[&nodes[1]], payment_amount);
1185+
nodes[1].node.claim_funds(preimage);
1186+
check_added_monitors(&nodes[1], 1);
1187+
1188+
// We'll disable signing counterparty commitments on the payment sender.
1189+
nodes[0].disable_channel_signer_op(&node_id_1, &chan_id, SignerOp::SignCounterpartyCommitment);
1190+
1191+
// After processing the `update_fulfill`, they'll only be able to send `revoke_and_ack` until
1192+
// the `commitment_signed` is no longer pending.
1193+
let update = get_htlc_update_msgs!(&nodes[1], node_id_0);
1194+
nodes[0].node.handle_update_fulfill_htlc(node_id_1, &update.update_fulfill_htlcs[0]);
1195+
nodes[0].node.handle_commitment_signed_batch_test(node_id_1, &update.commitment_signed);
1196+
check_added_monitors(&nodes[0], 1);
1197+
1198+
let revoke_and_ack = get_event_msg!(&nodes[0], MessageSendEvent::SendRevokeAndACK, node_id_1);
1199+
nodes[1].node.handle_revoke_and_ack(node_id_0, &revoke_and_ack);
1200+
check_added_monitors(&nodes[1], 1);
1201+
1202+
// The payment sender shouldn't disconnect the counterparty due to a missing `revoke_and_ack`
1203+
// because the `commitment_signed` isn't ready yet. The payment recipient may disconnect the
1204+
// sender because it doesn't have an async signer and it's expecting a timely
1205+
// `commitment_signed` response.
1206+
for _ in 0..DISCONNECT_PEER_AWAITING_RESPONSE_TICKS {
1207+
nodes[0].node.timer_tick_occurred();
1208+
nodes[1].node.timer_tick_occurred();
1209+
}
1210+
let has_disconnect_event = |event| {
1211+
matches!(
1212+
event, MessageSendEvent::HandleError { action , .. }
1213+
if matches!(action, msgs::ErrorAction::DisconnectPeerWithWarning { .. })
1214+
)
1215+
};
1216+
assert!(nodes[0].node.get_and_clear_pending_msg_events().is_empty());
1217+
assert!(nodes[1].node.get_and_clear_pending_msg_events().into_iter().any(has_disconnect_event));
1218+
1219+
expect_payment_sent(&nodes[0], preimage, None, false, false);
1220+
expect_payment_claimed!(nodes[1], payment_hash, payment_amount);
1221+
}

Diff for: lightning/src/ln/channel.rs

+31-13
Original file line numberDiff line numberDiff line change
@@ -3205,23 +3205,33 @@ impl<SP: Deref> ChannelContext<SP> where SP::Target: SignerProvider {
32053205
}
32063206
}
32073207

3208+
/// Checks whether the channel has any HTLC additions, HTLC removals, or fee updates that have
3209+
/// been sent by either side but not yet irrevocably committed on both commitments because we're
3210+
/// waiting on a pending monitor update or signer request.
3211+
pub fn is_monitor_or_signer_pending_channel_update(&self) -> bool {
3212+
self.channel_state.is_monitor_update_in_progress()
3213+
|| self.signer_pending_revoke_and_ack
3214+
|| self.signer_pending_commitment_update
3215+
}
3216+
32083217
/// Checks whether the channel has any HTLC additions, HTLC removals, or fee updates that have
32093218
/// been sent by either side but not yet irrevocably committed on both commitments. Holding cell
32103219
/// updates are not considered because they haven't been sent to the peer yet.
32113220
///
32123221
/// This can be used to satisfy quiescence's requirement when sending `stfu`:
32133222
/// - MUST NOT send `stfu` if any of the sender's htlc additions, htlc removals
32143223
/// or fee updates are pending for either peer.
3215-
fn has_pending_channel_update(&self) -> bool {
3224+
///
3225+
/// Note that it is still possible for an update to be pending that's not captured here due to a
3226+
/// pending monitor update or signer request. `is_monitor_or_signer_pending_channel_update`
3227+
/// should also be checked in such cases.
3228+
fn is_waiting_on_peer_pending_channel_update(&self) -> bool {
32163229
// An update from the local/remote node may be pending on the remote/local commitment since
32173230
// they are not tracked within our state, so we rely on whether any `commitment_signed` or
32183231
// `revoke_and_ack` messages are owed.
32193232
//
32203233
// We check these flags first as they are more likely to be set.
3221-
if self.channel_state.is_awaiting_remote_revoke() || self.expecting_peer_commitment_signed
3222-
|| self.monitor_pending_revoke_and_ack || self.signer_pending_revoke_and_ack
3223-
|| self.monitor_pending_commitment_signed || self.signer_pending_commitment_update
3224-
{
3234+
if self.channel_state.is_awaiting_remote_revoke() || self.expecting_peer_commitment_signed {
32253235
return true;
32263236
}
32273237

@@ -7417,8 +7427,11 @@ impl<SP: Deref> FundedChannel<SP> where
74177427
|| self.context.channel_state.is_local_stfu_sent()
74187428
// Cleared upon receiving a message that triggers the end of quiescence.
74197429
|| self.context.channel_state.is_quiescent()
7420-
// Cleared upon receiving `revoke_and_ack`.
7421-
|| self.context.has_pending_channel_update()
7430+
// Cleared upon receiving `revoke_and_ack`. Since we're not queiscent, as we just
7431+
// checked above, we intentionally don't disconnect our counterparty if we're waiting on
7432+
// a monitor update or signer request.
7433+
|| (self.context.is_waiting_on_peer_pending_channel_update()
7434+
&& !self.context.is_monitor_or_signer_pending_channel_update())
74227435
{
74237436
// This is the first tick we've seen after expecting to make forward progress.
74247437
self.context.sent_message_awaiting_response = Some(1);
@@ -9306,7 +9319,9 @@ impl<SP: Deref> FundedChannel<SP> where
93069319
);
93079320
debug_assert!(self.context.is_live());
93089321

9309-
if self.context.has_pending_channel_update() {
9322+
if self.context.is_waiting_on_peer_pending_channel_update()
9323+
|| self.context.is_monitor_or_signer_pending_channel_update()
9324+
{
93109325
return Err(ChannelError::Ignore(
93119326
"We cannot send `stfu` while state machine is pending".to_owned()
93129327
));
@@ -9367,10 +9382,11 @@ impl<SP: Deref> FundedChannel<SP> where
93679382
));
93689383
}
93699384

9370-
// We don't check `has_pending_channel_update` prior to setting the flag because it
9371-
// considers pending updates from either node. This means we may accept a counterparty
9372-
// `stfu` while they had pending updates, but that's fine as we won't send ours until
9373-
// _all_ pending updates complete, allowing the channel to become quiescent then.
9385+
// We don't check `is_waiting_on_peer_pending_channel_update` prior to setting the flag
9386+
// because it considers pending updates from either node. This means we may accept a
9387+
// counterparty `stfu` while they had pending updates, but that's fine as we won't send
9388+
// ours until _all_ pending updates complete, allowing the channel to become quiescent
9389+
// then.
93749390
self.context.channel_state.set_remote_stfu_sent();
93759391

93769392
let is_holder_initiator = if self.context.channel_state.is_awaiting_quiescence() {
@@ -9394,7 +9410,9 @@ impl<SP: Deref> FundedChannel<SP> where
93949410
// We were expecting to receive `stfu` because we already sent ours.
93959411
self.mark_response_received();
93969412

9397-
if self.context.has_pending_channel_update() {
9413+
if self.context.is_waiting_on_peer_pending_channel_update()
9414+
|| self.context.is_monitor_or_signer_pending_channel_update()
9415+
{
93989416
// Since we've already sent `stfu`, it should not be possible for one of our updates to
93999417
// be pending, so anything pending currently must be from a counterparty update.
94009418
return Err(ChannelError::WarnAndDisconnect(

Diff for: lightning/src/ln/quiescence_tests.rs

+15-28
Original file line numberDiff line numberDiff line change
@@ -180,10 +180,10 @@ fn allow_shutdown_while_awaiting_quiescence(local_shutdown: bool) {
180180
}
181181

182182
#[test]
183-
fn test_quiescence_tracks_monitor_update_in_progress_and_waits_for_async_signer() {
183+
fn test_quiescence_waits_for_async_signer_and_monitor_update() {
184184
// Test that quiescence:
185185
// a) considers an async signer when determining whether a pending channel update exists
186-
// b) tracks in-progress monitor updates until no longer quiescent
186+
// b) waits until pending monitor updates complete to send `stfu`/become quiescent
187187
let chanmon_cfgs = create_chanmon_cfgs(2);
188188
let node_cfgs = create_node_cfgs(2, &chanmon_cfgs);
189189
let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]);
@@ -244,29 +244,12 @@ fn test_quiescence_tracks_monitor_update_in_progress_and_waits_for_async_signer(
244244
let revoke_and_ack = find_msg!(msg_events, SendRevokeAndACK);
245245
let stfu = find_msg!(msg_events, SendStfu);
246246

247-
// While handling the last `revoke_and_ack` on nodes[0], we'll hold the monitor update and
248-
// become quiescent.
247+
// While handling the last `revoke_and_ack` on nodes[0], we'll hold the monitor update. We
248+
// cannot become quiescent until it completes.
249249
chanmon_cfgs[0].persister.set_update_ret(ChannelMonitorUpdateStatus::InProgress);
250250
nodes[0].node.handle_revoke_and_ack(node_id_1, &revoke_and_ack);
251251

252252
nodes[0].node.handle_stfu(node_id_1, &stfu);
253-
let stfu = get_event_msg!(&nodes[0], MessageSendEvent::SendStfu, node_id_1);
254-
nodes[1].node.handle_stfu(node_id_0, &stfu);
255-
256-
nodes[0].node.exit_quiescence(&node_id_1, &chan_id).unwrap();
257-
nodes[1].node.exit_quiescence(&node_id_0, &chan_id).unwrap();
258-
259-
// After exiting quiescence, we should be able to resume payments from nodes[0], but the monitor
260-
// update has yet to complete. Attempting to send a payment now will be delayed until the
261-
// monitor update completes.
262-
{
263-
let (route, payment_hash, _, payment_secret) =
264-
get_route_and_payment_hash!(&nodes[0], &nodes[1], payment_amount);
265-
let onion = RecipientOnionFields::secret_only(payment_secret);
266-
let payment_id = PaymentId(payment_hash.0);
267-
nodes[0].node.send_payment_with_route(route, payment_hash, onion, payment_id).unwrap();
268-
}
269-
check_added_monitors(&nodes[0], 0);
270253
assert!(nodes[0].node.get_and_clear_pending_msg_events().is_empty());
271254

272255
// We have two updates pending:
@@ -276,17 +259,21 @@ fn test_quiescence_tracks_monitor_update_in_progress_and_waits_for_async_signer(
276259
chain_monitor.latest_monitor_update_id.lock().unwrap().get(&chan_id).unwrap().clone();
277260
let chain_monitor = &nodes[0].chain_monitor.chain_monitor;
278261
// One for the latest commitment transaction update from the last `revoke_and_ack`
279-
chain_monitor.channel_monitor_updated(chan_id, latest_update - 1).unwrap();
262+
chain_monitor.channel_monitor_updated(chan_id, latest_update).unwrap();
280263
expect_payment_sent(&nodes[0], preimage, None, true, true);
281264
// One for the commitment secret update from the last `revoke_and_ack`
282-
chain_monitor.channel_monitor_updated(chan_id, latest_update).unwrap();
265+
chain_monitor.channel_monitor_updated(chan_id, latest_update + 1).unwrap();
283266
}
284267

285-
// With the pending monitor updates complete, we'll see a new monitor update go out when freeing
286-
// the holding cells to send out the new HTLC.
287-
nodes[0].chain_monitor.complete_sole_pending_chan_update(&chan_id);
288-
let _ = get_htlc_update_msgs!(&nodes[0], node_id_1);
289-
check_added_monitors(&nodes[0], 1);
268+
// With the updates completed, we can now become quiescent.
269+
let stfu = get_event_msg!(&nodes[0], MessageSendEvent::SendStfu, node_id_1);
270+
nodes[1].node.handle_stfu(node_id_0, &stfu);
271+
272+
nodes[0].node.exit_quiescence(&node_id_1, &chan_id).unwrap();
273+
nodes[1].node.exit_quiescence(&node_id_0, &chan_id).unwrap();
274+
275+
// After exiting quiescence, we should be able to resume payments from nodes[0].
276+
send_payment(&nodes[0], &[&nodes[1]], payment_amount);
290277
}
291278

292279
#[test]

0 commit comments

Comments
 (0)