Commit caa2a9a

Panic if we're running with outdated state instead of force-closing
When we receive a `channel_reestablish` with a `data_loss_protect` that proves we're running with a stale state, instead of force-closing the channel, we immediately panic. This lines up with our refusal to run if we find a `ChannelMonitor` which is stale compared to our `ChannelManager` during `ChannelManager` deserialization. Ultimately both are an indication of the same thing - that the API requirements on `chain::Watch` were violated.

In the "running with outdated state but ChannelMonitor(s) and ChannelManager lined up" case specifically, it's likely we're running off of an old backup, in which case connecting to peers with channels still live is explicitly dangerous. That said, because this could be an operator error that is correctable, panicking instead of force-closing may allow for normal operation again in the future (cc #1207).

In any case, we provide instructions in the panic message for how to force-close channels prior to peer connection, as well as a note on how to broadcast the latest state if users are willing to take the risk.

Note that this is still somewhat unsafe until we resolve #1563.
1 parent 5ed3f25 commit caa2a9a
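For operators who hit the new panic after restoring from an old backup, the recovery flow referenced in the panic message looks roughly like the sketch below. It assumes `channel_manager` is your freshly deserialized `ChannelManager` and that no peer connections have been made yet; the loop over `list_channels()` and the `counterparty.node_id` field are illustrative, while the two force-close methods are the ones named in the panic message and exercised by the new test.

// Recovery sketch (not part of this commit): run after deserializing the
// ChannelManager from the old backup and BEFORE reconnecting to any peers.

// Either close every channel at once, without broadcasting the stale holder
// commitment transactions:
channel_manager.force_close_all_channels_without_broadcasting_txn();

// ...or close individual channels, using the channel id / counterparty that
// the panic message and log line print:
for details in channel_manager.list_channels() {
	channel_manager
		.force_close_without_broadcasting_txn(&details.channel_id, &details.counterparty.node_id)
		.expect("force-closing a listed channel should not fail");
}

// Only once the stale channels are closed should the node reconnect to its
// channel peers.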

3 files changed: +65 −60 lines

lightning/src/ln/channel.rs (+20 −6)
@@ -802,7 +802,6 @@ pub(super) enum ChannelError {
 	Ignore(String),
 	Warn(String),
 	Close(String),
-	CloseDelayBroadcast(String),
 }

 impl fmt::Debug for ChannelError {
@@ -811,7 +810,6 @@ impl fmt::Debug for ChannelError {
 			&ChannelError::Ignore(ref e) => write!(f, "Ignore : {}", e),
 			&ChannelError::Warn(ref e) => write!(f, "Warn : {}", e),
 			&ChannelError::Close(ref e) => write!(f, "Close : {}", e),
-			&ChannelError::CloseDelayBroadcast(ref e) => write!(f, "CloseDelayBroadcast : {}", e)
 		}
 	}
 }
@@ -3799,6 +3797,11 @@ impl<Signer: Sign> Channel<Signer> {

 	/// May panic if some calls other than message-handling calls (which will all Err immediately)
 	/// have been called between remove_uncommitted_htlcs_and_mark_paused and this call.
+	///
+	/// Some links printed in log lines are included here to check them during build (when run with
+	/// `cargo doc --document-private-items`):
+	/// [`super::channelmanager::ChannelManager::force_close_without_broadcasting_txn`] and
+	/// [`super::channelmanager::ChannelManager::force_close_all_channels_without_broadcasting_txn`].
 	pub fn channel_reestablish<L: Deref>(&mut self, msg: &msgs::ChannelReestablish, logger: &L,
 		node_pk: PublicKey, genesis_block_hash: BlockHash, best_block: &BestBlock)
 	-> Result<ReestablishResponses, ChannelError> where L::Target: Logger {
@@ -3824,9 +3827,20 @@ impl<Signer: Sign> Channel<Signer> {
 					return Err(ChannelError::Close("Peer sent a garbage channel_reestablish with secret key not matching the commitment height provided".to_owned()));
 				}
 				if msg.next_remote_commitment_number > INITIAL_COMMITMENT_NUMBER - self.cur_holder_commitment_transaction_number {
-					return Err(ChannelError::CloseDelayBroadcast(
-						"We have fallen behind - we have received proof that if we broadcast remote is going to claim our funds - we can't do any automated broadcasting".to_owned()
-					));
+					macro_rules! log_and_panic {
+						($err_msg: expr) => {
+							log_error!(logger, $err_msg, log_bytes!(self.channel_id), log_pubkey!(self.counterparty_node_id));
+							panic!($err_msg, log_bytes!(self.channel_id), log_pubkey!(self.counterparty_node_id));
+						}
+					}
+					log_and_panic!("We have fallen behind - we have received proof that if we broadcast our counterparty is going to claim all our funds.\n\
+						This implies you have restarted with lost ChannelMonitor and ChannelManager state, the first of which is a violation of the LDK chain::Watch requirements.\n\
+						More specifically, this means you have a bug in your implementation that can cause loss of funds, or you are running with an old backup, which is unsafe.\n\
+						If you have restored from an old backup and wish to force-close channels and return to operation, you should start up, call\n\
+						ChannelManager::force_close_without_broadcasting_txn on channel {} with counterparty {} or\n\
+						ChannelManager::force_close_all_channels_without_broadcasting_txn, then reconnect to peer(s).\n\
+						Note that due to a long-standing bug in lnd you may have to reach out to peers running lnd-based nodes to ask them to manually force-close channels\n\
+						See https://github.com/lightningdevkit/rust-lightning/issues/1565 for more info.");
 				}
 			},
 			OptionalField::Absent => {}
@@ -3933,7 +3947,7 @@ impl<Signer: Sign> Channel<Signer> {
 			// now!
 			match self.free_holding_cell_htlcs(logger) {
 				Err(ChannelError::Close(msg)) => Err(ChannelError::Close(msg)),
-				Err(ChannelError::Warn(_)) | Err(ChannelError::Ignore(_)) | Err(ChannelError::CloseDelayBroadcast(_)) =>
+				Err(ChannelError::Warn(_)) | Err(ChannelError::Ignore(_)) =>
					panic!("Got non-channel-failing result from free_holding_cell_htlcs"),
 				Ok((Some((commitment_update, monitor_update)), holding_cell_failed_htlcs)) => {
 					Ok(ReestablishResponses {

lightning/src/ln/channelmanager.rs (−17)
@@ -369,15 +369,6 @@ impl MsgHandleErrInternal {
 					},
 				},
 			},
-			ChannelError::CloseDelayBroadcast(msg) => LightningError {
-				err: msg.clone(),
-				action: msgs::ErrorAction::SendErrorMessage {
-					msg: msgs::ErrorMessage {
-						channel_id,
-						data: msg
-					},
-				},
-			},
 		},
 		chan_id: None,
 		shutdown_finish: None,
@@ -1273,13 +1264,6 @@ macro_rules! convert_chan_err {
 			(true, MsgHandleErrInternal::from_finish_shutdown(msg, *$channel_id, $channel.get_user_id(),
 				shutdown_res, $self.get_channel_update_for_broadcast(&$channel).ok()))
 		},
-		ChannelError::CloseDelayBroadcast(msg) => {
-			log_error!($self.logger, "Channel {} need to be shutdown but closing transactions not broadcast due to {}", log_bytes!($channel_id[..]), msg);
-			update_maps_on_chan_removal!($self, $short_to_id, $channel);
-			let shutdown_res = $channel.force_shutdown(false);
-			(true, MsgHandleErrInternal::from_finish_shutdown(msg, *$channel_id, $channel.get_user_id(),
-				shutdown_res, $self.get_channel_update_for_broadcast(&$channel).ok()))
-		}
 	}
 }
 }
@@ -3213,7 +3197,6 @@ impl<Signer: Sign, M: Deref, T: Deref, K: Deref, F: Deref, L: Deref> ChannelMana
 					// ChannelClosed event is generated by handle_error for us.
 					Err(MsgHandleErrInternal::from_finish_shutdown(msg, channel.channel_id(), channel.get_user_id(), channel.force_shutdown(true), self.get_channel_update_for_broadcast(&channel).ok()))
 				},
-				ChannelError::CloseDelayBroadcast(_) => { panic!("Wait is only generated on receipt of channel_reestablish, which is handled by try_chan_entry, we don't bother to support it here"); }
 			};
 			handle_errors.push((counterparty_node_id, err));
 			continue;

lightning/src/ln/functional_tests.rs (+45 −37)
@@ -7399,14 +7399,11 @@ fn test_user_configurable_csv_delay() {
 	} else { assert!(false); }
 }

-#[test]
-fn test_data_loss_protect() {
-	// We want to be sure that :
-	// * we don't broadcast our Local Commitment Tx in case of fallen behind
-	// (but this is not quite true - we broadcast during Drop because chanmon is out of sync with chanmgr)
-	// * we close channel in case of detecting other being fallen behind
-	// * we are able to claim our own outputs thanks to to_remote being static
-	// TODO: this test is incomplete and the data_loss_protect implementation is incomplete - see issue #775
+fn do_test_data_loss_protect(reconnect_panicing: bool) {
+	// When we get a data_loss_protect proving we're behind, we immediately panic as the
+	// chain::Watch API requirements have been violated (e.g. the user restored from a backup). The
+	// panic message informs the user they should force-close without broadcasting, which is tested
+	// if `reconnect_panicing` is not set.
 	let persister;
 	let logger;
 	let fee_estimator;
@@ -7464,53 +7461,53 @@

 	check_added_monitors!(nodes[0], 1);

-	nodes[0].node.peer_connected(&nodes[1].node.get_our_node_id(), &msgs::Init { features: InitFeatures::empty(), remote_network_address: None });
-	nodes[1].node.peer_connected(&nodes[0].node.get_our_node_id(), &msgs::Init { features: InitFeatures::empty(), remote_network_address: None });
+	if reconnect_panicing {
+		nodes[0].node.peer_connected(&nodes[1].node.get_our_node_id(), &msgs::Init { features: InitFeatures::empty(), remote_network_address: None });
+		nodes[1].node.peer_connected(&nodes[0].node.get_our_node_id(), &msgs::Init { features: InitFeatures::empty(), remote_network_address: None });

-	let reestablish_0 = get_chan_reestablish_msgs!(nodes[1], nodes[0]);
+		let reestablish_1 = get_chan_reestablish_msgs!(nodes[0], nodes[1]);

-	// Check we don't broadcast any transactions following learning of per_commitment_point from B
-	nodes[0].node.handle_channel_reestablish(&nodes[1].node.get_our_node_id(), &reestablish_0[0]);
-	check_added_monitors!(nodes[0], 1);
+		// Check we close channel detecting A is fallen-behind
+		// Check that we sent the warning message when we detected that A has fallen behind,
+		// and give the possibility for A to recover from the warning.
+		nodes[1].node.handle_channel_reestablish(&nodes[0].node.get_our_node_id(), &reestablish_1[0]);
+		let warn_msg = "Peer attempted to reestablish channel with a very old local commitment transaction".to_owned();
+		assert!(check_warn_msg!(nodes[1], nodes[0].node.get_our_node_id(), chan.2).contains(&warn_msg));
+
+		{
+			let mut node_txn = nodes[1].tx_broadcaster.txn_broadcasted.lock().unwrap().clone();
+			// The node B should not broadcast the transaction to force close the channel!
+			assert!(node_txn.is_empty());
+		}
+
+		let reestablish_0 = get_chan_reestablish_msgs!(nodes[1], nodes[0]);
+		// Check A panics upon seeing proof it has fallen behind.
+		nodes[0].node.handle_channel_reestablish(&nodes[1].node.get_our_node_id(), &reestablish_0[0]);
+		return; // By this point we should have panic'ed!
+	}

+	nodes[0].node.force_close_without_broadcasting_txn(&chan.2, &nodes[1].node.get_our_node_id()).unwrap();
+	check_added_monitors!(nodes[0], 1);
+	check_closed_event!(nodes[0], 1, ClosureReason::HolderForceClosed);
 	{
-		let node_txn = nodes[0].tx_broadcaster.txn_broadcasted.lock().unwrap().clone();
+		let node_txn = nodes[0].tx_broadcaster.txn_broadcasted.lock().unwrap();
 		assert_eq!(node_txn.len(), 0);
 	}

-	let mut reestablish_1 = Vec::with_capacity(1);
 	for msg in nodes[0].node.get_and_clear_pending_msg_events() {
-		if let MessageSendEvent::SendChannelReestablish { ref node_id, ref msg } = msg {
-			assert_eq!(*node_id, nodes[1].node.get_our_node_id());
-			reestablish_1.push(msg.clone());
-		} else if let MessageSendEvent::BroadcastChannelUpdate { .. } = msg {
+		if let MessageSendEvent::BroadcastChannelUpdate { .. } = msg {
 		} else if let MessageSendEvent::HandleError { ref action, .. } = msg {
 			match action {
 				&ErrorAction::SendErrorMessage { ref msg } => {
-					assert_eq!(msg.data, "We have fallen behind - we have received proof that if we broadcast remote is going to claim our funds - we can't do any automated broadcasting");
+					assert_eq!(msg.data, "Channel force-closed");
 				},
 				_ => panic!("Unexpected event!"),
 			}
 		} else {
-			panic!("Unexpected event")
+			panic!("Unexpected event {:?}", msg)
 		}
 	}

-	// Check we close channel detecting A is fallen-behind
-	// Check that we sent the warning message when we detected that A has fallen behind,
-	// and give the possibility for A to recover from the warning.
-	nodes[1].node.handle_channel_reestablish(&nodes[0].node.get_our_node_id(), &reestablish_1[0]);
-	let warn_msg = "Peer attempted to reestablish channel with a very old local commitment transaction".to_owned();
-	assert!(check_warn_msg!(nodes[1], nodes[0].node.get_our_node_id(), chan.2).contains(&warn_msg));
-
-	// Check A is able to claim to_remote output
-	let mut node_txn = nodes[1].tx_broadcaster.txn_broadcasted.lock().unwrap().clone();
-	// The node B should not broadcast the transaction to force close the channel!
-	assert!(node_txn.is_empty());
-	// B should now detect that there is something wrong and should force close the channel.
-	let exp_err = "We have fallen behind - we have received proof that if we broadcast remote is going to claim our funds - we can\'t do any automated broadcasting";
-	check_closed_event!(nodes[0], 1, ClosureReason::ProcessingError { err: exp_err.to_string() });
-
 	// after the warning message sent by B, we should not able to
 	// use the channel, or reconnect with success to the channel.
 	assert!(nodes[0].node.list_usable_channels().is_empty());
@@ -7541,6 +7538,17 @@
 	check_closed_broadcast!(nodes[1], false);
 }

+#[test]
+#[should_panic]
+fn test_data_loss_protect_showing_stale_state_panics() {
+	do_test_data_loss_protect(true);
+}
+
+#[test]
+fn test_force_close_without_broadcast() {
+	do_test_data_loss_protect(false);
+}
+
 #[test]
 fn test_check_htlc_underpaying() {
 	// Send payment through A -> B but A is maliciously
