Commit caa2a9a

Panic if we're running with outdated state instead of force-closing
When we receive a `channel_reestablish` with a `data_loss_protect` that proves we're running with a stale state, instead of force-closing the channel, we immediately panic. This lines up with our refusal to run if we find a `ChannelMonitor` which is stale compared to our `ChannelManager` during `ChannelManager` deserialization. Ultimately both are an indication of the same thing - that the API requirements on `chain::Watch` were violated.

In the "running with outdated state but ChannelMonitor(s) and ChannelManager lined up" case specifically, it's likely we're running off of an old backup, in which case connecting to peers with channels still live is explicitly dangerous. That said, because this could be an operator error that is correctable, panicking instead of force-closing may allow for normal operation again in the future (cc #1207).

In any case, we provide instructions in the panic message for how to force-close channels prior to peer connection, as well as a note on how to broadcast the latest state if users are willing to take the risk.

Note that this is still somewhat unsafe until we resolve #1563.
1 parent 5ed3f25 commit caa2a9a
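For operators who hit the new panic after restoring from an old backup, the recovery flow referenced in the panic message looks roughly like the sketch below. It assumes `channel_manager` is your freshly deserialized `ChannelManager` and that no peer connections have been made yet; the loop over `list_channels()` and the `counterparty.node_id` field are illustrative, while the two force-close methods are the ones named in the panic message and exercised by the new test.

// Recovery sketch (not part of this commit): run after deserializing the
// ChannelManager from the old backup and BEFORE reconnecting to any peers.

// Either close every channel at once, without broadcasting the stale holder
// commitment transactions:
channel_manager.force_close_all_channels_without_broadcasting_txn();

// ...or close individual channels, using the channel id / counterparty that
// the panic message and log line print:
for details in channel_manager.list_channels() {
	channel_manager
		.force_close_without_broadcasting_txn(&details.channel_id, &details.counterparty.node_id)
		.expect("force-closing a listed channel should not fail");
}

// Only once the stale channels are closed should the node reconnect to its
// channel peers.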

3 files changed: +65 −60 lines

lightning/src/ln/channel.rs (+20 −6)
@@ -802,7 +802,6 @@ pub(super) enum ChannelError {
 	Ignore(String),
 	Warn(String),
 	Close(String),
-	CloseDelayBroadcast(String),
 }

 impl fmt::Debug for ChannelError {
@@ -811,7 +810,6 @@ impl fmt::Debug for ChannelError {
 			&ChannelError::Ignore(ref e) => write!(f, "Ignore : {}", e),
 			&ChannelError::Warn(ref e) => write!(f, "Warn : {}", e),
 			&ChannelError::Close(ref e) => write!(f, "Close : {}", e),
-			&ChannelError::CloseDelayBroadcast(ref e) => write!(f, "CloseDelayBroadcast : {}", e)
 		}
 	}
 }
@@ -3799,6 +3797,11 @@ impl<Signer: Sign> Channel<Signer> {

 	/// May panic if some calls other than message-handling calls (which will all Err immediately)
 	/// have been called between remove_uncommitted_htlcs_and_mark_paused and this call.
+	///
+	/// Some links printed in log lines are included here to check them during build (when run with
+	/// `cargo doc --document-private-items`):
+	/// [`super::channelmanager::ChannelManager::force_close_without_broadcasting_txn`] and
+	/// [`super::channelmanager::ChannelManager::force_close_all_channels_without_broadcasting_txn`].
 	pub fn channel_reestablish<L: Deref>(&mut self, msg: &msgs::ChannelReestablish, logger: &L,
 		node_pk: PublicKey, genesis_block_hash: BlockHash, best_block: &BestBlock)
 	-> Result<ReestablishResponses, ChannelError> where L::Target: Logger {
@@ -3824,9 +3827,20 @@ impl<Signer: Sign> Channel<Signer> {
 					return Err(ChannelError::Close("Peer sent a garbage channel_reestablish with secret key not matching the commitment height provided".to_owned()));
 				}
 				if msg.next_remote_commitment_number > INITIAL_COMMITMENT_NUMBER - self.cur_holder_commitment_transaction_number {
-					return Err(ChannelError::CloseDelayBroadcast(
-						"We have fallen behind - we have received proof that if we broadcast remote is going to claim our funds - we can't do any automated broadcasting".to_owned()
-					));
+					macro_rules! log_and_panic {
+						($err_msg: expr) => {
+							log_error!(logger, $err_msg, log_bytes!(self.channel_id), log_pubkey!(self.counterparty_node_id));
+							panic!($err_msg, log_bytes!(self.channel_id), log_pubkey!(self.counterparty_node_id));
+						}
+					}
+					log_and_panic!("We have fallen behind - we have received proof that if we broadcast our counterparty is going to claim all our funds.\n\
+						This implies you have restarted with lost ChannelMonitor and ChannelManager state, the first of which is a violation of the LDK chain::Watch requirements.\n\
+						More specifically, this means you have a bug in your implementation that can cause loss of funds, or you are running with an old backup, which is unsafe.\n\
+						If you have restored from an old backup and wish to force-close channels and return to operation, you should start up, call\n\
+						ChannelManager::force_close_without_broadcasting_txn on channel {} with counterparty {} or\n\
+						ChannelManager::force_close_all_channels_without_broadcasting_txn, then reconnect to peer(s).\n\
+						Note that due to a long-standing bug in lnd you may have to reach out to peers running lnd-based nodes to ask them to manually force-close channels\n\
+						See https://github.com/lightningdevkit/rust-lightning/issues/1565 for more info.");
 				}
 			},
 			OptionalField::Absent => {}
@@ -3933,7 +3947,7 @@ impl<Signer: Sign> Channel<Signer> {
 			// now!
 			match self.free_holding_cell_htlcs(logger) {
 				Err(ChannelError::Close(msg)) => Err(ChannelError::Close(msg)),
-				Err(ChannelError::Warn(_)) | Err(ChannelError::Ignore(_)) | Err(ChannelError::CloseDelayBroadcast(_)) =>
+				Err(ChannelError::Warn(_)) | Err(ChannelError::Ignore(_)) =>
					panic!("Got non-channel-failing result from free_holding_cell_htlcs"),
 				Ok((Some((commitment_update, monitor_update)), holding_cell_failed_htlcs)) => {
 					Ok(ReestablishResponses {

lightning/src/ln/channelmanager.rs (−17)
@@ -369,15 +369,6 @@ impl MsgHandleErrInternal {
 					},
 				},
 			},
-			ChannelError::CloseDelayBroadcast(msg) => LightningError {
-				err: msg.clone(),
-				action: msgs::ErrorAction::SendErrorMessage {
-					msg: msgs::ErrorMessage {
-						channel_id,
-						data: msg
-					},
-				},
-			},
 		},
 		chan_id: None,
 		shutdown_finish: None,
@@ -1273,13 +1264,6 @@ macro_rules! convert_chan_err {
 			(true, MsgHandleErrInternal::from_finish_shutdown(msg, *$channel_id, $channel.get_user_id(),
 				shutdown_res, $self.get_channel_update_for_broadcast(&$channel).ok()))
 		},
-		ChannelError::CloseDelayBroadcast(msg) => {
-			log_error!($self.logger, "Channel {} need to be shutdown but closing transactions not broadcast due to {}", log_bytes!($channel_id[..]), msg);
-			update_maps_on_chan_removal!($self, $short_to_id, $channel);
-			let shutdown_res = $channel.force_shutdown(false);
-			(true, MsgHandleErrInternal::from_finish_shutdown(msg, *$channel_id, $channel.get_user_id(),
-				shutdown_res, $self.get_channel_update_for_broadcast(&$channel).ok()))
-		}
 	}
 }
 }
@@ -3213,7 +3197,6 @@ impl<Signer: Sign, M: Deref, T: Deref, K: Deref, F: Deref, L: Deref> ChannelMana
 					// ChannelClosed event is generated by handle_error for us.
 					Err(MsgHandleErrInternal::from_finish_shutdown(msg, channel.channel_id(), channel.get_user_id(), channel.force_shutdown(true), self.get_channel_update_for_broadcast(&channel).ok()))
 				},
-				ChannelError::CloseDelayBroadcast(_) => { panic!("Wait is only generated on receipt of channel_reestablish, which is handled by try_chan_entry, we don't bother to support it here"); }
 			};
 			handle_errors.push((counterparty_node_id, err));
 			continue;

lightning/src/ln/functional_tests.rs (+45 −37)
@@ -7399,14 +7399,11 @@ fn test_user_configurable_csv_delay() {
 	} else { assert!(false); }
 }

-#[test]
-fn test_data_loss_protect() {
-	// We want to be sure that :
-	// * we don't broadcast our Local Commitment Tx in case of fallen behind
-	// (but this is not quite true - we broadcast during Drop because chanmon is out of sync with chanmgr)
-	// * we close channel in case of detecting other being fallen behind
-	// * we are able to claim our own outputs thanks to to_remote being static
-	// TODO: this test is incomplete and the data_loss_protect implementation is incomplete - see issue #775
+fn do_test_data_loss_protect(reconnect_panicing: bool) {
+	// When we get a data_loss_protect proving we're behind, we immediately panic as the
+	// chain::Watch API requirements have been violated (e.g. the user restored from a backup). The
+	// panic message informs the user they should force-close without broadcasting, which is tested
+	// if `reconnect_panicing` is not set.
 	let persister;
 	let logger;
 	let fee_estimator;
@@ -7464,53 +7461,53 @@

 	check_added_monitors!(nodes[0], 1);

-	nodes[0].node.peer_connected(&nodes[1].node.get_our_node_id(), &msgs::Init { features: InitFeatures::empty(), remote_network_address: None });
-	nodes[1].node.peer_connected(&nodes[0].node.get_our_node_id(), &msgs::Init { features: InitFeatures::empty(), remote_network_address: None });
+	if reconnect_panicing {
+		nodes[0].node.peer_connected(&nodes[1].node.get_our_node_id(), &msgs::Init { features: InitFeatures::empty(), remote_network_address: None });
+		nodes[1].node.peer_connected(&nodes[0].node.get_our_node_id(), &msgs::Init { features: InitFeatures::empty(), remote_network_address: None });

-	let reestablish_0 = get_chan_reestablish_msgs!(nodes[1], nodes[0]);
+		let reestablish_1 = get_chan_reestablish_msgs!(nodes[0], nodes[1]);

-	// Check we don't broadcast any transactions following learning of per_commitment_point from B
-	nodes[0].node.handle_channel_reestablish(&nodes[1].node.get_our_node_id(), &reestablish_0[0]);
-	check_added_monitors!(nodes[0], 1);
+		// Check we close channel detecting A is fallen-behind
+		// Check that we sent the warning message when we detected that A has fallen behind,
+		// and give the possibility for A to recover from the warning.
+		nodes[1].node.handle_channel_reestablish(&nodes[0].node.get_our_node_id(), &reestablish_1[0]);
+		let warn_msg = "Peer attempted to reestablish channel with a very old local commitment transaction".to_owned();
+		assert!(check_warn_msg!(nodes[1], nodes[0].node.get_our_node_id(), chan.2).contains(&warn_msg));
+
+		{
+			let mut node_txn = nodes[1].tx_broadcaster.txn_broadcasted.lock().unwrap().clone();
+			// The node B should not broadcast the transaction to force close the channel!
+			assert!(node_txn.is_empty());
+		}
+
+		let reestablish_0 = get_chan_reestablish_msgs!(nodes[1], nodes[0]);
+		// Check A panics upon seeing proof it has fallen behind.
+		nodes[0].node.handle_channel_reestablish(&nodes[1].node.get_our_node_id(), &reestablish_0[0]);
+		return; // By this point we should have panic'ed!
+	}

+	nodes[0].node.force_close_without_broadcasting_txn(&chan.2, &nodes[1].node.get_our_node_id()).unwrap();
+	check_added_monitors!(nodes[0], 1);
+	check_closed_event!(nodes[0], 1, ClosureReason::HolderForceClosed);
 	{
-		let node_txn = nodes[0].tx_broadcaster.txn_broadcasted.lock().unwrap().clone();
+		let node_txn = nodes[0].tx_broadcaster.txn_broadcasted.lock().unwrap();
 		assert_eq!(node_txn.len(), 0);
 	}

-	let mut reestablish_1 = Vec::with_capacity(1);
 	for msg in nodes[0].node.get_and_clear_pending_msg_events() {
-		if let MessageSendEvent::SendChannelReestablish { ref node_id, ref msg } = msg {
-			assert_eq!(*node_id, nodes[1].node.get_our_node_id());
-			reestablish_1.push(msg.clone());
-		} else if let MessageSendEvent::BroadcastChannelUpdate { .. } = msg {
+		if let MessageSendEvent::BroadcastChannelUpdate { .. } = msg {
 		} else if let MessageSendEvent::HandleError { ref action, .. } = msg {
 			match action {
 				&ErrorAction::SendErrorMessage { ref msg } => {
-					assert_eq!(msg.data, "We have fallen behind - we have received proof that if we broadcast remote is going to claim our funds - we can't do any automated broadcasting");
+					assert_eq!(msg.data, "Channel force-closed");
 				},
 				_ => panic!("Unexpected event!"),
 			}
 		} else {
-			panic!("Unexpected event")
+			panic!("Unexpected event {:?}", msg)
 		}
 	}

-	// Check we close channel detecting A is fallen-behind
-	// Check that we sent the warning message when we detected that A has fallen behind,
-	// and give the possibility for A to recover from the warning.
-	nodes[1].node.handle_channel_reestablish(&nodes[0].node.get_our_node_id(), &reestablish_1[0]);
-	let warn_msg = "Peer attempted to reestablish channel with a very old local commitment transaction".to_owned();
-	assert!(check_warn_msg!(nodes[1], nodes[0].node.get_our_node_id(), chan.2).contains(&warn_msg));
-
-	// Check A is able to claim to_remote output
-	let mut node_txn = nodes[1].tx_broadcaster.txn_broadcasted.lock().unwrap().clone();
-	// The node B should not broadcast the transaction to force close the channel!
-	assert!(node_txn.is_empty());
-	// B should now detect that there is something wrong and should force close the channel.
-	let exp_err = "We have fallen behind - we have received proof that if we broadcast remote is going to claim our funds - we can\'t do any automated broadcasting";
-	check_closed_event!(nodes[0], 1, ClosureReason::ProcessingError { err: exp_err.to_string() });
-
 	// after the warning message sent by B, we should not able to
 	// use the channel, or reconnect with success to the channel.
 	assert!(nodes[0].node.list_usable_channels().is_empty());
@@ -7541,6 +7538,17 @@
 	check_closed_broadcast!(nodes[1], false);
 }

+#[test]
+#[should_panic]
+fn test_data_loss_protect_showing_stale_state_panics() {
+	do_test_data_loss_protect(true);
+}
+
+#[test]
+fn test_force_close_without_broadcast() {
+	do_test_data_loss_protect(false);
+}
+
 #[test]
 fn test_check_htlc_underpaying() {
 	// Send payment through A -> B but A is maliciously
