From be42db8007c88482ad68cd4400504cdd180c252e Mon Sep 17 00:00:00 2001 From: Till Rohrmann Date: Thu, 27 Feb 2025 23:51:23 +0100 Subject: [PATCH] Add remove node command and DisableNodeChecker This commit introduces the remove node command which uses the DisableNodeChecker to verify that it is safe to disable/remove this node from the cluster. --- crates/core/protobuf/node_ctl_svc.proto | 2 +- .../src/commands/node/disable_node_checker.rs | 130 ++++++++++++++++++ tools/restatectl/src/commands/node/mod.rs | 6 + .../src/commands/node/remove_nodes.rs | 100 ++++++++++++++ tools/restatectl/src/connection.rs | 2 +- 5 files changed, 238 insertions(+), 2 deletions(-) create mode 100644 tools/restatectl/src/commands/node/disable_node_checker.rs create mode 100644 tools/restatectl/src/commands/node/remove_nodes.rs diff --git a/crates/core/protobuf/node_ctl_svc.proto b/crates/core/protobuf/node_ctl_svc.proto index 462bcb580..914c29865 100644 --- a/crates/core/protobuf/node_ctl_svc.proto +++ b/crates/core/protobuf/node_ctl_svc.proto @@ -91,4 +91,4 @@ message ClusterHealthResponse { message EmbeddedMetadataClusterHealth { repeated restate.common.NodeId members = 1; -} +} \ No newline at end of file diff --git a/tools/restatectl/src/commands/node/disable_node_checker.rs b/tools/restatectl/src/commands/node/disable_node_checker.rs new file mode 100644 index 000000000..c3df57622 --- /dev/null +++ b/tools/restatectl/src/commands/node/disable_node_checker.rs @@ -0,0 +1,130 @@ +// Copyright (c) 2023 - 2025 Restate Software, Inc., Restate GmbH. +// All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use restate_types::PlainNodeId; +use restate_types::logs::LogletId; +use restate_types::logs::metadata::{Logs, ProviderKind}; +use restate_types::nodes_config::{ + MetadataServerState, NodesConfigError, NodesConfiguration, Role, StorageState, +}; + +#[derive(Debug, thiserror::Error)] +pub enum DisableNodeError { + #[error("log server is part of a node set of loglet {0}")] + NodeSetMember(LogletId), + #[error("log server cannot be disabled because it is in read-write state")] + ReadWrite, + #[error("The current default loglet provider '{0}', does not support disabling nodes")] + DefaultLogletProvider(ProviderKind), + #[error("metadata server is an active member")] + MetadataMember, +} + +pub struct DisableNodeChecker { + nodes_configuration: NodesConfiguration, + logs: Logs, +} + +impl DisableNodeChecker { + pub fn new(nodes_configuration: NodesConfiguration, logs: Logs) -> Self { + DisableNodeChecker { + nodes_configuration, + logs, + } + } + + pub fn nodes_configuration(&self) -> &NodesConfiguration { + &self.nodes_configuration + } + + pub fn safe_to_disable_node(&self, node_id: PlainNodeId) -> Result<(), DisableNodeError> { + let node_config = match self.nodes_configuration.find_node_by_id(node_id) { + Ok(node_config) => node_config, + // unknown or deleted nodes can be safely disabled + Err(NodesConfigError::UnknownNodeId(_)) | Err(NodesConfigError::Deleted(_)) => { + return Ok(()); + } + Err(NodesConfigError::GenerationMismatch { .. }) + | Err(NodesConfigError::InvalidUri(_)) => { + unreachable!("impossible nodes config errors") + } + }; + + self.safe_to_disable_log_server(node_id, node_config.log_server_config.storage_state)?; + + // only safe to disable node if it does not run a metadata server or is not a member + // todo atm we must consider the role because the default metadata server state is MetadataServerState::Member. + // We need to introduce a provisioning state or make the metadata server state optional in the NodeConfig. + if node_config.roles.contains(Role::MetadataServer) + && node_config.metadata_server_config.metadata_server_state + == MetadataServerState::Member + { + return Err(DisableNodeError::MetadataMember); + } + + Ok(()) + } + + /// Checks whether it is safe to disable the given log server identified by the node_id. It is + /// safe to disable the log server if it can no longer be added to new node sets (== not being + /// a candidate) and it is no longer part of any known node sets. + fn safe_to_disable_log_server( + &self, + node_id: PlainNodeId, + storage_state: StorageState, + ) -> Result<(), DisableNodeError> { + match storage_state { + StorageState::ReadWrite => { + return Err(DisableNodeError::ReadWrite); + } + // it's safe to disable a disabled node + StorageState::Disabled => return Ok(()), + // we need to check whether this node is no longer part of any known node sets + StorageState::ReadOnly | StorageState::DataLoss | StorageState::Provisioning => {} + } + + // if the default provider kind is local or in-memory than it is not safe to disable the + // given node because it is included in the implicit node set + if matches!( + self.logs.configuration().default_provider.kind(), + ProviderKind::Local | ProviderKind::InMemory + ) { + return Err(DisableNodeError::DefaultLogletProvider( + self.logs.configuration().default_provider.kind(), + )); + } + + // check for every log that the given node is not contained in any node set + for (log_id, chain) in self.logs.iter() { + for segment in chain.iter() { + if match segment.config.kind { + // we assume that the given node runs the local and memory loglet and, therefore, + // is part of the implicit node set + ProviderKind::Local => true, + ProviderKind::InMemory => true, + ProviderKind::Replicated => self + .logs + .get_replicated_loglet(&LogletId::new(*log_id, segment.index())) + .expect("to be present") + .params + .nodeset + .contains(node_id), + } { + return Err(DisableNodeError::NodeSetMember(LogletId::new( + *log_id, + segment.index(), + ))); + } + } + } + + Ok(()) + } +} diff --git a/tools/restatectl/src/commands/node/mod.rs b/tools/restatectl/src/commands/node/mod.rs index 55b950296..79a80940e 100644 --- a/tools/restatectl/src/commands/node/mod.rs +++ b/tools/restatectl/src/commands/node/mod.rs @@ -8,7 +8,9 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. +pub mod disable_node_checker; pub mod list_nodes; +mod remove_nodes; use cling::prelude::*; @@ -16,4 +18,8 @@ use cling::prelude::*; pub enum Nodes { /// Print a summary of the active nodes registered in a cluster List(list_nodes::ListNodesOpts), + /// Removes the given node/s from the cluster. You should only use this command if you are + /// certain that the specified nodes are no longer part of any node sets, not members of the + /// metadata cluster nor required to run partition processors. + Remove(remove_nodes::RemoveNodesOpts), } diff --git a/tools/restatectl/src/commands/node/remove_nodes.rs b/tools/restatectl/src/commands/node/remove_nodes.rs new file mode 100644 index 000000000..95f835eb9 --- /dev/null +++ b/tools/restatectl/src/commands/node/remove_nodes.rs @@ -0,0 +1,100 @@ +// Copyright (c) 2023 - 2025 Restate Software, Inc., Restate GmbH. +// All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use crate::commands::node::disable_node_checker::DisableNodeChecker; +use crate::connection::ConnectionInfo; +use anyhow::Context; +use clap::Parser; +use cling::{Collect, Run}; +use itertools::Itertools; +use restate_cli_util::c_println; +use restate_core::metadata_store::{MetadataStoreClient, Precondition}; +use restate_metadata_server::create_client; +use restate_types::PlainNodeId; +use restate_types::config::{MetadataClientKind, MetadataClientOptions}; +use restate_types::metadata_store::keys::NODES_CONFIG_KEY; +use restate_types::nodes_config::{NodesConfiguration, Role}; + +#[derive(Run, Parser, Collect, Clone, Debug)] +#[clap(alias = "rm")] +#[cling(run = "remove_nodes")] +pub struct RemoveNodesOpts { + /// The node/s to remove from the cluster. Specify multiple nodes as a comma-separated list or + /// specify the option multiple times. + #[arg(long, required = true, visible_alias = "node", value_delimiter = ',')] + nodes: Vec, +} + +pub async fn remove_nodes( + connection: &ConnectionInfo, + opts: &RemoveNodesOpts, +) -> anyhow::Result<()> { + let nodes_configuration = connection.get_nodes_configuration().await?; + let metadata_client = create_metadata_client(&nodes_configuration).await?; + let logs = connection.get_logs().await?; + + let disable_node_checker = DisableNodeChecker::new(nodes_configuration, logs); + + for node_id in &opts.nodes { + disable_node_checker + .safe_to_disable_node(*node_id) + .context("It is not safe to disable node {node_id}")? + } + + let nodes_configuration = disable_node_checker.nodes_configuration(); + let mut updated_nodes_configuration = nodes_configuration.clone(); + + for node_id in &opts.nodes { + updated_nodes_configuration.remove_node_unchecked(*node_id); + } + + updated_nodes_configuration.increment_version(); + + metadata_client + .put( + NODES_CONFIG_KEY.clone(), + &updated_nodes_configuration, + Precondition::MatchesVersion(nodes_configuration.version()), + ) + .await?; + + c_println!( + "Successfully removed nodes [{}]", + opts.nodes.iter().join(",") + ); + + Ok(()) +} + +async fn create_metadata_client( + nodes_configuration: &NodesConfiguration, +) -> anyhow::Result { + // find metadata nodes + let addresses: Vec<_> = nodes_configuration + .iter_role(Role::MetadataServer) + .map(|(_, config)| config.address.clone()) + .collect(); + if addresses.is_empty() { + return Err(anyhow::anyhow!( + "No nodes are configured to run metadata-server role, this command only \ + supports replicated metadata deployment" + )); + } + + // todo make this work with other metadata client kinds as well; maybe proxy through Restate server + let metadata_store_client_options = MetadataClientOptions { + kind: MetadataClientKind::Replicated { addresses }, + ..Default::default() + }; + + create_client(metadata_store_client_options) + .await + .map_err(|e| anyhow::anyhow!("Failed to create metadata store client: {}", e)) +} diff --git a/tools/restatectl/src/connection.rs b/tools/restatectl/src/connection.rs index 74b01ebed..4b94ba15b 100644 --- a/tools/restatectl/src/connection.rs +++ b/tools/restatectl/src/connection.rs @@ -10,7 +10,7 @@ use std::collections::{HashMap, HashSet}; use std::sync::RwLock; -use std::{cmp::Ordering, fmt::Display, future::Future, sync::Arc}; +use std::{cmp::Ordering, fmt::Display, sync::Arc}; use cling::{Collect, prelude::Parser}; use itertools::{Either, Itertools, Position};