From e895e701125c402adc53a57b095d5857f35240fc Mon Sep 17 00:00:00 2001 From: Danny Zaken Date: Tue, 25 Feb 2025 20:52:19 +0200 Subject: [PATCH] disable node disconnect on error and detention - Added `config.NODES_DISCONNECT_ON_ERROR` which is false by default. - In `nodes_monitor.report_error_on_node_blocks`, disconnect the node only if config.NODES_DISCONNECT_ON_ERROR is true. - Changed the default of NODE_IO_DETENTION_DISABLE to true. Most users usually have a single node in a pool, so putting a node into detention causes many issues. Signed-off-by: Danny Zaken --- config.js | 8 +++++++- src/server/node_services/nodes_monitor.js | 24 +++++++++++++---------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/config.js b/config.js index 3735e5446e..248fa655dc 100644 --- a/config.js +++ b/config.js @@ -91,7 +91,13 @@ config.NODES_FREE_SPACE_RESERVE = 100 * (1024 ** 2); // don't use agents with less than reserve + 5 GB config.MINIMUM_AGENT_TOTAL_STORAGE = config.NODES_FREE_SPACE_RESERVE + (5 * (1024 ** 3)); -config.NODE_IO_DETENTION_DISABLE = false; + +// by default not disconnecting nodes on error. This caused more issues than benefits +config.NODES_DISCONNECT_ON_ERROR = false; + +// by default not detaining nodes on io errors. This caused more issues than benefits +config.NODE_IO_DETENTION_DISABLE = true; + config.NODE_IO_DETENTION_THRESHOLD = 60 * 1000; config.NODE_IO_DETENTION_RECENT_ISSUES = 5; // Picked two because minimum of nodes per pool is three diff --git a/src/server/node_services/nodes_monitor.js b/src/server/node_services/nodes_monitor.js index e3b2d7d140..066511b9d3 100644 --- a/src/server/node_services/nodes_monitor.js +++ b/src/server/node_services/nodes_monitor.js @@ -247,7 +247,7 @@ class NodesMonitor extends EventEmitter { return P.resolve() .then(() => this._run()) .then(() => { - // do nothing. + // do nothing. 
}); } @@ -1012,7 +1012,7 @@ class NodesMonitor extends EventEmitter { }) .then(() => this._update_nodes_store('force')) .then(() => { - // do nothing. + // do nothing. }); } @@ -1236,7 +1236,7 @@ class NodesMonitor extends EventEmitter { if (item.node.deleted) return; if (!item.connection) return; if (!item.agent_info) return; - //The node should be set as enable if it is not decommissioned. + //The node should be set as enable if it is not decommissioned. const should_enable = !item.node.decommissioned; const item_pool = system_store.data.get_by_id(item.node.pool); const location_info = { @@ -1244,7 +1244,7 @@ class NodesMonitor extends EventEmitter { host_id: String(item.node.host_id), pool_id: String(item.node.pool), }; - // We should only add region if it is defined. + // We should only add region if it is defined. if (item_pool && !_.isUndefined(item_pool.region)) location_info.region = item_pool.region; // We should change the service enable field if the field is not equal to the decommissioned decision. const service_enabled_not_changed = (item.node.enabled === should_enable); @@ -3373,12 +3373,16 @@ class NodesMonitor extends EventEmitter { 'node', item.node.name, 'issues_report', item.node.issues_report, 'block_report', block_report); - // disconnect from the node to force reconnect - // only disconnect if enough time passed since last disconnect to avoid amplification of errors in R\W flows - const DISCONNECT_GRACE_PERIOD = 2 * 60 * 1000; // 2 minutes grace before another disconnect - if (!item.disconnect_time || item.disconnect_time + DISCONNECT_GRACE_PERIOD < Date.now()) { - dbg.log0('disconnecting node to force reconnect. 
node:', item.node.name); - this._disconnect_node(item); + + + if (config.NODES_DISCONNECT_ON_ERROR) { + // disconnect from the node to force reconnect + // only disconnect if enough time passed since last disconnect to avoid amplification of errors in R\W flows + const DISCONNECT_GRACE_PERIOD = 2 * 60 * 1000; // 2 minutes grace before another disconnect + if (!item.disconnect_time || item.disconnect_time + DISCONNECT_GRACE_PERIOD < Date.now()) { + dbg.log0('disconnecting node to force reconnect. node:', item.node.name); + this._disconnect_node(item); + } } } }