Don't deplete all the startup nodes after a series of ConnectionError or TimeoutError failures against every node; instead, keep one around so that the retry algorithm always has at least one node to work with

eoghanmurray · eoghanmurray · commit 699f8f6b05ea · 2025-07-04T13:55:28.000+01:00
diff --git a/redis/asyncio/cluster.py b/redis/asyncio/cluster.py
@@ -816,16 +816,28 @@ async def _execute_command(
                 return await target_node.execute_command(*args, **kwargs)
             except (BusyLoadingError, MaxConnectionsError):
                 raise
-            except (ConnectionError, TimeoutError):
-                # Connection retries are being handled in the node's
-                # Retry object.
-                # Remove the failed node from the startup nodes before we try
-                # to reinitialize the cluster
-                self.nodes_manager.startup_nodes.pop(target_node.name, None)
+            except (ConnectionError, TimeoutError) as e:
+                if len(self.nodes_manager.startup_nodes) == 1:
+                    # keep at least one node for retrying
+                    ce = RedisClusterException(
+                        'Redis Cluster cannot be connected. '
+                        'Connection or Timeout Errors across all startup nodes'
+                    )
+                    ce.__cause__ = e
+                    e = ce
+                else:
+                    # Connection retries are being handled in the node's
+                    # Retry object.
+                    # Remove the failed node from the startup nodes before we
+                    # try to reinitialize the cluster
+                    self.nodes_manager.startup_nodes.pop(
+                        target_node.name,
+                        None
+                    )
                 # Hard force of reinitialize of the node/slots setup
                 # and try again with the new setup
                 await self.aclose()
-                raise
+                raise e
             except (ClusterDownError, SlotNotCoveredError):
                 # ClusterDownError can occur during a failover and to get
                 # self-healed, we will try to reinitialize the cluster layout