Skip to content

Commit 17e6dc3

Browse files
nvijayakCopilot
andcommitted
Fix regression: use legacy rebalancer in
addStorageClusterResourceToControllerCluster The previous fix called addVeniceStorageClusterToControllerCluster() from createClusterIfRequired(), which uses WagedRebalancer. WagedRebalancer requires instance capacity configuration not present in integration tests, so waitUntilClusterResourceIsVisibleInEV() blocked for 5 minutes, causing all integration tests that start a Venice cluster to hang and hit their TestNG timeouts. Fix: introduce addStorageClusterResourceToControllerCluster(clusterName, replicaCount) in HelixAdminClient/ZkHelixAdminClient. This method: - Uses controllerClusterHelixAdmin (routes to controller.cluster.zk.address) so it is correct in both shared-ZK and split-ZK deployments - Keeps the original DelayedAutoRebalancer + CrushRebalanceStrategy config from the old createClusterIfRequired implementation, preserving assignment behavior for the non-HAAS path createClusterIfRequired now calls helixAdminClient.addStorageClusterResourceToControllerCluster instead of the direct admin.addResource/setResourceIdealState/rebalance calls. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 6e69e74 commit 17e6dc3

3 files changed

Lines changed: 46 additions & 7 deletions

File tree

services/venice-controller/src/main/java/com/linkedin/venice/controller/HelixAdminClient.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,22 @@ public interface HelixAdminClient {
5151
*/
5252
void addVeniceStorageClusterToControllerCluster(String clusterName);
5353

54+
/**
55+
* Register the given Venice storage cluster as a resource in the controller cluster using the
56+
* non-HAAS rebalancer configuration ({@code DelayedAutoRebalancer} +
57+
* {@code CrushRebalanceStrategy}). Must route to the controller-cluster ZK (not the storage ZK)
58+
* so that the operation is visible to the Helix manager connected via
59+
* {@code controller.cluster.zk.address} in split-ZK deployments.
60+
*
61+
* <p>Used by {@link VeniceHelixAdmin#createClusterIfRequired} which handles the non-HAAS storage
62+
* cluster path. Unlike {@link #addVeniceStorageClusterToControllerCluster} (which uses
63+
* {@code WagedRebalancer}), this method preserves the original rebalancer used in the legacy path.
64+
*
65+
* @param clusterName name of the Venice storage cluster being registered
66+
* @param replicaCount number of replicas (controller instances) for this resource
67+
*/
68+
void addStorageClusterResourceToControllerCluster(String clusterName, int replicaCount);
69+
5470
/**
5571
* Check if the grand cluster managed by HaaS controllers is aware of the given cluster.
5672
* @param clusterName of the cluster.

services/venice-controller/src/main/java/com/linkedin/venice/controller/VeniceHelixAdmin.java

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7292,12 +7292,10 @@ private void createClusterIfRequired(String clusterName) {
72927292
delayedTime);
72937293
admin.addStateModelDef(clusterName, LeaderStandbySMD.name, LeaderStandbySMD.build());
72947294

7295-
// Register the storage cluster as a resource in the controller cluster. Must use
7296-
// helixAdminClient (routes to controller.cluster.zk.address) rather than admin (storage ZK),
7297-
// because in split-ZK deployments the controller cluster lives on the controller ZK.
7298-
if (!helixAdminClient.isVeniceStorageClusterInControllerCluster(clusterName)) {
7299-
helixAdminClient.addVeniceStorageClusterToControllerCluster(clusterName);
7300-
}
7295+
// Register the storage cluster as a resource in the controller cluster using the legacy
7296+
// (non-HAAS) rebalancer config. Must route to the controller ZK in split-ZK deployments.
7297+
int controllerClusterReplica = config.getControllerClusterReplica();
7298+
helixAdminClient.addStorageClusterResourceToControllerCluster(clusterName, controllerClusterReplica);
73017299
}
73027300

73037301
public boolean updateIdealState(String clusterName, String resourceName, int minReplica) {

services/venice-controller/src/main/java/com/linkedin/venice/controller/ZkHelixAdminClient.java

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import org.apache.helix.constants.InstanceConstants;
1818
import org.apache.helix.controller.rebalancer.DelayedAutoRebalancer;
1919
import org.apache.helix.controller.rebalancer.strategy.AutoRebalanceStrategy;
20+
import org.apache.helix.controller.rebalancer.strategy.CrushRebalanceStrategy;
2021
import org.apache.helix.controller.rebalancer.waged.WagedRebalancer;
2122
import org.apache.helix.manager.zk.ZKHelixAdmin;
2223
import org.apache.helix.manager.zk.ZKHelixManager;
@@ -291,8 +292,32 @@ public void addVeniceStorageClusterToControllerCluster(String clusterName) {
291292
}
292293

293294
/**
294-
* @see HelixAdminClient#isClusterInGrandCluster(String)
295+
* @see HelixAdminClient#addStorageClusterResourceToControllerCluster(String, int)
295296
*/
297+
@Override
298+
public void addStorageClusterResourceToControllerCluster(String clusterName, int replicaCount) {
299+
if (isVeniceStorageClusterInControllerCluster(clusterName)) {
300+
return;
301+
}
302+
// Use controllerClusterHelixAdmin (routes to controller.cluster.zk.address) so this works in
303+
// both shared-ZK and split-ZK deployments. Keep the legacy non-HAAS rebalancer config so
304+
// that the assignment algorithm is unchanged for this path.
305+
controllerClusterHelixAdmin.addResource(
306+
controllerClusterName,
307+
clusterName,
308+
CONTROLLER_CLUSTER_PARTITION_COUNT,
309+
LeaderStandbySMD.name,
310+
IdealState.RebalanceMode.FULL_AUTO.toString(),
311+
AutoRebalanceStrategy.class.getName());
312+
IdealState idealState = controllerClusterHelixAdmin.getResourceIdealState(controllerClusterName, clusterName);
313+
idealState.setReplicas(String.valueOf(replicaCount));
314+
idealState.setMinActiveReplicas(replicaCount);
315+
idealState.setRebalancerClassName(DelayedAutoRebalancer.class.getName());
316+
idealState.setRebalanceStrategy(CrushRebalanceStrategy.class.getName());
317+
controllerClusterHelixAdmin.setResourceIdealState(controllerClusterName, clusterName, idealState);
318+
controllerClusterHelixAdmin.rebalance(controllerClusterName, clusterName, replicaCount);
319+
}
320+
296321
@Override
297322
public boolean isClusterInGrandCluster(String clusterName) {
298323
return controllerClusterHelixAdmin.getResourcesInCluster(haasSuperClusterName).contains(clusterName);

0 commit comments

Comments
 (0)