From 030564ec8969e09c62bf20e09513f1f6fef56d79 Mon Sep 17 00:00:00 2001 From: Koorous Vargha Date: Mon, 27 Jan 2025 11:33:26 -0800 Subject: [PATCH] [controller] Set rebalance preference to prioritize evenness when creating the controller cluster. Also set capacity keys to enable top-state even distribution --- .../venice/controller/ZkHelixAdminClient.java | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/services/venice-controller/src/main/java/com/linkedin/venice/controller/ZkHelixAdminClient.java b/services/venice-controller/src/main/java/com/linkedin/venice/controller/ZkHelixAdminClient.java index d08c8c551a7..b1f667c72ea 100644 --- a/services/venice-controller/src/main/java/com/linkedin/venice/controller/ZkHelixAdminClient.java +++ b/services/venice-controller/src/main/java/com/linkedin/venice/controller/ZkHelixAdminClient.java @@ -7,6 +7,7 @@ import com.linkedin.venice.utils.RetryUtils; import io.tehuti.metrics.MetricsRepository; import java.time.Duration; +import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.List; @@ -99,6 +100,32 @@ public void createVeniceControllerCluster() { // choose proper instance to hold the replica. clusterConfig.setTopologyAwareEnabled(false); + // We want to prioritize evenness over less movement when it comes to resource assignment, because the cost + // of rebalancing for the controller is cheap as it is stateless. 
+ Map globalRebalancePreference = new HashMap<>(); + globalRebalancePreference.put(ClusterConfig.GlobalRebalancePreferenceKey.EVENNESS, 10); + globalRebalancePreference.put(ClusterConfig.GlobalRebalancePreferenceKey.LESS_MOVEMENT, 1); + // This should be turned off, so it doesn't overpower other constraint calculations + globalRebalancePreference.put(ClusterConfig.GlobalRebalancePreferenceKey.FORCE_BASELINE_CONVERGE, 0); + clusterConfig.setGlobalRebalancePreference(globalRebalancePreference); + + String resourceCapacityKey = "cluster_resource_weight"; + List instanceCapacityKeys = new ArrayList<>(); + instanceCapacityKeys.add(resourceCapacityKey); + clusterConfig.setInstanceCapacityKeys(instanceCapacityKeys); + + // This is how much capacity a participant can take. The Helix documentation recommends setting this to a high + // value to avoid rebalance failures. The primary goal of setting this is to enable a constraint that takes the + // current top-state distribution into account when rebalancing. + Map defaultInstanceCapacityMap = new HashMap<>(); + defaultInstanceCapacityMap.put(resourceCapacityKey, 10000); + clusterConfig.setDefaultInstanceCapacityMap(defaultInstanceCapacityMap); + + // This is the default weight each resource partition counts against an instance's capacity + Map defaultPartitionWeightMap = new HashMap<>(); + defaultPartitionWeightMap.put(resourceCapacityKey, 100); + clusterConfig.setDefaultPartitionWeightMap(defaultPartitionWeightMap); + updateClusterConfigs(controllerClusterName, clusterConfig); helixAdmin.addStateModelDef(controllerClusterName, LeaderStandbySMD.name, LeaderStandbySMD.build());