Skip to content

Commit 6e69e74

Browse files
nvijayakCopilot
andcommitted
Fix createClusterIfRequired to use controller ZK for controller-cluster ops
In split-ZK mode, the controller cluster (venice-controllers) lives on controller.cluster.zk.address, not on zookeeper.address. The four admin.addResource/getResourceIdealState/setResourceIdealState/rebalance calls in createClusterIfRequired all used admin (storage ZK), causing 'cluster venice-controllers is not setup yet' because that cluster does not exist on the storage ZK. Fix: replace those calls with helixAdminClient.addVeniceStorageClusterToControllerCluster() which correctly routes to the controller ZK via controllerClusterHelixAdmin. This is safe in both shared-ZK (no behavioral change, same ZK) and split-ZK deployments. Also remove now-unused DelayedAutoRebalancer, CrushRebalanceStrategy imports and the CONTROLLER_CLUSTER_NUMBER_OF_PARTITION constant. Also fix TestVeniceControllerZkClientIsolation: change @BeforeClass/@afterclass for storageZk and controllerZk to @BeforeMethod/@AfterMethod. The split-ZK test creates venice-controllers on controllerZk; without per-method ZK lifecycle, that data leaks into testSharedZkBaseline which then fails its assertion that controllerZk holds no Venice clusters. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 743975d commit 6e69e74

2 files changed

Lines changed: 8 additions & 27 deletions

File tree

internal/venice-test-common/src/integrationTest/java/com/linkedin/venice/controller/TestVeniceControllerZkClientIsolation.java

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,8 @@
3434
import java.util.Properties;
3535
import java.util.concurrent.TimeUnit;
3636
import org.apache.helix.manager.zk.ZKHelixAdmin;
37-
import org.testng.annotations.AfterClass;
3837
import org.testng.annotations.AfterMethod;
39-
import org.testng.annotations.BeforeClass;
38+
import org.testng.annotations.BeforeMethod;
4039
import org.testng.annotations.Test;
4140

4241

@@ -74,7 +73,7 @@ public class TestVeniceControllerZkClientIsolation {
7473
private String clusterName;
7574
private String controllerClusterName;
7675

77-
@BeforeClass
76+
@BeforeMethod(alwaysRun = true)
7877
public void setUp() {
7978
Utils.thisIsLocalhost();
8079
storageZk = ServiceFactory.getZkServer();
@@ -86,10 +85,6 @@ public void setUp() {
8685
public void tearDownAdmin() {
8786
Utils.closeQuietlyWithErrorLogged(veniceAdmin);
8887
veniceAdmin = null;
89-
}
90-
91-
@AfterClass(alwaysRun = true)
92-
public void tearDown() {
9388
Utils.closeQuietlyWithErrorLogged(pubSubBrokerWrapper);
9489
Utils.closeQuietlyWithErrorLogged(controllerZk);
9590
Utils.closeQuietlyWithErrorLogged(storageZk);

services/venice-controller/src/main/java/com/linkedin/venice/controller/VeniceHelixAdmin.java

Lines changed: 6 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -307,9 +307,6 @@
307307
import org.apache.helix.HelixManagerProperty;
308308
import org.apache.helix.InstanceType;
309309
import org.apache.helix.PropertyKey;
310-
import org.apache.helix.controller.rebalancer.DelayedAutoRebalancer;
311-
import org.apache.helix.controller.rebalancer.strategy.AutoRebalanceStrategy;
312-
import org.apache.helix.controller.rebalancer.strategy.CrushRebalanceStrategy;
313310
import org.apache.helix.manager.zk.ZKHelixAdmin;
314311
import org.apache.helix.manager.zk.ZKHelixManager;
315312
import org.apache.helix.manager.zk.ZNRecordSerializer;
@@ -362,7 +359,6 @@ public class VeniceHelixAdmin implements Admin, StoreCleaner {
362359
private final String kafkaSSLBootstrapServers;
363360
private final Map<String, AdminConsumerService> adminConsumerServices = new ConcurrentHashMap<>();
364361

365-
private static final int CONTROLLER_CLUSTER_NUMBER_OF_PARTITION = 1;
366362
private static final long CONTROLLER_CLUSTER_RESOURCE_EV_TIMEOUT_MS = TimeUnit.MINUTES.toMillis(5);
367363
private static final long CONTROLLER_CLUSTER_RESOURCE_EV_CHECK_DELAY_MS = 500;
368364
private static final long HELIX_RESOURCE_ASSIGNMENT_RETRY_INTERVAL_MS = 500;
@@ -7296,22 +7292,12 @@ private void createClusterIfRequired(String clusterName) {
72967292
delayedTime);
72977293
admin.addStateModelDef(clusterName, LeaderStandbySMD.name, LeaderStandbySMD.build());
72987294

7299-
admin.addResource(
7300-
controllerClusterName,
7301-
clusterName,
7302-
CONTROLLER_CLUSTER_NUMBER_OF_PARTITION,
7303-
LeaderStandbySMD.name,
7304-
IdealState.RebalanceMode.FULL_AUTO.toString(),
7305-
AutoRebalanceStrategy.class.getName());
7306-
IdealState idealState = admin.getResourceIdealState(controllerClusterName, clusterName);
7307-
// Use crush alg to allocate controller as well.
7308-
int controllerClusterReplica = config.getControllerClusterReplica();
7309-
idealState.setReplicas(String.valueOf(controllerClusterReplica));
7310-
idealState.setMinActiveReplicas(controllerClusterReplica);
7311-
idealState.setRebalancerClassName(DelayedAutoRebalancer.class.getName());
7312-
idealState.setRebalanceStrategy(CrushRebalanceStrategy.class.getName());
7313-
admin.setResourceIdealState(controllerClusterName, clusterName, idealState);
7314-
admin.rebalance(controllerClusterName, clusterName, controllerClusterReplica);
7295+
// Register the storage cluster as a resource in the controller cluster. Must use
7296+
// helixAdminClient (routes to controller.cluster.zk.address) rather than admin (storage ZK),
7297+
// because in split-ZK deployments the controller cluster lives on the controller ZK.
7298+
if (!helixAdminClient.isVeniceStorageClusterInControllerCluster(clusterName)) {
7299+
helixAdminClient.addVeniceStorageClusterToControllerCluster(clusterName);
7300+
}
73157301
}
73167302

73177303
public boolean updateIdealState(String clusterName, String resourceName, int minReplica) {

0 commit comments

Comments
 (0)