Skip to content

Commit 4b6c223

Browse files
nvijayakCopilot
andcommitted
[controller][test] Add ZK isolation integration tests for split
controller/storage ZK Add two integration test classes that prove a Venice controller can be pointed at two separate ZooKeeper ensembles: TestVeniceControllerZkClientIsolation - testSharedZkBaseline: shared ZK regression guard (both clusters land on the single ZK, aliasing path in ZkHelixAdminClient is exercised) - testControllerClusterIsolatedFromStorageZk: split-ZK path — asserts via ZKHelixAdmin.getClusters() that the controller cluster lands on the controller ZK only, and the storage cluster lands on the storage ZK only. TestVeniceControllerZkIsolationWithHAAS - testControllerClusterAndSuperClusterIsolatedOnDedicatedZk: mirrors the TestHAASController pattern (VeniceClusterWrapper + HelixAsAServiceWrapper) but starts HaaS on a dedicated controller ZK and sets controller.cluster.zk.address accordingly. Asserts that each ZK ensemble holds exactly the Helix clusters it should — the controller ZK gets helix_controllers + venice-controllers, the Venice ZK gets only the storage cluster. These are the 'work backwards' integration tests Xun recommended: run the full controller integration suite once with shared ZK (existing TestHAASController covers this) and once with split ZK (new tests here). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent e171982 commit 4b6c223

2 files changed

Lines changed: 365 additions & 0 deletions

File tree

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
package com.linkedin.venice.controller;
2+
3+
import static com.linkedin.venice.ConfigKeys.ADMIN_HELIX_MESSAGING_CHANNEL_ENABLED;
4+
import static com.linkedin.venice.ConfigKeys.CLUSTER_NAME;
5+
import static com.linkedin.venice.ConfigKeys.CLUSTER_TO_D2;
6+
import static com.linkedin.venice.ConfigKeys.CLUSTER_TO_SERVER_D2;
7+
import static com.linkedin.venice.ConfigKeys.CONTROLLER_ADD_VERSION_VIA_ADMIN_PROTOCOL;
8+
import static com.linkedin.venice.ConfigKeys.CONTROLLER_CLUSTER_ZK_ADDRESSS;
9+
import static com.linkedin.venice.ConfigKeys.CONTROLLER_SSL_ENABLED;
10+
import static com.linkedin.venice.ConfigKeys.CONTROLLER_SYSTEM_SCHEMA_CLUSTER_NAME;
11+
import static com.linkedin.venice.ConfigKeys.DEFAULT_MAX_NUMBER_OF_PARTITIONS;
12+
import static com.linkedin.venice.ConfigKeys.DEFAULT_PARTITION_SIZE;
13+
import static com.linkedin.venice.ConfigKeys.KAFKA_BOOTSTRAP_SERVERS;
14+
import static com.linkedin.venice.ConfigKeys.KAFKA_REPLICATION_FACTOR;
15+
import static com.linkedin.venice.ConfigKeys.PARTICIPANT_MESSAGE_STORE_ENABLED;
16+
import static com.linkedin.venice.ConfigKeys.ZOOKEEPER_ADDRESS;
17+
import static org.testng.Assert.assertFalse;
18+
import static org.testng.Assert.assertNotNull;
19+
import static org.testng.Assert.assertTrue;
20+
21+
import com.linkedin.venice.integration.utils.D2TestUtils;
22+
import com.linkedin.venice.integration.utils.PubSubBrokerWrapper;
23+
import com.linkedin.venice.integration.utils.ServiceFactory;
24+
import com.linkedin.venice.integration.utils.ZkServerWrapper;
25+
import com.linkedin.venice.pubsub.PubSubTopicRepository;
26+
import com.linkedin.venice.utils.TestUtils;
27+
import com.linkedin.venice.utils.Time;
28+
import com.linkedin.venice.utils.Utils;
29+
import com.linkedin.venice.utils.VeniceProperties;
30+
import io.tehuti.metrics.MetricsRepository;
31+
import java.util.Collections;
32+
import java.util.List;
33+
import java.util.Optional;
34+
import java.util.Properties;
35+
import java.util.concurrent.TimeUnit;
36+
import org.apache.helix.manager.zk.ZKHelixAdmin;
37+
import org.testng.annotations.AfterClass;
38+
import org.testng.annotations.AfterMethod;
39+
import org.testng.annotations.BeforeClass;
40+
import org.testng.annotations.Test;
41+
42+
43+
/**
44+
* Proves that a Venice controller can be pointed at TWO separate ZooKeeper ensembles:
45+
* <ul>
46+
* <li>{@code zookeeper.address} (storage ZK) – holds the Venice storage-cluster Helix metadata
47+
* (storage cluster, its resources, store/version znodes).</li>
48+
* <li>{@code controller.cluster.zk.address} (controller ZK) – holds the controller-cluster Helix
49+
* metadata (the {@code venice-controllers} cluster: leader election, participant registration,
50+
* storage-cluster-to-controller assignment) and the HAAS grand cluster.</li>
51+
* </ul>
52+
*
53+
* This is the "work-backwards" integration test Xun asked for: it runs the controller bring-up once
54+
* with a SHARED ZK (the default/aliasing path) and once with SPLIT ZK, and asserts via a
55+
* ZkInspector-style probe ({@link ZKHelixAdmin#getClusters()} against each ensemble) that the
56+
* controller-cluster znodes and the storage-cluster znodes land on the expected — and only the
57+
* expected — ensemble.
58+
*
59+
* The split-ZK branch in {@link ZkHelixAdminClient} and {@code connectToControllerCluster} in
60+
* {@link VeniceHelixAdmin} is exercised here for the first time; with
61+
* {@code controller.cluster.zk.address} defaulting to {@code zookeeper.address}, no existing test
62+
* ever drives the two addresses apart.
63+
*/
64+
public class TestVeniceControllerZkClientIsolation {
65+
private static final long TIMEOUT_MS = 60 * Time.MS_PER_SECOND;
66+
67+
private ZkServerWrapper storageZk;
68+
private ZkServerWrapper controllerZk;
69+
private PubSubBrokerWrapper pubSubBrokerWrapper;
70+
71+
private final PubSubTopicRepository pubSubTopicRepository = new PubSubTopicRepository();
72+
73+
private VeniceHelixAdmin veniceAdmin;
74+
private String clusterName;
75+
private String controllerClusterName;
76+
77+
@BeforeClass
78+
public void setUp() {
79+
Utils.thisIsLocalhost();
80+
storageZk = ServiceFactory.getZkServer();
81+
controllerZk = ServiceFactory.getZkServer();
82+
pubSubBrokerWrapper = ServiceFactory.getPubSubBroker();
83+
}
84+
85+
@AfterMethod(alwaysRun = true)
86+
public void tearDownAdmin() {
87+
Utils.closeQuietlyWithErrorLogged(veniceAdmin);
88+
veniceAdmin = null;
89+
}
90+
91+
@AfterClass(alwaysRun = true)
92+
public void tearDown() {
93+
Utils.closeQuietlyWithErrorLogged(pubSubBrokerWrapper);
94+
Utils.closeQuietlyWithErrorLogged(controllerZk);
95+
Utils.closeQuietlyWithErrorLogged(storageZk);
96+
}
97+
98+
/**
99+
* SPLIT ZK: controller-cluster Helix metadata must live ONLY on the controller ZK, and the
100+
* storage-cluster Helix metadata must live ONLY on the storage ZK.
101+
*/
102+
@Test(timeOut = TIMEOUT_MS)
103+
public void testControllerClusterIsolatedFromStorageZk() {
104+
bringUpController(/* splitZk */ true);
105+
106+
// Create a store so the storage cluster has real Venice metadata on the storage ZK.
107+
String storeName = Utils.getUniqueString("isolation-store");
108+
veniceAdmin.createStore(clusterName, storeName, "owner", "\"string\"", "\"string\"");
109+
assertNotNull(veniceAdmin.getStore(clusterName, storeName), "Store should have been created");
110+
111+
List<String> clustersOnControllerZk = listHelixClusters(controllerZk.getAddress());
112+
List<String> clustersOnStorageZk = listHelixClusters(storageZk.getAddress());
113+
114+
// Controller ZK owns the controller cluster, NOT the storage cluster.
115+
assertTrue(
116+
clustersOnControllerZk.contains(controllerClusterName),
117+
"Controller ZK " + controllerZk.getAddress() + " must hold the controller cluster '" + controllerClusterName
118+
+ "', found: " + clustersOnControllerZk);
119+
assertFalse(
120+
clustersOnControllerZk.contains(clusterName),
121+
"Controller ZK must NOT hold the storage cluster '" + clusterName + "', found: " + clustersOnControllerZk);
122+
123+
// Storage ZK owns the storage cluster, NOT the controller cluster.
124+
assertTrue(
125+
clustersOnStorageZk.contains(clusterName),
126+
"Storage ZK " + storageZk.getAddress() + " must hold the storage cluster '" + clusterName + "', found: "
127+
+ clustersOnStorageZk);
128+
assertFalse(
129+
clustersOnStorageZk.contains(controllerClusterName),
130+
"Storage ZK must NOT hold the controller cluster '" + controllerClusterName + "', found: "
131+
+ clustersOnStorageZk);
132+
}
133+
134+
/**
135+
* SHARED ZK baseline (the default/aliasing path): with {@code controller.cluster.zk.address} ==
136+
* {@code zookeeper.address}, BOTH clusters land on the single ZK and the second ZK stays empty of
137+
* Venice clusters. Guards against the split-ZK change regressing the common deployment.
138+
*/
139+
@Test(timeOut = TIMEOUT_MS)
140+
public void testSharedZkBaseline() {
141+
bringUpController(/* splitZk */ false);
142+
143+
List<String> clustersOnStorageZk = listHelixClusters(storageZk.getAddress());
144+
List<String> clustersOnControllerZk = listHelixClusters(controllerZk.getAddress());
145+
146+
assertTrue(
147+
clustersOnStorageZk.contains(controllerClusterName) && clustersOnStorageZk.contains(clusterName),
148+
"With shared ZK, both controller and storage clusters must live on the single ZK, found: "
149+
+ clustersOnStorageZk);
150+
assertFalse(
151+
clustersOnControllerZk.contains(controllerClusterName) || clustersOnControllerZk.contains(clusterName),
152+
"The unused second ZK must not host any Venice cluster in the shared-ZK case, found: "
153+
+ clustersOnControllerZk);
154+
}
155+
156+
/** Builds and starts a real VeniceHelixAdmin (controller) wired to one or two ZK ensembles. */
157+
private void bringUpController(boolean splitZk) {
158+
clusterName = Utils.getUniqueString("test-cluster");
159+
160+
Properties props = TestUtils.getPropertiesForControllerConfig();
161+
props.put(ZOOKEEPER_ADDRESS, storageZk.getAddress());
162+
if (splitZk) {
163+
props.put(CONTROLLER_CLUSTER_ZK_ADDRESSS, controllerZk.getAddress());
164+
}
165+
props.put(CLUSTER_NAME, clusterName);
166+
props.put(KAFKA_REPLICATION_FACTOR, 1);
167+
props.put(KAFKA_BOOTSTRAP_SERVERS, pubSubBrokerWrapper.getAddress());
168+
props.put(DEFAULT_MAX_NUMBER_OF_PARTITIONS, 16);
169+
props.put(DEFAULT_PARTITION_SIZE, 10);
170+
props.put(CLUSTER_TO_D2, TestUtils.getClusterToD2String(Collections.singletonMap(clusterName, "dummy_d2")));
171+
props.put(
172+
CLUSTER_TO_SERVER_D2,
173+
TestUtils.getClusterToD2String(Collections.singletonMap(clusterName, "dummy_server_d2")));
174+
props.put(CONTROLLER_ADD_VERSION_VIA_ADMIN_PROTOCOL, true);
175+
props.put(ADMIN_HELIX_MESSAGING_CHANNEL_ENABLED, false);
176+
props.put(PARTICIPANT_MESSAGE_STORE_ENABLED, true);
177+
props.put(CONTROLLER_SYSTEM_SCHEMA_CLUSTER_NAME, clusterName);
178+
props.put(CONTROLLER_SSL_ENABLED, false);
179+
props.putAll(PubSubBrokerWrapper.getBrokerDetailsForClients(Collections.singletonList(pubSubBrokerWrapper)));
180+
181+
VeniceControllerClusterConfig controllerConfig = new VeniceControllerClusterConfig(new VeniceProperties(props));
182+
VeniceControllerMultiClusterConfig multiClusterConfig =
183+
TestUtils.getMultiClusterConfigFromOneCluster(controllerConfig);
184+
controllerClusterName = multiClusterConfig.getControllerClusterName();
185+
186+
veniceAdmin = new VeniceHelixAdmin(
187+
multiClusterConfig,
188+
new MetricsRepository(),
189+
D2TestUtils.getAndStartD2Client(storageZk.getAddress()),
190+
pubSubTopicRepository,
191+
pubSubBrokerWrapper.getPubSubClientsFactory(),
192+
pubSubBrokerWrapper.getPubSubPositionTypeRegistry(),
193+
Optional.empty(),
194+
Optional.empty(),
195+
Optional.empty());
196+
veniceAdmin.initStorageCluster(clusterName);
197+
198+
TestUtils.waitForNonDeterministicCompletion(
199+
TIMEOUT_MS,
200+
TimeUnit.MILLISECONDS,
201+
() -> veniceAdmin.isLeaderControllerFor(clusterName));
202+
}
203+
204+
/** ZkInspector-style probe: which Helix clusters physically exist on the given ZK ensemble. */
205+
private static List<String> listHelixClusters(String zkAddress) {
206+
ZKHelixAdmin admin = new ZKHelixAdmin(zkAddress);
207+
try {
208+
return admin.getClusters();
209+
} finally {
210+
admin.close();
211+
}
212+
}
213+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
package com.linkedin.venice.controller;
2+
3+
import static com.linkedin.venice.utils.TestUtils.waitForNonDeterministicAssertion;
4+
import static org.testng.Assert.assertFalse;
5+
import static org.testng.Assert.assertNotNull;
6+
import static org.testng.Assert.assertTrue;
7+
8+
import com.linkedin.venice.ConfigKeys;
9+
import com.linkedin.venice.integration.utils.HelixAsAServiceWrapper;
10+
import com.linkedin.venice.integration.utils.ServiceFactory;
11+
import com.linkedin.venice.integration.utils.VeniceClusterCreateOptions;
12+
import com.linkedin.venice.integration.utils.VeniceClusterWrapper;
13+
import com.linkedin.venice.integration.utils.VeniceControllerWrapper;
14+
import com.linkedin.venice.integration.utils.ZkServerWrapper;
15+
import com.linkedin.venice.utils.Time;
16+
import com.linkedin.venice.utils.Utils;
17+
import java.util.List;
18+
import java.util.Properties;
19+
import java.util.concurrent.TimeUnit;
20+
import org.apache.helix.manager.zk.ZKHelixAdmin;
21+
import org.testng.annotations.Test;
22+
23+
24+
/**
25+
* Production-shaped ZK isolation test using {@link HelixAsAServiceWrapper} — the same setup
26+
* pattern as {@link TestHAASController}.
27+
*
28+
* <p>{@link TestHAASController} already covers the shared-ZK case (HaaS and Venice on the same
29+
* ZK). This class covers the complementary split-ZK case: {@link HelixAsAServiceWrapper} is
30+
* started on a DEDICATED controller ZK and the Venice controller is configured with
31+
* {@code controller.cluster.zk.address = <controller ZK>}. The assertions are exactly what a
32+
* human would see by opening a ZK inspector against each ensemble: the controller ZK hosts only
33+
* the HAAS super cluster + {@code venice-controllers}, while the Venice ZK hosts only the storage
34+
* cluster.
35+
*/
36+
public class TestVeniceControllerZkIsolationWithHAAS {
37+
private static final String CONTROLLER_CLUSTER_NAME = "venice-controllers";
38+
39+
/**
40+
* SPLIT ZK (isolation): {@link HelixAsAServiceWrapper} runs on a dedicated controller ZK,
41+
* separate from the Venice storage ZK. The controller is configured with
42+
* {@code controller.cluster.zk.address = <controller ZK>}. After the controller becomes leader:
43+
* <ul>
44+
* <li>The controller ZK must hold the HAAS super cluster and the {@code venice-controllers}
45+
* controller cluster — but NOT the storage cluster.</li>
46+
* <li>The Venice ZK must hold only the storage cluster — but NOT the controller/super
47+
* clusters.</li>
48+
* </ul>
49+
* This is the "ZK inspector" assertion Xun described: open two ZK inspectors, confirm each
50+
* ensemble holds exactly the clusters it should.
51+
*/
52+
@Test(timeOut = 120 * Time.MS_PER_SECOND)
53+
public void testControllerClusterAndSuperClusterIsolatedOnDedicatedZk() {
54+
VeniceClusterCreateOptions options = new VeniceClusterCreateOptions.Builder().numberOfControllers(0)
55+
.numberOfServers(0)
56+
.numberOfRouters(0)
57+
.replicationFactor(1)
58+
.build();
59+
60+
// Dedicated ZK for the controller cluster + HAAS super cluster, separate from the Venice cluster ZK.
61+
try (ZkServerWrapper controllerZk = ServiceFactory.getZkServer();
62+
VeniceClusterWrapper venice = ServiceFactory.getVeniceCluster(options);
63+
HelixAsAServiceWrapper haas = startAndWaitForHAASToBeAvailable(controllerZk.getAddress())) {
64+
65+
String storageZkAddress = venice.getZk().getAddress();
66+
String storageClusterName = venice.getClusterName();
67+
68+
Properties controllerProps = new Properties();
69+
controllerProps.put(ConfigKeys.CONTROLLER_CLUSTER_LEADER_HAAS, String.valueOf(true));
70+
controllerProps
71+
.put(ConfigKeys.CONTROLLER_HAAS_SUPER_CLUSTER_NAME, HelixAsAServiceWrapper.HELIX_SUPER_CLUSTER_NAME);
72+
// The crux: route controller-cluster + HAAS grand-cluster Helix ops/participant join to the dedicated ZK.
73+
controllerProps.put(ConfigKeys.CONTROLLER_CLUSTER_ZK_ADDRESSS, controllerZk.getAddress());
74+
75+
VeniceControllerWrapper controller = venice.addVeniceController(controllerProps);
76+
77+
// Controller becomes leader of both its controller cluster (via HAAS) and the storage cluster.
78+
waitForNonDeterministicAssertion(
79+
30,
80+
TimeUnit.SECONDS,
81+
true,
82+
() -> assertTrue(
83+
controller.isLeaderControllerOfControllerCluster(),
84+
"The controller should be the leader of the controller cluster"));
85+
waitForNonDeterministicAssertion(
86+
30,
87+
TimeUnit.SECONDS,
88+
true,
89+
() -> assertTrue(
90+
controller.isLeaderController(storageClusterName),
91+
"The controller should be the leader of storage cluster " + storageClusterName));
92+
93+
// Create a store so the storage cluster carries real Venice metadata on the Venice ZK.
94+
String storeName = Utils.getUniqueString("haas-isolation-store");
95+
assertNotNull(venice.getNewStore(storeName), "Store should have been created");
96+
97+
assertNotNull(haas.getSuperClusterLeader(), "HAAS super cluster should have a leader");
98+
99+
List<String> clustersOnControllerZk = listHelixClusters(controllerZk.getAddress());
100+
List<String> clustersOnStorageZk = listHelixClusters(storageZkAddress);
101+
102+
// Controller ZK: super cluster + controller cluster, and NOT the storage cluster.
103+
assertTrue(
104+
clustersOnControllerZk.contains(HelixAsAServiceWrapper.HELIX_SUPER_CLUSTER_NAME),
105+
"Controller ZK must host the HAAS super cluster, found: " + clustersOnControllerZk);
106+
assertTrue(
107+
clustersOnControllerZk.contains(CONTROLLER_CLUSTER_NAME),
108+
"Controller ZK must host the controller cluster '" + CONTROLLER_CLUSTER_NAME + "', found: "
109+
+ clustersOnControllerZk);
110+
assertFalse(
111+
clustersOnControllerZk.contains(storageClusterName),
112+
"Controller ZK must NOT host the storage cluster '" + storageClusterName + "', found: "
113+
+ clustersOnControllerZk);
114+
115+
// Venice ZK: storage cluster only, and NOT the controller/super clusters.
116+
assertTrue(
117+
clustersOnStorageZk.contains(storageClusterName),
118+
"Venice ZK must host the storage cluster '" + storageClusterName + "', found: " + clustersOnStorageZk);
119+
assertFalse(
120+
clustersOnStorageZk.contains(CONTROLLER_CLUSTER_NAME)
121+
|| clustersOnStorageZk.contains(HelixAsAServiceWrapper.HELIX_SUPER_CLUSTER_NAME),
122+
"Venice ZK must NOT host the controller/super clusters, found: " + clustersOnStorageZk);
123+
}
124+
}
125+
126+
private HelixAsAServiceWrapper startAndWaitForHAASToBeAvailable(String zkAddress) {
127+
HelixAsAServiceWrapper haas = null;
128+
try {
129+
haas = ServiceFactory.getHelixController(zkAddress);
130+
final HelixAsAServiceWrapper finalHaas = haas;
131+
waitForNonDeterministicAssertion(
132+
30,
133+
TimeUnit.SECONDS,
134+
true,
135+
() -> assertNotNull(finalHaas.getSuperClusterLeader(), "Helix super cluster doesn't have a leader yet"));
136+
return haas;
137+
} catch (Exception e) {
138+
Utils.closeQuietlyWithErrorLogged(haas);
139+
throw e;
140+
}
141+
}
142+
143+
/** ZkInspector-style probe: which Helix clusters physically exist on the given ZK ensemble. */
144+
private static List<String> listHelixClusters(String zkAddress) {
145+
ZKHelixAdmin admin = new ZKHelixAdmin(zkAddress);
146+
try {
147+
return admin.getClusters();
148+
} finally {
149+
admin.close();
150+
}
151+
}
152+
}

0 commit comments

Comments
 (0)