From df033f1c569acfabd47f072800a588a216e45068 Mon Sep 17 00:00:00 2001 From: Lakshya Taragi Date: Fri, 21 Feb 2025 21:11:53 +0530 Subject: [PATCH 1/6] Fix flaky tests in SegmentReplicationAllocationIT Signed-off-by: Lakshya Taragi --- .../SegmentReplicationAllocationIT.java | 34 ++++++++++++++----- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java index 669e24f9fb555..5a311b1a4035f 100644 --- a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java @@ -25,6 +25,7 @@ import org.opensearch.test.junit.annotations.TestLogging; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.Map; import java.util.concurrent.TimeUnit; @@ -169,14 +170,16 @@ public void testSingleIndexShardAllocation() throws Exception { // Remove a node internalCluster().stopRandomNode(InternalTestCluster.nameFilter(nodeNames.get(0))); - ensureGreen(TimeValue.timeValueSeconds(60)); + internalCluster().validateClusterFormed(); + ensureGreen(TimeValue.timeValueSeconds(120)); state = client().admin().cluster().prepareState().execute().actionGet().getState(); logger.info(ShardAllocations.printShardDistribution(state)); verifyPerIndexPrimaryBalance(); // Add a new node internalCluster().startDataOnlyNode(); - ensureGreen(TimeValue.timeValueSeconds(60)); + internalCluster().validateClusterFormed(); + ensureGreen(TimeValue.timeValueSeconds(120)); state = client().admin().cluster().prepareState().execute().actionGet().getState(); logger.info(ShardAllocations.printShardDistribution(state)); verifyPerIndexPrimaryBalance(); @@ -250,12 +253,21 @@ public void testAllocationAndRebalanceWithDisruption() throws Exception { internalCluster().startClusterManagerOnlyNode(); final int maxReplicaCount = 2; final int maxShardCount = 2; - // Create higher number of nodes than number of shards to reduce chances of SameShardAllocationDecider kicking-in - // and preventing primary relocations - final int nodeCount = randomIntBetween(5, 10); final int numberOfIndices = randomIntBetween(1, 10); - final float buffer = randomIntBetween(1, 4) * 0.10f; + List> shardAndReplicaCounts = new ArrayList<>(); + int shardCount, replicaCount, totalPrimaryShards = 0; + for (int i = 0; i < numberOfIndices; i++) { + shardCount = randomIntBetween(1, maxShardCount); + replicaCount = randomIntBetween(1, maxReplicaCount); + shardAndReplicaCounts.add(Arrays.asList(shardCount, replicaCount)); + totalPrimaryShards += shardCount; + } + + // Create a strictly higher number of nodes than number of shards to reduce chances of SameShardAllocationDecider kicking-in + // and preventing primary relocations + final int nodeCount = randomIntBetween(Math.max(3, totalPrimaryShards + 1), numberOfIndices * maxShardCount + 1); + final float buffer = randomIntBetween(1, 4) * 0.10f; logger.info("--> Creating {} nodes", nodeCount); final List nodeNames = new ArrayList<>(); for (int i = 0; i < nodeCount; i++) { @@ -263,11 +275,10 @@ public void testAllocationAndRebalanceWithDisruption() throws Exception { } setAllocationRelocationStrategy(true, true, buffer); - int shardCount, replicaCount; ClusterState state; for (int i = 0; i < numberOfIndices; i++) { - shardCount = randomIntBetween(1, maxShardCount); - replicaCount = randomIntBetween(1, maxReplicaCount); + shardCount = shardAndReplicaCounts.get(i).get(0); + replicaCount = shardAndReplicaCounts.get(i).get(1); logger.info("--> Creating index test{} with primary {} and replica {}", i, shardCount, replicaCount); createIndex("test" + i, shardCount, replicaCount, i % 2 == 0); ensureGreen(TimeValue.timeValueSeconds(60)); @@ -353,12 +364,17 @@ private void verifyPrimaryBalance(float buffer) throws Exception { totalPrimaryShards += index.primaryShardsActive(); } final int avgPrimaryShardsPerNode = (int) Math.ceil(totalPrimaryShards * 1f / currentState.getRoutingNodes().size()); + logger.info("--> totalPrimaryShards = {}", totalPrimaryShards); + logger.info("--> totalNodes = {}", currentState.getRoutingNodes().size()); + logger.info("--> avgPrimaryShardsPerNode = {}", avgPrimaryShardsPerNode); + logger.info("--> UL = {}", avgPrimaryShardsPerNode * (1 + buffer)); for (RoutingNode node : nodes) { final int primaryCount = node.shardsWithState(STARTED) .stream() .filter(ShardRouting::primary) .collect(Collectors.toList()) .size(); + logger.info("--> {}: primaryCount = {}", node.nodeId(), primaryCount); assertTrue(primaryCount <= (avgPrimaryShardsPerNode * (1 + buffer))); } }, 60, TimeUnit.SECONDS); From a20b30587870de025081a7274a916c0186c5bc13 Mon Sep 17 00:00:00 2001 From: Lakshya Taragi Date: Fri, 21 Feb 2025 21:25:35 +0530 Subject: [PATCH 2/6] Remove extra logs Signed-off-by: Lakshya Taragi --- .../indices/replication/SegmentReplicationAllocationIT.java | 5 ----- 1 file changed, 5 deletions(-) diff --git a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java index 5a311b1a4035f..f85b5a671bf6e 100644 --- a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java @@ -364,17 +364,12 @@ private void verifyPrimaryBalance(float buffer) throws Exception { totalPrimaryShards += index.primaryShardsActive(); } final int avgPrimaryShardsPerNode = (int) Math.ceil(totalPrimaryShards * 1f / currentState.getRoutingNodes().size()); - logger.info("--> totalPrimaryShards = {}", totalPrimaryShards); - logger.info("--> totalNodes = {}", currentState.getRoutingNodes().size()); - logger.info("--> avgPrimaryShardsPerNode = {}", avgPrimaryShardsPerNode); - logger.info("--> UL = {}", avgPrimaryShardsPerNode * (1 + buffer)); for (RoutingNode node : nodes) { final int primaryCount = node.shardsWithState(STARTED) .stream() .filter(ShardRouting::primary) .collect(Collectors.toList()) .size(); - logger.info("--> {}: primaryCount = {}", node.nodeId(), primaryCount); assertTrue(primaryCount <= (avgPrimaryShardsPerNode * (1 + buffer))); } }, 60, TimeUnit.SECONDS); From fcf302ea23d1f7d3feead3c2c78b110a0509cccc Mon Sep 17 00:00:00 2001 From: Lakshya Taragi Date: Fri, 21 Feb 2025 23:09:36 +0530 Subject: [PATCH 3/6] Account for replicas as well Signed-off-by: Lakshya Taragi --- .../replication/SegmentReplicationAllocationIT.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java index f85b5a671bf6e..efaba8ac2cbf3 100644 --- a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java @@ -254,19 +254,19 @@ public void testAllocationAndRebalanceWithDisruption() throws Exception { final int maxReplicaCount = 2; final int maxShardCount = 2; final int numberOfIndices = randomIntBetween(1, 10); + final int maxPossibleShards = numberOfIndices * maxShardCount * (1 + maxReplicaCount); List> shardAndReplicaCounts = new ArrayList<>(); - int shardCount, replicaCount, totalPrimaryShards = 0; + int shardCount, replicaCount, totalShards = 0; for (int i = 0; i < numberOfIndices; i++) { shardCount = randomIntBetween(1, maxShardCount); replicaCount = randomIntBetween(1, maxReplicaCount); shardAndReplicaCounts.add(Arrays.asList(shardCount, replicaCount)); - totalPrimaryShards += shardCount; + totalShards += shardCount * (1 + replicaCount); } - - // Create a strictly higher number of nodes than number of shards to reduce chances of SameShardAllocationDecider kicking-in + // Create a strictly higher number of nodes than the number of shards to reduce chances of SameShardAllocationDecider kicking-in // and preventing primary relocations - final int nodeCount = randomIntBetween(Math.max(3, totalPrimaryShards + 1), numberOfIndices * maxShardCount + 1); + final int nodeCount = randomIntBetween(totalShards, maxPossibleShards) + 1; final float buffer = randomIntBetween(1, 4) * 0.10f; logger.info("--> Creating {} nodes", nodeCount); final List nodeNames = new ArrayList<>(); From d9ba117ae3cadce321819bbb2f78310bbb581baf Mon Sep 17 00:00:00 2001 From: Lakshya Taragi Date: Sun, 23 Feb 2025 12:53:43 +0530 Subject: [PATCH 4/6] Reduce upper limit on no. of indices Signed-off-by: Lakshya Taragi --- .../indices/replication/SegmentReplicationAllocationIT.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java index efaba8ac2cbf3..0ee006130fbb3 100644 --- a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java @@ -253,7 +253,7 @@ public void testAllocationAndRebalanceWithDisruption() throws Exception { internalCluster().startClusterManagerOnlyNode(); final int maxReplicaCount = 2; final int maxShardCount = 2; - final int numberOfIndices = randomIntBetween(1, 10); + final int numberOfIndices = randomIntBetween(1, 3); final int maxPossibleShards = numberOfIndices * maxShardCount * (1 + maxReplicaCount); List> shardAndReplicaCounts = new ArrayList<>(); From 74b9b94237cb4c3411a62ae9f803fca564ac9718 Mon Sep 17 00:00:00 2001 From: Lakshya Taragi Date: Sun, 23 Feb 2025 12:56:30 +0530 Subject: [PATCH 5/6] Only verified changes Signed-off-by: Lakshya Taragi --- .../indices/replication/SegmentReplicationAllocationIT.java | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java index 0ee006130fbb3..610563f38d9af 100644 --- a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java @@ -170,16 +170,14 @@ public void testSingleIndexShardAllocation() throws Exception { // Remove a node internalCluster().stopRandomNode(InternalTestCluster.nameFilter(nodeNames.get(0))); - internalCluster().validateClusterFormed(); - ensureGreen(TimeValue.timeValueSeconds(120)); + ensureGreen(TimeValue.timeValueSeconds(60)); state = client().admin().cluster().prepareState().execute().actionGet().getState(); logger.info(ShardAllocations.printShardDistribution(state)); verifyPerIndexPrimaryBalance(); // Add a new node internalCluster().startDataOnlyNode(); - internalCluster().validateClusterFormed(); - ensureGreen(TimeValue.timeValueSeconds(120)); + ensureGreen(TimeValue.timeValueSeconds(60)); state = client().admin().cluster().prepareState().execute().actionGet().getState(); logger.info(ShardAllocations.printShardDistribution(state)); verifyPerIndexPrimaryBalance(); From 0651098e25afb91c418ba9cba541845391b9ff34 Mon Sep 17 00:00:00 2001 From: Lakshya Taragi Date: Tue, 25 Feb 2025 13:42:52 +0530 Subject: [PATCH 6/6] Fix testSingleIndexShardAllocation Signed-off-by: Lakshya Taragi --- .../indices/replication/SegmentReplicationAllocationIT.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java index 610563f38d9af..0b2cf93903ed9 100644 --- a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java @@ -170,14 +170,16 @@ public void testSingleIndexShardAllocation() throws Exception { // Remove a node internalCluster().stopRandomNode(InternalTestCluster.nameFilter(nodeNames.get(0))); - ensureGreen(TimeValue.timeValueSeconds(60)); + internalCluster().validateClusterFormed(); + ensureGreen(TimeValue.timeValueSeconds(100)); state = client().admin().cluster().prepareState().execute().actionGet().getState(); logger.info(ShardAllocations.printShardDistribution(state)); verifyPerIndexPrimaryBalance(); // Add a new node internalCluster().startDataOnlyNode(); - ensureGreen(TimeValue.timeValueSeconds(60)); + internalCluster().validateClusterFormed(); + ensureGreen(TimeValue.timeValueSeconds(100)); state = client().admin().cluster().prepareState().execute().actionGet().getState(); logger.info(ShardAllocations.printShardDistribution(state)); verifyPerIndexPrimaryBalance();