Commit

[controller] Delete backup version before start of new push in target regions (#1049)
When target-region push is enabled, after the push to the target region completes, the remaining regions do not delete the backup version before their own push starts. This drives disk usage up to 3x the store size (backup + current + future versions) for the duration of the push. This PR deletes the backup version before starting ingestion in the non-target regions.
majisourav99 authored Jun 27, 2024
1 parent c81f413 commit 7c582e6
Showing 2 changed files with 50 additions and 13 deletions.
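In essence, the controller now retires the backup version immediately after the incoming version's resources are created, before ingestion begins, and treats the cleanup as best effort. A condensed sketch of that change, using the member names that appear in the controller diff below:

    // Inside createHelixResourceAndStartMonitoring, once the new version's
    // resources are set up but before ingestion starts:
    startMonitorOfflinePush(
        clusterName,
        version.kafkaTopicName(),
        version.getPartitionCount(),
        store.getReplicationFactor(),
        store.getOffLinePushStrategy());
    helixAdminClient.createVeniceStorageClusterResources(
        clusterName,
        version.kafkaTopicName(),
        version.getPartitionCount(),
        store.getReplicationFactor());
    try {
      // Deleting the backup version up front caps peak disk usage at roughly 2x the
      // store size (current + future) instead of 3x (backup + current + future).
      retireOldStoreVersions(clusterName, storeName, true, store.getCurrentVersion());
    } catch (Throwable t) {
      // Best effort: failure to clean up the backup should not fail the new push.
      LOGGER.error("Failed to delete backup version for store {} in cluster {}", storeName, clusterName, t);
    }

The integration test below exercises this by running a targeted-region push and asserting that the combined size of all RocksDB version directories for the store stays within 2x of a single version's size.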
File: TestPushJobWithNativeReplication.java
@@ -75,6 +75,7 @@
import com.linkedin.venice.integration.utils.VeniceControllerWrapper;
import com.linkedin.venice.integration.utils.VeniceMultiClusterWrapper;
import com.linkedin.venice.integration.utils.VeniceRouterWrapper;
import com.linkedin.venice.integration.utils.VeniceServerWrapper;
import com.linkedin.venice.integration.utils.VeniceTwoLayerMultiRegionMultiClusterWrapper;
import com.linkedin.venice.meta.DataReplicationPolicy;
import com.linkedin.venice.meta.HybridStoreConfig;
@@ -142,7 +143,6 @@
public class TestPushJobWithNativeReplication {
private static final Logger LOGGER = LogManager.getLogger(TestPushJobWithNativeReplication.class);
private static final int TEST_TIMEOUT = 2 * Time.MS_PER_MINUTE;

private static final int NUMBER_OF_CHILD_DATACENTERS = 2;
private static final int NUMBER_OF_CLUSTERS = 1;
private static final String[] CLUSTER_NAMES =
@@ -161,6 +161,7 @@ public class TestPushJobWithNativeReplication {

private PubSubTopicRepository pubSubTopicRepository = new PubSubTopicRepository();
private D2Client d2Client;
private VeniceServerWrapper serverWrapper;

@DataProvider(name = "storeSize")
public static Object[][] storeSize() {
@@ -179,7 +180,6 @@ public void setUp() {
serverProperties.setProperty(ROCKSDB_PLAIN_TABLE_FORMAT_ENABLED, "false");
serverProperties.setProperty(SERVER_DATABASE_CHECKSUM_VERIFICATION_ENABLED, "true");
serverProperties.setProperty(SERVER_DATABASE_SYNC_BYTES_INTERNAL_FOR_DEFERRED_WRITE_MODE, "300");

Properties controllerProps = new Properties();
// This property is required for test stores that have 10 partitions
controllerProps.put(DEFAULT_MAX_NUMBER_OF_PARTITIONS, 10);
@@ -209,7 +209,7 @@ public void setUp() {
.setZkStartupTimeout(3, TimeUnit.SECONDS)
.build();
D2ClientUtils.startClient(d2Client);

serverWrapper = clusterWrapper.getVeniceServers().get(0);
}

@AfterClass(alwaysRun = true)
@@ -957,19 +957,47 @@ public void testTargetedRegionPushJobFullConsumptionForBatchStore() throws Excep
}
});
}
String dataDBPathV1 = serverWrapper.getDataDirectory() + "/rocksdb/" + storeName + "_v1";
long storeSize = FileUtils.sizeOfDirectory(new File(dataDBPathV1));
try (VenicePushJob job = new VenicePushJob("Test push job 2", props)) {
job.run();

TestUtils.waitForNonDeterministicAssertion(45, TimeUnit.SECONDS, () -> {
for (int version: parentControllerClient.getStore(storeName)
.getStore()
.getColoToCurrentVersions()
.values()) {
Assert.assertEquals(version, 2);
}
});
}
props.put(TARGETED_REGION_PUSH_ENABLED, true);
// props.put(POST_VALIDATION_CONSUMPTION_ENABLED, true);
TestWriteUtils.writeSimpleAvroFileWithStringToStringSchema(inputDir, 20);
try (VenicePushJob job = new VenicePushJob("Test push job 2", props)) {
try (VenicePushJob job = new VenicePushJob("Test push job 3", props)) {
job.run(); // the job should succeed

File directory = new File(serverWrapper.getDataDirectory() + "/rocksdb/");
File[] storeDBDirs = directory.listFiles(File::isDirectory);
long totalStoreSize = 0;
if (storeDBDirs != null) {
for (File storeDB: storeDBDirs) {
if (storeDB.getName().startsWith(storeName)) {
long size = FileUtils
.sizeOfDirectory(new File(serverWrapper.getDataDirectory() + "/rocksdb/" + storeDB.getName()));
totalStoreSize += size;
}
}
Assert.assertTrue(
storeSize * 2 >= totalStoreSize,
"Total store size " + totalStoreSize + " should be within 2x of a single version's size " + storeSize);
}
TestUtils.waitForNonDeterministicAssertion(45, TimeUnit.SECONDS, () -> {
// Current version should become 3
for (int version: parentControllerClient.getStore(storeName)
.getStore()
.getColoToCurrentVersions()
.values()) {
Assert.assertEquals(version, 2);
Assert.assertEquals(version, 3);
}
// should be able to read all 20 records.
validateDaVinciClient(storeName, 20);

File: VeniceHelixAdmin.java
@@ -2206,17 +2206,26 @@ public void createHelixResourceAndStartMonitoring(String clusterName, String sto
if (store == null) {
throwStoreDoesNotExist(clusterName, storeName);
} else {
helixAdminClient.createVeniceStorageClusterResources(
clusterName,
version.kafkaTopicName(),
version.getPartitionCount(),
store.getReplicationFactor());
startMonitorOfflinePush(
clusterName,
version.kafkaTopicName(),
version.getPartitionCount(),
store.getReplicationFactor(),
store.getOffLinePushStrategy());
helixAdminClient.createVeniceStorageClusterResources(
clusterName,
version.kafkaTopicName(),
version.getPartitionCount(),
store.getReplicationFactor());
try {
retireOldStoreVersions(clusterName, storeName, true, store.getCurrentVersion());
} catch (Throwable t) {
LOGGER.error(
"Failed to delete previous backup version while data recovery to store {} in cluster {}",
storeName,
clusterName,
t);
}
}
}

@@ -3395,7 +3404,7 @@ public void retireOldStoreVersions(
return;
}

if (store.getBackupStrategy() == BackupStrategy.DELETE_ON_NEW_PUSH_START) {
if (deleteBackupOnStartPush) {
LOGGER.info("Deleting backup versions as the new push started for upcoming version for store: {}", storeName);
} else {
LOGGER.info("Retiring old versions after successful push for store: {}", storeName);
