
Commit bb092b9

[server] Allocated a dedicated thread pool for ingestion DB lookup (#1525)
Based on a prod experiment, the SSD in use does not perform well when parallel processing is enabled for the AA/WC workload: database lookup concurrency ends up roughly the same as the parallel-processing concurrency, and we saw higher lookup latency with high CPU wait. This PR introduces a separate thread pool for ingestion database lookups (value and RMD), so lookup concurrency can be controlled independently of the parallel-processing concurrency. In a canary test, a lower DB lookup concurrency reduced CPU wait while delivering slightly better overall throughput. New config: server.aa.wc.ingestion.storage.lookup.thread.pool.size (default: 4)
1 parent b68eb16 commit bb092b9
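
The mechanism behind the change is small: every value/RMD lookup on the AA/WC hot path is funneled through a dedicated fixed-size executor and awaited synchronously, so storage-lookup concurrency is capped by the new pool instead of by the parallel-processing pool. Below is a simplified, standalone sketch of that pattern; the real implementation is the databaseLookupWithConcurrencyLimit helper added to LeaderFollowerStoreIngestionTask in this diff, and the class and field names here are illustrative only.

import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.function.Supplier;

// Illustrative only -- not Venice code. Mirrors the shape of the
// databaseLookupWithConcurrencyLimit helper introduced by this commit.
public class IngestionLookupPoolSketch {
  // Sized via server.aa.wc.ingestion.storage.lookup.thread.pool.size (default 4),
  // independently of the AA/WC parallel-processing pool.
  private final ExecutorService lookupPool = Executors.newFixedThreadPool(4);
  private final boolean parallelProcessingEnabled;

  public IngestionLookupPoolSketch(boolean parallelProcessingEnabled) {
    this.parallelProcessingEnabled = parallelProcessingEnabled;
  }

  <T> T lookupWithConcurrencyLimit(Supplier<T> lookup) {
    if (!parallelProcessingEnabled) {
      // Without parallel processing there is no lookup fan-out, so run inline.
      return lookup.get();
    }
    try {
      // Submitting and blocking on the result caps concurrent storage reads at the
      // lookup pool size, no matter how many processing threads call in.
      return lookupPool.submit(() -> lookup.get()).get();
    } catch (InterruptedException | ExecutionException e) {
      // The commit wraps these in VeniceException; a plain runtime exception keeps
      // this sketch self-contained.
      throw new RuntimeException(e);
    }
  }
}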

File tree

9 files changed: +99 -31 lines


clients/da-vinci-client/src/main/java/com/linkedin/davinci/config/VeniceServerConfig.java

Lines changed: 9 additions & 1 deletion
@@ -54,6 +54,7 @@
 import static com.linkedin.venice.ConfigKeys.PUBSUB_TOPIC_MANAGER_METADATA_FETCHER_CONSUMER_POOL_SIZE;
 import static com.linkedin.venice.ConfigKeys.PUBSUB_TOPIC_MANAGER_METADATA_FETCHER_THREAD_POOL_SIZE;
 import static com.linkedin.venice.ConfigKeys.ROUTER_PRINCIPAL_NAME;
+import static com.linkedin.venice.ConfigKeys.SERVER_AA_WC_INGESTION_STORAGE_LOOKUP_THREAD_POOL_SIZE;
 import static com.linkedin.venice.ConfigKeys.SERVER_AA_WC_LEADER_QUOTA_RECORDS_PER_SECOND;
 import static com.linkedin.venice.ConfigKeys.SERVER_AA_WC_WORKLOAD_PARALLEL_PROCESSING_ENABLED;
 import static com.linkedin.venice.ConfigKeys.SERVER_AA_WC_WORKLOAD_PARALLEL_PROCESSING_THREAD_POOL_SIZE;
@@ -587,6 +588,8 @@ public class VeniceServerConfig extends VeniceClusterConfig {
   private final boolean deleteUnassignedPartitionsOnStartup;
   private final int aclInMemoryCacheTTLMs;

+  private final int aaWCIngestionStorageLookupThreadPoolSize;
+
   public VeniceServerConfig(VeniceProperties serverProperties) throws ConfigurationException {
     this(serverProperties, Collections.emptyMap());
   }
@@ -991,7 +994,8 @@ public VeniceServerConfig(VeniceProperties serverProperties, Map<String, Map<Str
     deleteUnassignedPartitionsOnStartup =
         serverProperties.getBoolean(SERVER_DELETE_UNASSIGNED_PARTITIONS_ON_STARTUP, false);
     aclInMemoryCacheTTLMs = serverProperties.getInt(ACL_IN_MEMORY_CACHE_TTL_MS, -1); // acl caching is disabled by
-                                                                                     // default
+    aaWCIngestionStorageLookupThreadPoolSize =
+        serverProperties.getInt(SERVER_AA_WC_INGESTION_STORAGE_LOOKUP_THREAD_POOL_SIZE, 4);
   }

   long extractIngestionMemoryLimit(
@@ -1809,4 +1813,8 @@ public boolean isDeleteUnassignedPartitionsOnStartupEnabled() {
   public int getAclInMemoryCacheTTLMs() {
     return aclInMemoryCacheTTLMs;
   }
+
+  public int getAaWCIngestionStorageLookupThreadPoolSize() {
+    return aaWCIngestionStorageLookupThreadPoolSize;
+  }
 }

clients/da-vinci-client/src/main/java/com/linkedin/davinci/kafka/consumer/ActiveActiveStoreIngestionTask.java

Lines changed: 23 additions & 12 deletions
@@ -389,8 +389,8 @@ byte[] getRmdWithValueSchemaByteBufferFromStorage(
       ChunkedValueManifestContainer rmdManifestContainer,
       long currentTimeForMetricsMs) {
     final long lookupStartTimeInNS = System.nanoTime();
-    ValueRecord result = SingleGetChunkingAdapter
-        .getReplicationMetadata(getStorageEngine(), partition, key, isChunked(), rmdManifestContainer);
+    ValueRecord result = databaseLookupWithConcurrencyLimit(
+        () -> getRmdWithValueSchemaByteBufferFromStorageInternal(partition, key, rmdManifestContainer));
     getHostLevelIngestionStats().recordIngestionReplicationMetadataLookUpLatency(
         LatencyUtils.getElapsedTimeFromNSToMS(lookupStartTimeInNS),
         currentTimeForMetricsMs);
@@ -400,6 +400,15 @@ byte[] getRmdWithValueSchemaByteBufferFromStorage(
     return result.serialize();
   }

+  // For testing purpose
+  ValueRecord getRmdWithValueSchemaByteBufferFromStorageInternal(
+      int partition,
+      byte[] key,
+      ChunkedValueManifestContainer rmdManifestContainer) {
+    return SingleGetChunkingAdapter
+        .getReplicationMetadata(getStorageEngine(), partition, key, isChunked(), rmdManifestContainer);
+  }
+
   @Override
   protected IngestionBatchProcessor getIngestionBatchProcessor() {
     return ingestionBatchProcessorLazy.get();
@@ -755,16 +764,18 @@ private ByteBufferValueRecord<ByteBuffer> getValueBytesForKey(
       ReusableObjects reusableObjects = threadLocalReusableObjects.get();
       ByteBuffer reusedRawValue = reusableObjects.reusedByteBuffer;
       BinaryDecoder binaryDecoder = reusableObjects.binaryDecoder;
-      originalValue = RawBytesChunkingAdapter.INSTANCE.getWithSchemaId(
-          storageEngine,
-          topicPartition.getPartitionNumber(),
-          ByteBuffer.wrap(key),
-          isChunked,
-          reusedRawValue,
-          binaryDecoder,
-          RawBytesStoreDeserializerCache.getInstance(),
-          compressor.get(),
-          valueManifestContainer);
+
+      originalValue = databaseLookupWithConcurrencyLimit(
+          () -> RawBytesChunkingAdapter.INSTANCE.getWithSchemaId(
+              storageEngine,
+              topicPartition.getPartitionNumber(),
+              ByteBuffer.wrap(key),
+              isChunked,
+              reusedRawValue,
+              binaryDecoder,
+              RawBytesStoreDeserializerCache.getInstance(),
+              compressor.get(),
+              valueManifestContainer));
       hostLevelIngestionStats.recordIngestionValueBytesLookUpLatency(
           LatencyUtils.getElapsedTimeFromNSToMS(lookupStartTimeInNS),
           currentTimeForMetricsMs);

clients/da-vinci-client/src/main/java/com/linkedin/davinci/kafka/consumer/KafkaStoreIngestionService.java

Lines changed: 11 additions & 0 deletions
@@ -197,6 +197,8 @@ public class KafkaStoreIngestionService extends AbstractVeniceService implements

   private Lazy<ZKHelixAdmin> zkHelixAdmin;

+  private final ExecutorService aaWCIngestionStorageLookupThreadPool;
+
   public KafkaStoreIngestionService(
       StorageService storageService,
       VeniceConfigLoader veniceConfigLoader,
@@ -456,6 +458,13 @@ public void handleStoreDeleted(Store store) {
       this.aaWCWorkLoadProcessingThreadPool = null;
     }

+    this.aaWCIngestionStorageLookupThreadPool = Executors.newFixedThreadPool(
+        serverConfig.getAaWCIngestionStorageLookupThreadPoolSize(),
+        new DaemonThreadFactory("AA_WC_INGESTION_STORAGE_LOOKUP"));
+    LOGGER.info(
+        "Enabled a thread pool for AA/WC ingestion lookup with {} threads.",
+        serverConfig.getAaWCIngestionStorageLookupThreadPoolSize());
+
     ingestionTaskFactory = StoreIngestionTaskFactory.builder()
         .setVeniceWriterFactory(veniceWriterFactory)
         .setStorageEngineRepository(storageService.getStorageEngineRepository())
@@ -482,6 +491,7 @@ public void handleStoreDeleted(Store store) {
             serverConfig.getIngestionMemoryLimit() > 0 ? () -> killConsumptionTaskForNonCurrentVersions() : null)
         .setHeartbeatMonitoringService(heartbeatMonitoringService)
         .setAAWCWorkLoadProcessingThreadPool(aaWCWorkLoadProcessingThreadPool)
+        .setAAWCIngestionStorageLookupThreadPool(aaWCIngestionStorageLookupThreadPool)
         .build();
   }

@@ -605,6 +615,7 @@ public void stopInner() {
     Utils.closeQuietlyWithErrorLogged(metaStoreWriter);

     shutdownExecutorService(aaWCWorkLoadProcessingThreadPool, "aaWCWorkLoadProcessingThreadPool", true);
+    shutdownExecutorService(aaWCIngestionStorageLookupThreadPool, "aaWCIngestionStorageLookupThreadPool", true);

     kafkaMessageEnvelopeSchemaReader.ifPresent(Utils::closeQuietlyWithErrorLogged);

clients/da-vinci-client/src/main/java/com/linkedin/davinci/kafka/consumer/LeaderFollowerStoreIngestionTask.java

Lines changed: 29 additions & 12 deletions
@@ -102,6 +102,7 @@
 import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.CompletionException;
 import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
@@ -208,6 +209,8 @@ public class LeaderFollowerStoreIngestionTask extends StoreIngestionTask {
   private final Lazy<IngestionBatchProcessor> ingestionBatchProcessingLazy;
   private final Version version;

+  protected final ExecutorService aaWCIngestionStorageLookupThreadPool;
+
   public LeaderFollowerStoreIngestionTask(
       StorageService storageService,
       StoreIngestionTaskFactory.Builder builder,
@@ -363,6 +366,7 @@ public LeaderFollowerStoreIngestionTask(
           builder.getVersionedStorageIngestionStats(),
           getHostLevelIngestionStats());
     });
+    this.aaWCIngestionStorageLookupThreadPool = builder.getAaWCIngestionStorageLookupThreadPool();
   }

   public static VeniceWriter<byte[], byte[], byte[]> constructVeniceWriter(
@@ -3536,18 +3540,19 @@ private GenericRecord readStoredValueRecord(
     if (transientRecord == null) {
       try {
         long lookupStartTimeInNS = System.nanoTime();
-        currValue = GenericRecordChunkingAdapter.INSTANCE.get(
-            storageEngine,
-            topicPartition.getPartitionNumber(),
-            ByteBuffer.wrap(keyBytes),
-            isChunked,
-            null,
-            null,
-            NoOpReadResponseStats.SINGLETON,
-            readerValueSchemaID,
-            storeDeserializerCache,
-            compressor.get(),
-            manifestContainer);
+        currValue = databaseLookupWithConcurrencyLimit(
+            () -> GenericRecordChunkingAdapter.INSTANCE.get(
+                storageEngine,
+                topicPartition.getPartitionNumber(),
+                ByteBuffer.wrap(keyBytes),
+                isChunked,
+                null,
+                null,
+                NoOpReadResponseStats.SINGLETON,
+                readerValueSchemaID,
+                storeDeserializerCache,
+                compressor.get(),
+                manifestContainer));
         hostLevelIngestionStats
             .recordWriteComputeLookUpLatency(LatencyUtils.getElapsedTimeFromNSToMS(lookupStartTimeInNS));
       } catch (Exception e) {
@@ -4026,4 +4031,16 @@ private void maybeQueueCMWritesToVersionTopic(
       produceCall.run();
     }
   }
+
+  <T> T databaseLookupWithConcurrencyLimit(Supplier<T> supplier) {
+    if (serverConfig.isAAWCWorkloadParallelProcessingEnabled()) {
+      try {
+        return aaWCIngestionStorageLookupThreadPool.submit(() -> supplier.get()).get();
+      } catch (InterruptedException | ExecutionException e) {
+        throw new VeniceException(e);
+      }
+    } else {
+      return supplier.get();
+    }
+  }
 }
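
Because each lookup in databaseLookupWithConcurrencyLimit is a synchronous submit-then-get against the dedicated pool, the pool size is a hard upper bound on concurrent storage reads, no matter how many parallel-processing threads are calling in. The standalone snippet below (not part of the commit; all names are illustrative) demonstrates that property with plain java.util.concurrent primitives.

import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

// Standalone demo only -- not Venice code.
public class LookupConcurrencyCapDemo {
  public static void main(String[] args) throws Exception {
    int lookupPoolSize = 4;       // plays the role of the new lookup pool (default 4)
    int processingThreads = 16;   // plays the role of the AA/WC parallel-processing pool
    ExecutorService lookupPool = Executors.newFixedThreadPool(lookupPoolSize);
    ExecutorService processors = Executors.newFixedThreadPool(processingThreads);
    AtomicInteger inFlight = new AtomicInteger();
    AtomicInteger peak = new AtomicInteger();

    Callable<Void> processOneRecord = () -> {
      // The "processing" thread blocks here, so at most lookupPoolSize lookups run at once.
      lookupPool.submit(() -> {
        int now = inFlight.incrementAndGet();
        peak.accumulateAndGet(now, Math::max);
        try {
          Thread.sleep(10); // simulated storage lookup
        } finally {
          inFlight.decrementAndGet();
        }
        return null;
      }).get();
      return null;
    };

    for (int i = 0; i < 200; i++) {
      processors.submit(processOneRecord);
    }
    processors.shutdown();
    processors.awaitTermination(1, TimeUnit.MINUTES);
    lookupPool.shutdown();
    System.out.println("peak concurrent lookups = " + peak.get()); // never more than lookupPoolSize
  }
}

With 16 submitting threads and a 4-thread lookup pool, the printed peak never exceeds 4; lowering that lookup concurrency is what reduced CPU wait in the canary test described in the commit message.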

clients/da-vinci-client/src/main/java/com/linkedin/davinci/kafka/consumer/StoreIngestionTaskFactory.java

Lines changed: 9 additions & 0 deletions
@@ -126,6 +126,7 @@ public static class Builder {
     private PubSubTopicRepository pubSubTopicRepository;
     private Runnable runnableForKillIngestionTasksForNonCurrentVersions;
     private ExecutorService aaWCWorkLoadProcessingThreadPool;
+    private ExecutorService aaWCIngestionStorageLookupThreadPool;

     private interface Setter {
       void apply();
@@ -333,6 +334,14 @@ public Builder setAAWCWorkLoadProcessingThreadPool(ExecutorService executorServi
       return set(() -> this.aaWCWorkLoadProcessingThreadPool = executorService);
     }

+    public Builder setAAWCIngestionStorageLookupThreadPool(ExecutorService executorService) {
+      return set(() -> this.aaWCIngestionStorageLookupThreadPool = executorService);
+    }
+
+    public ExecutorService getAaWCIngestionStorageLookupThreadPool() {
+      return aaWCIngestionStorageLookupThreadPool;
+    }
+
     public ExecutorService getAAWCWorkLoadProcessingThreadPool() {
       return this.aaWCWorkLoadProcessingThreadPool;
     }

clients/da-vinci-client/src/test/java/com/linkedin/davinci/kafka/consumer/ActiveActiveStoreIngestionTaskTest.java

Lines changed: 7 additions & 5 deletions
@@ -520,14 +520,14 @@ public void testReadingChunkedRmdFromStorage() {
     when(ingestionTask.getStorageEngine()).thenReturn(storageEngine);
     when(ingestionTask.getSchemaRepo()).thenReturn(schemaRepository);
     when(ingestionTask.getServerConfig()).thenReturn(serverConfig);
-    when(ingestionTask.getRmdWithValueSchemaByteBufferFromStorage(anyInt(), any(), any(), anyLong()))
-        .thenCallRealMethod();
+    when(ingestionTask.getRmdWithValueSchemaByteBufferFromStorageInternal(anyInt(), any(), any())).thenCallRealMethod();
     when(ingestionTask.isChunked()).thenReturn(true);
     when(ingestionTask.getHostLevelIngestionStats()).thenReturn(mock(HostLevelIngestionStats.class));
     ChunkedValueManifestContainer container = new ChunkedValueManifestContainer();
     when(storageEngine.getReplicationMetadata(partition, ByteBuffer.wrap(topLevelKey1)))
         .thenReturn(expectedNonChunkedValue);
-    byte[] result = ingestionTask.getRmdWithValueSchemaByteBufferFromStorage(partition, key1, container, 0L);
+    byte[] result =
+        ingestionTask.getRmdWithValueSchemaByteBufferFromStorageInternal(partition, key1, container).serialize();
     Assert.assertNotNull(result);
     Assert.assertNull(container.getManifest());
     Assert.assertEquals(result, expectedNonChunkedValue);
@@ -557,7 +557,8 @@ public void testReadingChunkedRmdFromStorage() {
     when(storageEngine.getReplicationMetadata(partition, ByteBuffer.wrap(topLevelKey2)))
         .thenReturn(chunkedManifestBytes.array());
     when(storageEngine.getReplicationMetadata(partition, ByteBuffer.wrap(chunkedKey1InKey2))).thenReturn(chunkedValue1);
-    byte[] result2 = ingestionTask.getRmdWithValueSchemaByteBufferFromStorage(partition, key2, container, 0L);
+    byte[] result2 =
+        ingestionTask.getRmdWithValueSchemaByteBufferFromStorageInternal(partition, key2, container).serialize();
     Assert.assertNotNull(result2);
     Assert.assertNotNull(container.getManifest());
     Assert.assertEquals(container.getManifest().getKeysWithChunkIdSuffix().size(), 1);
@@ -593,7 +594,8 @@ public void testReadingChunkedRmdFromStorage() {
         .thenReturn(chunkedManifestBytes.array());
     when(storageEngine.getReplicationMetadata(partition, ByteBuffer.wrap(chunkedKey1InKey3))).thenReturn(chunkedValue1);
     when(storageEngine.getReplicationMetadata(partition, ByteBuffer.wrap(chunkedKey2InKey3))).thenReturn(chunkedValue2);
-    byte[] result3 = ingestionTask.getRmdWithValueSchemaByteBufferFromStorage(partition, key3, container, 0L);
+    byte[] result3 =
+        ingestionTask.getRmdWithValueSchemaByteBufferFromStorageInternal(partition, key3, container).serialize();
     Assert.assertNotNull(result3);
     Assert.assertNotNull(container.getManifest());
     Assert.assertEquals(container.getManifest().getKeysWithChunkIdSuffix().size(), 2);

clients/da-vinci-client/src/test/java/com/linkedin/davinci/kafka/consumer/KafkaStoreIngestionServiceTest.java

Lines changed: 1 addition & 0 deletions
@@ -131,6 +131,7 @@ private void setupMockConfig() {
     doReturn(Object2IntMaps.emptyMap()).when(mockVeniceServerConfig).getKafkaClusterUrlToIdMap();
     doReturn(KafkaConsumerServiceDelegator.ConsumerPoolStrategyType.DEFAULT).when(mockVeniceServerConfig)
         .getConsumerPoolStrategyType();
+    doReturn(2).when(mockVeniceServerConfig).getAaWCIngestionStorageLookupThreadPoolSize();

     // Consumer related configs for preparing kafka consumer service.
     doReturn(dummyKafkaUrl).when(mockVeniceServerConfig).getKafkaBootstrapServers();

clients/da-vinci-client/src/test/java/com/linkedin/davinci/kafka/consumer/StoreIngestionTaskTest.java

Lines changed: 3 additions & 1 deletion
@@ -1127,7 +1127,9 @@ private StoreIngestionTaskFactory.Builder getIngestionTaskFactoryBuilder(
         .setPartitionStateSerializer(partitionStateSerializer)
         .setRunnableForKillIngestionTasksForNonCurrentVersions(runnableForKillNonCurrentVersion)
         .setAAWCWorkLoadProcessingThreadPool(
-            Executors.newFixedThreadPool(2, new DaemonThreadFactory("AA_WC_PARALLEL_PROCESSING")));
+            Executors.newFixedThreadPool(2, new DaemonThreadFactory("AA_WC_PARALLEL_PROCESSING")))
+        .setAAWCIngestionStorageLookupThreadPool(
+            Executors.newFixedThreadPool(1, new DaemonThreadFactory("AA_WC_INGESTION_STORAGE_LOOKUP")));
   }

   abstract KafkaConsumerService.ConsumerAssignmentStrategy getConsumerAssignmentStrategy();

internal/venice-common/src/main/java/com/linkedin/venice/ConfigKeys.java

Lines changed: 7 additions & 0 deletions
@@ -2379,6 +2379,13 @@ private ConfigKeys() {
       "server.aa.wc.workload.parallel.processing.thread.pool.size";
   public static final String SERVER_GLOBAL_RT_DIV_ENABLED = "server.global.rt.div.enabled";

+  /**
+   * This config is used to control the RocksDB lookup concurrency when handling AA/WC workload with parallel processing enabled.
+   * Check {@link #SERVER_AA_WC_WORKLOAD_PARALLEL_PROCESSING_ENABLED} for more details.
+   */
+  public static final String SERVER_AA_WC_INGESTION_STORAGE_LOOKUP_THREAD_POOL_SIZE =
+      "server.aa.wc.ingestion.storage.lookup.thread.pool.size";
+
   /**
    * Whether to enable producer throughput optimization for realtime workload or not.
    * Two strategies:
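
For operators, the new knob is tuned independently of the parallel-processing pool size. A hedged sketch of what the relevant server properties might look like is below; only the two key strings and the default of 4 come from this commit, while the concrete values and the way properties are loaded into VeniceServerConfig depend on the deployment.

import java.util.Properties;

// Illustrative only: example values, not recommendations.
public class IngestionLookupPoolProperties {
  public static Properties example() {
    Properties props = new Properties();
    // Existing knob: concurrency of AA/WC parallel processing.
    props.setProperty("server.aa.wc.workload.parallel.processing.thread.pool.size", "8");
    // New in this commit: caps concurrent ingestion storage (value/RMD) lookups.
    // Defaults to 4 when unset; the commit message reports that a lower value
    // reduced CPU wait in a canary test.
    props.setProperty("server.aa.wc.ingestion.storage.lookup.thread.pool.size", "2");
    return props;
  }
}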
