import com.linkedin.venice.pubsub.api.PubSubTopicPartition;
import com.linkedin.venice.utils.ExceptionUtils;
import com.linkedin.venice.utils.LatencyUtils;
+ import com.linkedin.venice.utils.RedundantExceptionFilter;
import com.linkedin.venice.utils.Utils;
import com.linkedin.venice.utils.concurrent.VeniceConcurrentHashMap;
import io.tehuti.metrics.MetricConfig;
@@ -19,6 +20,7 @@
import java.util.List;
import java.util.Map;
import java.util.Set;
+ import java.util.function.Function;
import java.util.function.IntConsumer;
import java.util.function.Supplier;
import org.apache.logging.log4j.LogManager;
@@ -49,6 +51,8 @@ class ConsumptionTask implements Runnable {
      new VeniceConcurrentHashMap<>();
  private final long readCycleDelayMs;
  private final Supplier<Map<PubSubTopicPartition, List<PubSubMessage<KafkaKey, KafkaMessageEnvelope, Long>>>> pollFunction;
+
+  private final Function<PubSubTopicPartition, Long> offsetLagGetter;
  private final IntConsumer bandwidthThrottler;
  private final IntConsumer recordsThrottler;
  private final AggKafkaConsumerServiceStats aggStats;
@@ -61,6 +65,8 @@ class ConsumptionTask implements Runnable {
   */
  private final Map<PubSubTopicPartition, Rate> messageRatePerTopicPartition = new VeniceConcurrentHashMap<>();
  private final Map<PubSubTopicPartition, Rate> bytesRatePerTopicPartition = new VeniceConcurrentHashMap<>();
+  private final Map<PubSubTopicPartition, Rate> pollRatePerTopicPartition = new VeniceConcurrentHashMap<>();
+  private final RedundantExceptionFilter redundantExceptionFilter;
  private final Map<PubSubTopicPartition, Long> lastSuccessfulPollTimestampPerTopicPartition =
      new VeniceConcurrentHashMap<>();
@@ -87,14 +93,18 @@ public ConsumptionTask(
      final IntConsumer bandwidthThrottler,
      final IntConsumer recordsThrottler,
      final AggKafkaConsumerServiceStats aggStats,
-      final ConsumerSubscriptionCleaner cleaner) {
+      final ConsumerSubscriptionCleaner cleaner,
+      Function<PubSubTopicPartition, Long> offsetLagGetter,
+      RedundantExceptionFilter redundantExceptionFilter) {
    this.readCycleDelayMs = readCycleDelayMs;
    this.pollFunction = pollFunction;
    this.bandwidthThrottler = bandwidthThrottler;
    this.recordsThrottler = recordsThrottler;
    this.aggStats = aggStats;
    this.cleaner = cleaner;
    this.taskId = taskId;
+    this.offsetLagGetter = offsetLagGetter;
+    this.redundantExceptionFilter = redundantExceptionFilter;
    this.consumptionTaskIdStr = Utils.getSanitizedStringForLogger(consumerNamePrefix) + " - " + taskId;
    this.LOGGER = LogManager.getLogger(getClass().getSimpleName() + "[ " + consumptionTaskIdStr + " ]");
  }
@@ -180,8 +190,11 @@ public void run() {
          bytesRatePerTopicPartition
              .computeIfAbsent(pubSubTopicPartition, tp -> createRate(lastSuccessfulPollTimestamp))
              .record(payloadSizePerTopicPartition, lastSuccessfulPollTimestamp);
-
+          pollRatePerTopicPartition
+              .computeIfAbsent(pubSubTopicPartition, tp -> createRate(lastSuccessfulPollTimestamp))
+              .record(1, lastSuccessfulPollTimestamp);
          consumedDataReceiver.write(topicPartitionMessages);
+          checkSlowPartitionWithHighLag(pubSubTopicPartition);
        }
        aggStats.recordTotalConsumerRecordsProducingToWriterBufferLatency(
            LatencyUtils.getElapsedTimeFromMsToMs(beforeProducingToWriteBufferTimestamp));
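Note on the new per-partition poll rate: each successful poll records a sample of 1 into pollRatePerTopicPartition, so measuring that Rate later yields polls per second for the partition, the same way bytesRatePerTopicPartition yields bytes per second. A minimal stand-alone sketch of the windowed-rate idea (illustrative only, not Tehuti's actual Rate implementation):

import java.util.ArrayDeque;
import java.util.Deque;

// Illustrative sliding-window rate: record(1, now) per poll, then measure(now)
// returns polls per second over the window. The real implementation used above
// lives in io.tehuti.metrics.
final class SlidingWindowRate {
  private static final class Sample {
    final long timestampMs;
    final double value;

    Sample(long timestampMs, double value) {
      this.timestampMs = timestampMs;
      this.value = value;
    }
  }

  private final long windowMs;
  private final Deque<Sample> samples = new ArrayDeque<>();
  private double sum = 0.0D;

  SlidingWindowRate(long windowMs) {
    this.windowMs = windowMs;
  }

  synchronized void record(double value, long nowMs) {
    samples.addLast(new Sample(nowMs, value));
    sum += value;
  }

  synchronized double measure(long nowMs) {
    // Drop samples that fell out of the window, then normalize to per-second.
    while (!samples.isEmpty() && nowMs - samples.peekFirst().timestampMs > windowMs) {
      sum -= samples.removeFirst().value;
    }
    return sum / (windowMs / 1000.0);
  }
}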
@@ -278,6 +291,29 @@ Double getByteRate(PubSubTopicPartition topicPartition) {
    return 0.0D;
  }

+  Double getPollRate(PubSubTopicPartition topicPartition) {
+    if (pollRatePerTopicPartition.containsKey(topicPartition)) {
+      return pollRatePerTopicPartition.get(topicPartition).measure(metricConfig, System.currentTimeMillis());
+    }
+    return 0.0D;
+  }
+
+  private void checkSlowPartitionWithHighLag(PubSubTopicPartition pubSubTopicPartition) {
+    Long offsetLag = offsetLagGetter.apply(pubSubTopicPartition);
+    Double messageRate = getMessageRate(pubSubTopicPartition);
+    Double pollRate = getPollRate(pubSubTopicPartition);
+    String slowTaskWithPartitionStr = consumptionTaskIdStr + " - " + pubSubTopicPartition;
+    if (offsetLag > 200000 && messageRate < 200
+        && !redundantExceptionFilter.isRedundantException(slowTaskWithPartitionStr)) {
+      LOGGER.warn(
+          "Slow partition with high lag detected: {}. Lag: {}, Message Rate: {}, Poll Rate: {}",
+          pubSubTopicPartition,
+          offsetLag,
+          messageRate,
+          pollRate);
+    }
+  }
+
  PubSubTopic getDestinationIdentifier(PubSubTopicPartition topicPartition) {
    ConsumedDataReceiver<List<PubSubMessage<KafkaKey, KafkaMessageEnvelope, Long>>> dataReceiver =
        dataReceiverMap.get(topicPartition);
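The isRedundantException(...) guard above is what keeps the warning from firing on every poll once a partition crosses the lag and message-rate thresholds. A minimal sketch of that de-duplication idea (an illustrative stand-in with hypothetical names, not Venice's actual RedundantExceptionFilter): remember when each key was last reported and suppress repeats until a time window passes.

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

// Illustrative only: suppress repeated reports of the same key within a fixed
// time window, mirroring how isRedundantException(...) rate-limits the
// "slow partition with high lag" warning above.
final class WindowedDuplicateFilter {
  private final long windowMs;
  private final Map<String, Long> lastReportedMs = new ConcurrentHashMap<>();

  WindowedDuplicateFilter(long windowMs) {
    this.windowMs = windowMs;
  }

  /** Returns true if the key was already reported within the current window. */
  boolean isRedundant(String key) {
    long now = System.currentTimeMillis();
    Long previous = lastReportedMs.get(key);
    if (previous != null && now - previous < windowMs) {
      return true; // reported recently; the caller should skip logging
    }
    lastReportedMs.put(key, now);
    return false; // first report in this window; the caller may log
  }
}

With a shared filter like this, a partition that stays slow produces one warning per window instead of one per poll cycle.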