[server][davinci] Implement Heartbeat Monitoring Service (#834)
* [server][davinci] Implement Heartbeat Monitoring Service

This PR implements the heartbeat monitoring service.

The service operates on two ends: it is notified of partitions that should
be monitored (as leader or follower), and it is notified when heartbeats are
consumed (in the context of leader or follower consumption).

Stats are then reported and updated from a monitoring thread that is separate
from the ingestion tasks. The purpose of this is to make the monitoring a true
catch-all. For a hybrid store that is assigned Leader/Follower in Helix, there
is no acceptable situation where the server should not be consuming records for
a prolonged duration. Ingestion tasks may die, get paused, or be recycled. That
is fine, but if Helix says we should be up, then we'll report based on that.
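
For illustration only (not code from this commit): a minimal, self-contained sketch
of the pattern described above. The method names mirror the ones this commit adds
(addLeaderLagMonitor, addFollowerLagMonitor, removeLagMonitor, recordLeaderHeartbeat,
recordFollowerHeartbeat), but the class name, the string keying, and the stdout
reporting are simplifications made up for the sketch; the real service keys on
Version objects, tracks per-region timestamps, and reports through server stats.

```java
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

/**
 * Sketch of the heartbeat monitoring pattern: Helix state transitions register/deregister
 * partitions to watch, the ingestion path records the latest consumed heartbeat timestamp,
 * and a dedicated thread (independent of ingestion tasks) periodically reports lag.
 */
public class HeartbeatLagSketch implements AutoCloseable {
  // Key: "storeName_vVERSION-PARTITION"; value: timestamp of the last heartbeat consumed.
  private final Map<String, Long> leaderHeartbeats = new ConcurrentHashMap<>();
  private final Map<String, Long> followerHeartbeats = new ConcurrentHashMap<>();
  private final ScheduledExecutorService reporter = Executors.newSingleThreadScheduledExecutor();

  public HeartbeatLagSketch(long reportIntervalMs) {
    // The reporting thread is deliberately separate from ingestion, so lag keeps being
    // reported even if an ingestion task dies, gets paused, or is recycled.
    reporter.scheduleAtFixedRate(this::reportLag, reportIntervalMs, reportIntervalMs, TimeUnit.MILLISECONDS);
  }

  // Called on the OFFLINE->STANDBY and LEADER->STANDBY transitions.
  public void addFollowerLagMonitor(String store, int version, int partition) {
    String k = key(store, version, partition);
    followerHeartbeats.putIfAbsent(k, System.currentTimeMillis());
    leaderHeartbeats.remove(k);
  }

  // Called on the STANDBY->LEADER transition.
  public void addLeaderLagMonitor(String store, int version, int partition) {
    String k = key(store, version, partition);
    leaderHeartbeats.putIfAbsent(k, System.currentTimeMillis());
    followerHeartbeats.remove(k);
  }

  // Called on the STANDBY->OFFLINE transition.
  public void removeLagMonitor(String store, int version, int partition) {
    String k = key(store, version, partition);
    leaderHeartbeats.remove(k);
    followerHeartbeats.remove(k);
  }

  // Called from the ingestion path when a heartbeat control message is consumed as leader.
  public void recordLeaderHeartbeat(String store, int version, int partition, long producerTimestamp) {
    leaderHeartbeats.computeIfPresent(key(store, version, partition), (k, old) -> Math.max(old, producerTimestamp));
  }

  // Called from the ingestion path when a heartbeat control message is consumed as follower.
  public void recordFollowerHeartbeat(String store, int version, int partition, long producerTimestamp) {
    followerHeartbeats.computeIfPresent(key(store, version, partition), (k, old) -> Math.max(old, producerTimestamp));
  }

  private void reportLag() {
    long now = System.currentTimeMillis();
    leaderHeartbeats.forEach((k, ts) -> System.out.printf("leader   %s lag=%dms%n", k, now - ts));
    followerHeartbeats.forEach((k, ts) -> System.out.printf("follower %s lag=%dms%n", k, now - ts));
  }

  private static String key(String store, int version, int partition) {
    return store + "_v" + version + "-" + partition;
  }

  @Override
  public void close() {
    reporter.shutdownNow();
  }
}
```

In this sketch, a STANDBY transition registers a follower lag monitor, the ingestion path
records a timestamp for each consumed heartbeat, and the reporter thread keeps emitting
lag even when ingestion stalls, which is the "catch-all" behavior described above.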
ZacAttack authored Feb 16, 2024
1 parent 4115afc commit e0e56d3
Showing 25 changed files with 782 additions and 49 deletions.
@@ -258,7 +258,9 @@ public DaVinciBackend(
// TODO: consider how/if a repair task would be valid for Davinci users?
null,
pubSubClientsFactory,
Optional.empty());
Optional.empty(),
// TODO: It would be good to monitor heartbeats like this from davinci, but needs some work
null);

ingestionService.start();
ingestionService.addIngestionNotifier(ingestionListener);
@@ -12,6 +12,7 @@
import com.linkedin.davinci.notifier.PushMonitorNotifier;
import com.linkedin.davinci.notifier.VeniceNotifier;
import com.linkedin.davinci.stats.ParticipantStateTransitionStats;
import com.linkedin.davinci.stats.ingestion.heartbeat.HeartbeatMonitoringService;
import com.linkedin.davinci.storage.StorageMetadataService;
import com.linkedin.davinci.storage.StorageService;
import com.linkedin.venice.common.VeniceSystemStoreType;
@@ -93,6 +94,7 @@ public class HelixParticipationService extends AbstractVeniceService
private HelixPartitionStatusAccessor partitionPushStatusAccessor;
private ThreadPoolExecutor leaderFollowerHelixStateTransitionThreadPool;
private VeniceOfflinePushMonitorAccessor veniceOfflinePushMonitorAccessor;
private final HeartbeatMonitoringService heartbeatMonitoringService;

// This is ONLY for testing purpose.
public ThreadPoolExecutor getLeaderFollowerHelixStateTransitionThreadPool() {
@@ -111,10 +113,12 @@ public HelixParticipationService(
String clusterName,
int port,
String hostname,
CompletableFuture<SafeHelixManager> managerFuture) {
CompletableFuture<SafeHelixManager> managerFuture,
HeartbeatMonitoringService heartbeatMonitoringService) {
this.ingestionService = storeIngestionService;
this.storageService = storageService;
this.clusterName = clusterName;
this.heartbeatMonitoringService = heartbeatMonitoringService;
// The format of instance name must be "$host_$port", otherwise Helix can not get these information correctly.
this.participantName = Utils.getHelixNodeIdentifier(hostname, port);
this.zkAddress = zkAddress;
@@ -194,7 +198,8 @@ public boolean startInner() {
futureVersionStateTransitionStats,
helixReadOnlyStoreRepository,
partitionPushStatusAccessorFuture,
instance.getNodeId());
instance.getNodeId(),
heartbeatMonitoringService);
} else {
leaderFollowerParticipantModelFactory = new LeaderFollowerPartitionStateModelFactory(
ingestionBackend,
@@ -203,7 +208,8 @@
stateTransitionStats,
helixReadOnlyStoreRepository,
partitionPushStatusAccessorFuture,
instance.getNodeId());
instance.getNodeId(),
heartbeatMonitoringService);
}
LOGGER.info(
"LeaderFollower threadPool info: strategy = {}, max future state transition thread = {}",
@@ -4,12 +4,14 @@
import com.linkedin.davinci.ingestion.VeniceIngestionBackend;
import com.linkedin.davinci.kafka.consumer.LeaderFollowerStoreIngestionTask;
import com.linkedin.davinci.stats.ParticipantStateTransitionStats;
import com.linkedin.davinci.stats.ingestion.heartbeat.HeartbeatMonitoringService;
import com.linkedin.venice.common.VeniceSystemStoreUtils;
import com.linkedin.venice.exceptions.VeniceException;
import com.linkedin.venice.exceptions.VeniceNoStoreException;
import com.linkedin.venice.helix.HelixPartitionStatusAccessor;
import com.linkedin.venice.helix.HelixState;
import com.linkedin.venice.meta.ReadOnlyStoreRepository;
import com.linkedin.venice.meta.Store;
import com.linkedin.venice.meta.Version;
import com.linkedin.venice.utils.LatencyUtils;
import java.util.concurrent.CompletableFuture;
@@ -57,6 +59,8 @@ public class LeaderFollowerPartitionStateModel extends AbstractPartitionStateMod
private final LeaderFollowerIngestionProgressNotifier notifier;
private final ParticipantStateTransitionStats threadPoolStats;

private final HeartbeatMonitoringService heartbeatMonitoringService;

public LeaderFollowerPartitionStateModel(
VeniceIngestionBackend ingestionBackend,
VeniceStoreVersionConfig storeAndServerConfigs,
@@ -65,7 +69,8 @@ public LeaderFollowerPartitionStateModel(
ReadOnlyStoreRepository metadataRepo,
CompletableFuture<HelixPartitionStatusAccessor> partitionPushStatusAccessorFuture,
String instanceName,
ParticipantStateTransitionStats threadPoolStats) {
ParticipantStateTransitionStats threadPoolStats,
HeartbeatMonitoringService heartbeatMonitoringService) {
super(
ingestionBackend,
metadataRepo,
@@ -75,6 +80,7 @@
instanceName);
this.notifier = notifier;
this.threadPoolStats = threadPoolStats;
this.heartbeatMonitoringService = heartbeatMonitoringService;
}

@Transition(to = HelixState.STANDBY_STATE, from = HelixState.OFFLINE_STATE)
@@ -83,8 +89,9 @@ public void onBecomeStandbyFromOffline(Message message, NotificationContext cont
String resourceName = message.getResourceName();
String storeName = Version.parseStoreFromKafkaTopicName(resourceName);
int version = Version.parseVersionFromKafkaTopicName(resourceName);
boolean isRegularStoreCurrentVersion = getStoreRepo().getStoreOrThrow(storeName).getCurrentVersion() == version
&& !VeniceSystemStoreUtils.isSystemStore(storeName);
Store store = getStoreRepo().getStoreOrThrow(storeName);
boolean isRegularStoreCurrentVersion =
store.getCurrentVersion() == version && !VeniceSystemStoreUtils.isSystemStore(storeName);

/**
* For regular store current version, firstly create a latch, then start ingestion and wait for ingestion
@@ -97,6 +104,7 @@ public void onBecomeStandbyFromOffline(Message message, NotificationContext cont
try {
long startTimeForSettingUpNewStorePartitionInNs = System.nanoTime();
setupNewStorePartition();
heartbeatMonitoringService.addFollowerLagMonitor(store.getVersion(version).get(), getPartition());
logger.info(
"Completed setting up new store partition for {} partition {}. Total elapsed time: {} ms",
resourceName,
@@ -118,6 +126,11 @@ public void onBecomeStandbyFromOffline(Message message, NotificationContext cont
@Transition(to = HelixState.LEADER_STATE, from = HelixState.STANDBY_STATE)
public void onBecomeLeaderFromStandby(Message message, NotificationContext context) {
LeaderSessionIdChecker checker = new LeaderSessionIdChecker(leaderSessionId.incrementAndGet(), leaderSessionId);
String resourceName = message.getResourceName();
String storeName = Version.parseStoreFromKafkaTopicName(resourceName);
int version = Version.parseVersionFromKafkaTopicName(resourceName);
Store store = getStoreRepo().getStoreOrThrow(storeName);
heartbeatMonitoringService.addLeaderLagMonitor(store.getVersion(version).get(), getPartition());
executeStateTransition(
message,
context,
@@ -127,6 +140,11 @@ public void onBecomeLeaderFromStandby(Message message, NotificationContext conte
@Transition(to = HelixState.STANDBY_STATE, from = HelixState.LEADER_STATE)
public void onBecomeStandbyFromLeader(Message message, NotificationContext context) {
LeaderSessionIdChecker checker = new LeaderSessionIdChecker(leaderSessionId.incrementAndGet(), leaderSessionId);
String resourceName = message.getResourceName();
String storeName = Version.parseStoreFromKafkaTopicName(resourceName);
int version = Version.parseVersionFromKafkaTopicName(resourceName);
Store store = getStoreRepo().getStoreOrThrow(storeName);
heartbeatMonitoringService.addFollowerLagMonitor(store.getVersion(version).get(), getPartition());
executeStateTransition(
message,
context,
@@ -135,6 +153,11 @@ public void onBecomeStandbyFromLeader(Message message, NotificationContext conte

@Transition(to = HelixState.OFFLINE_STATE, from = HelixState.STANDBY_STATE)
public void onBecomeOfflineFromStandby(Message message, NotificationContext context) {
String resourceName = message.getResourceName();
String storeName = Version.parseStoreFromKafkaTopicName(resourceName);
int version = Version.parseVersionFromKafkaTopicName(resourceName);
Store store = getStoreRepo().getStoreOrThrow(storeName);
heartbeatMonitoringService.removeLagMonitor(store.getVersion(version).get(), getPartition());
executeStateTransition(message, context, () -> stopConsumption(true));
}

@@ -3,6 +3,7 @@
import com.linkedin.davinci.config.VeniceConfigLoader;
import com.linkedin.davinci.ingestion.VeniceIngestionBackend;
import com.linkedin.davinci.stats.ParticipantStateTransitionStats;
import com.linkedin.davinci.stats.ingestion.heartbeat.HeartbeatMonitoringService;
import com.linkedin.venice.helix.HelixPartitionStatusAccessor;
import com.linkedin.venice.meta.ReadOnlyStoreRepository;
import com.linkedin.venice.utils.Utils;
@@ -24,15 +25,17 @@ public LeaderFollowerPartitionStateModelDualPoolFactory(
ParticipantStateTransitionStats futureVersionStateTransitionStats,
ReadOnlyStoreRepository metadataRepo,
CompletableFuture<HelixPartitionStatusAccessor> partitionPushStatusAccessorFuture,
String instanceName) {
String instanceName,
HeartbeatMonitoringService heartbeatMonitoringService) {
super(
ingestionBackend,
configService,
executorService,
stateTransitionStats,
metadataRepo,
partitionPushStatusAccessorFuture,
instanceName);
instanceName,
heartbeatMonitoringService);
this.futureVersionExecutorService = futureVersionExecutorService;
this.futureVersionStateTransitionStats = futureVersionStateTransitionStats;
}
@@ -3,6 +3,7 @@
import com.linkedin.davinci.config.VeniceConfigLoader;
import com.linkedin.davinci.ingestion.VeniceIngestionBackend;
import com.linkedin.davinci.stats.ParticipantStateTransitionStats;
import com.linkedin.davinci.stats.ingestion.heartbeat.HeartbeatMonitoringService;
import com.linkedin.venice.helix.HelixPartitionStatusAccessor;
import com.linkedin.venice.meta.ReadOnlyStoreRepository;
import com.linkedin.venice.utils.HelixUtils;
@@ -17,6 +18,7 @@
public class LeaderFollowerPartitionStateModelFactory extends AbstractStateModelFactory {
private final LeaderFollowerIngestionProgressNotifier leaderFollowerStateModelNotifier =
new LeaderFollowerIngestionProgressNotifier();
private final HeartbeatMonitoringService heartbeatMonitoringService;

public LeaderFollowerPartitionStateModelFactory(
VeniceIngestionBackend ingestionBackend,
@@ -25,7 +27,8 @@ public LeaderFollowerPartitionStateModelFactory(
ParticipantStateTransitionStats stateTransitionStats,
ReadOnlyStoreRepository metadataRepo,
CompletableFuture<HelixPartitionStatusAccessor> partitionPushStatusAccessorFuture,
String instanceName) {
String instanceName,
HeartbeatMonitoringService heartbeatMonitoringService) {
super(
ingestionBackend,
configService,
@@ -34,6 +37,7 @@ public LeaderFollowerPartitionStateModelFactory(
metadataRepo,
partitionPushStatusAccessorFuture,
instanceName);
this.heartbeatMonitoringService = heartbeatMonitoringService;

// Add a new notifier to let state model knows ingestion has caught up the lag so that it can complete the offline
// to standby state transition.
@@ -52,7 +56,8 @@ public LeaderFollowerPartitionStateModel createNewStateModel(String resourceName
getStoreMetadataRepo(),
partitionPushStatusAccessorFuture,
instanceName,
getStateTransitionStats(resourceName));
getStateTransitionStats(resourceName),
heartbeatMonitoringService);
}

/**
@@ -19,7 +19,7 @@
* The class also holds latches that can be used in SM in the cases when state transitions
* need to coordinate with ingestion progress.
*/
public abstract class StateModelIngestionProgressNotifier implements VeniceNotifier {
public class StateModelIngestionProgressNotifier implements VeniceNotifier {
private final Logger logger = LogManager.getLogger(this.getClass());
private final Map<String, CountDownLatch> stateModelToIngestionCompleteFlagMap = new VeniceConcurrentHashMap<>();
private final Map<String, Boolean> stateModelToSuccessMap = new VeniceConcurrentHashMap<>();
@@ -782,7 +782,8 @@ private void initializeIsolatedIngestionServer() {
isDaVinciClient,
repairService,
pubSubClientsFactory,
sslFactory);
sslFactory,
null);
storeIngestionService.start();
storeIngestionService.addIngestionNotifier(new IsolatedIngestionNotifier(this));
ingestionBackend = new DefaultIngestionBackend(storageMetadataService, storeIngestionService, storageService);
@@ -30,6 +30,7 @@
import com.linkedin.davinci.stats.AggVersionedIngestionStats;
import com.linkedin.davinci.stats.ParticipantStoreConsumptionStats;
import com.linkedin.davinci.stats.StoreBufferServiceStats;
import com.linkedin.davinci.stats.ingestion.heartbeat.HeartbeatMonitoringService;
import com.linkedin.davinci.storage.StorageEngineRepository;
import com.linkedin.davinci.storage.StorageMetadataService;
import com.linkedin.davinci.store.cache.backend.ObjectCacheBackend;
@@ -229,7 +230,8 @@ public KafkaStoreIngestionService(
boolean isDaVinciClient,
RemoteIngestionRepairService remoteIngestionRepairService,
PubSubClientsFactory pubSubClientsFactory,
Optional<SSLFactory> sslFactory) {
Optional<SSLFactory> sslFactory,
HeartbeatMonitoringService heartbeatMonitoringService) {
this.cacheBackend = cacheBackend;
this.recordTransformer = recordTransformer;
this.storageMetadataService = storageMetadataService;
@@ -514,6 +516,7 @@ public void handleStoreDeleted(Store store) {
.setPubSubTopicRepository(pubSubTopicRepository)
.setRunnableForKillIngestionTasksForNonCurrentVersions(
serverConfig.getIngestionMemoryLimit() > 0 ? () -> killConsumptionTaskForNonCurrentVersions() : null)
.setHeartbeatMonitoringService(heartbeatMonitoringService)
.build();
}

@@ -23,6 +23,7 @@
import com.linkedin.davinci.ingestion.LagType;
import com.linkedin.davinci.schema.merge.CollectionTimestampMergeRecordHelper;
import com.linkedin.davinci.schema.merge.MergeRecordHelper;
import com.linkedin.davinci.stats.ingestion.heartbeat.HeartbeatMonitoringService;
import com.linkedin.davinci.storage.chunking.ChunkedValueManifestContainer;
import com.linkedin.davinci.storage.chunking.ChunkingAdapter;
import com.linkedin.davinci.storage.chunking.GenericRecordChunkingAdapter;
@@ -155,6 +156,8 @@ public class LeaderFollowerStoreIngestionTask extends StoreIngestionTask {
private final Set<String> nativeReplicationSourceVersionTopicKafkaURLSingletonSet;
private final VeniceWriterFactory veniceWriterFactory;

private final HeartbeatMonitoringService heartbeatMonitoringService;

/**
* Leader must maintain producer DIV states separate from drainers, because leader is always ahead of drainer;
* if leader and drainer share the same DIV validator, leader will pollute the data in shared DIV validator;
@@ -209,6 +212,7 @@ public LeaderFollowerStoreIngestionTask(
cacheBackend,
recordTransformer,
builder.getLeaderFollowerNotifiers());
this.heartbeatMonitoringService = builder.getHeartbeatMonitoringService();
/**
* We are going to apply fast leader failover for per user store system store since it is time sensitive, and if the
* split-brain problem happens in prod, we could design a way to periodically produce snapshot to the meta system
@@ -2089,6 +2093,36 @@ private void propagateHeartbeatFromUpstreamTopicToLocalVersionTopic(
}
}

@Override
protected void recordHeartbeatReceived(
PartitionConsumptionState partitionConsumptionState,
PubSubMessage<KafkaKey, KafkaMessageEnvelope, Long> consumerRecord,
String kafkaUrl) {
if (heartbeatMonitoringService == null) {
// Not enabled!
return;
}

if (partitionConsumptionState.getLeaderFollowerState().equals(LEADER)) {
for (int subPartition: PartitionUtils
.getSubPartitions(partitionConsumptionState.getUserPartition(), amplificationFactor)) {
heartbeatMonitoringService.recordLeaderHeartbeat(
storeName,
versionNumber,
subPartition,
serverConfig.getKafkaClusterUrlToAliasMap().get(kafkaUrl),
consumerRecord.getValue().producerMetadata.messageTimestamp);
}
} else {
heartbeatMonitoringService.recordFollowerHeartbeat(
storeName,
versionNumber,
partitionConsumptionState.getUserPartition(),
serverConfig.getKafkaClusterUrlToAliasMap().get(kafkaUrl),
consumerRecord.getValue().producerMetadata.messageTimestamp);
}
}

/**
* The goal of this function is to possibly produce the incoming kafka message consumed from local VT, remote VT, RT or SR topic to
* local VT if needed. It's decided based on the function output of {@link #shouldProduceToVersionTopic} and message type.
@@ -7,6 +7,7 @@
import static com.linkedin.davinci.kafka.consumer.ConsumerActionType.UNSUBSCRIBE;
import static com.linkedin.venice.ConfigKeys.KAFKA_BOOTSTRAP_SERVERS;
import static com.linkedin.venice.LogMessages.KILLED_JOB_MESSAGE;
import static com.linkedin.venice.kafka.protocol.enums.ControlMessageType.START_OF_SEGMENT;
import static java.util.concurrent.TimeUnit.HOURS;
import static java.util.concurrent.TimeUnit.MILLISECONDS;
import static java.util.concurrent.TimeUnit.MINUTES;
@@ -2254,6 +2255,13 @@ public void processConsumerRecord(
}
}

protected void recordHeartbeatReceived(
PartitionConsumptionState partitionConsumptionState,
PubSubMessage<KafkaKey, KafkaMessageEnvelope, Long> consumerRecord,
String kafkaUrl) {
// No Op
}

/**
* Retrieve current LeaderFollowerState from partition's PCS. This method is used by IsolatedIngestionServer to sync
* user-partition LeaderFollower status from child process to parent process in ingestion isolation.
@@ -2296,8 +2304,7 @@ private boolean shouldSyncOffset(
* TODO: if we know some other types of Control Messages are frequent as START_OF_SEGMENT and END_OF_SEGMENT in the future,
* we need to consider to exclude them to avoid the issue described above.
*/
if (controlMessageType != ControlMessageType.START_OF_SEGMENT
&& controlMessageType != ControlMessageType.END_OF_SEGMENT) {
if (controlMessageType != START_OF_SEGMENT && controlMessageType != ControlMessageType.END_OF_SEGMENT) {
syncOffset = true;
}
} else {
@@ -2774,6 +2781,14 @@ private int internalProcessConsumerRecord(
consumerRecord.getTopicPartition().getPartitionNumber(),
consumerRecord.getOffset(),
partitionConsumptionState);
try {
if (controlMessage.controlMessageType == START_OF_SEGMENT.getValue()
&& Arrays.equals(consumerRecord.getKey().getKey(), KafkaKey.HEART_BEAT.getKey())) {
recordHeartbeatReceived(partitionConsumptionState, consumerRecord, kafkaUrl);
}
} catch (Exception e) {
LOGGER.error("Failed to record Record heartbeat with message: ", e);
}
} else {
sizeOfPersistedData = processKafkaDataMessage(
consumerRecord,
@@ -3787,7 +3802,7 @@ protected void recordProcessedRecordStats(
}

protected boolean isSegmentControlMsg(ControlMessageType msgType) {
return ControlMessageType.START_OF_SEGMENT.equals(msgType) || ControlMessageType.END_OF_SEGMENT.equals(msgType);
return START_OF_SEGMENT.equals(msgType) || ControlMessageType.END_OF_SEGMENT.equals(msgType);
}

/**
