@@ -65,7 +65,7 @@ void HealthcheckService::onPingSent(int serverId, int pingId)
6565 return ;
6666 }
6767 m_remainingPings.insert (serverId);
68- m_failedPings.try_emplace (serverId, 0 );
68+ m_failedPings.try_emplace (serverId, std::set< int >{} );
6969}
7070
7171void HealthcheckService::onPingFailed (int serverId, int pingId)
@@ -86,7 +86,7 @@ void HealthcheckService::onPingFailed(int serverId, int pingId)
8686 {
8787 m_sessionServerAppManager.onSessionServerStateChanged (serverId,
8888 firebolt::rialto::common::SessionServerState::ERROR);
89- m_failedPings.emplace (serverId, 1 );
89+ m_failedPings.emplace (serverId, std::set< int >{pingId} );
9090 }
9191}
9292
@@ -95,15 +95,26 @@ void HealthcheckService::onAckReceived(int serverId, int pingId, bool success)
9595 std::unique_lock<std::mutex> lock{m_mutex};
9696 if (pingId != m_currentPingId)
9797 {
98- RIALTO_SERVER_MANAGER_LOG_WARN (" Unexpected ack received from server id: %d. Current ping id: %d, received ping "
99- " id: %d" ,
100- serverId, m_currentPingId, pingId);
98+ if (success && m_failedPings[serverId].find (pingId) != m_failedPings[serverId].end ())
99+ {
100+ RIALTO_SERVER_MANAGER_LOG_WARN (" Late ack received for server id: %d, Current ping id: %d, received ping "
101+ " id: %d. Removing from failed pings list" ,
102+ serverId, m_currentPingId, pingId);
103+ m_failedPings[serverId].erase (pingId);
104+ }
105+ else
106+ {
107+ RIALTO_SERVER_MANAGER_LOG_ERROR (" Unexpected ack received from server id: %d. Current ping id: %d, received "
108+ " ping "
109+ " id: %d" ,
110+ serverId, m_currentPingId, pingId);
111+ }
101112 return ;
102113 }
103114 m_remainingPings.erase (serverId);
104115 if (success)
105116 {
106- m_failedPings[serverId] = 0 ;
117+ m_failedPings[serverId]. clear () ;
107118 }
108119 else
109120 {
@@ -136,12 +147,13 @@ void HealthcheckService::sendPing()
136147void HealthcheckService::handleError (int serverId)
137148{
138149 m_sessionServerAppManager.onSessionServerStateChanged (serverId, firebolt::rialto::common::SessionServerState::ERROR);
139- unsigned &failedPingsNum{m_failedPings[serverId]};
140- if (++failedPingsNum >= m_kNumOfFailedPingsBeforeRecovery)
150+ auto &failedPings{m_failedPings[serverId]};
151+ failedPings.insert (m_currentPingId);
152+ if (failedPings.size () >= m_kNumOfFailedPingsBeforeRecovery)
141153 {
142154 RIALTO_SERVER_MANAGER_LOG_WARN (
143155 " Max num of failed pings reached for server with id: %d. Starting recovery action" , serverId);
144- failedPingsNum = 0 ;
156+ failedPings. clear () ;
145157 m_sessionServerAppManager.restartServer (serverId);
146158 }
147159}
0 commit comments