Skip to content

Commit ae413b0

Browse files
Do not treat server as deadlocked when outdated acks are received (#495)
Summary: Do not treat server as deadlocked when outdated acks are received Type: Fix Test Plan: UT/CT, Fullstack Jira: RDKEMW-18258
1 parent cad26f1 commit ae413b0

4 files changed

Lines changed: 42 additions & 10 deletions

File tree

media/server/gstplayer/source/tasks/generic/Flush.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ void Flush::execute() const
7979
m_player.stopPositionReportingAndCheckAudioUnderflowTimer();
8080
m_context.flushOnPrerollController->waitIfRequired(m_type);
8181

82+
RIALTO_SERVER_LOG_MIL("Sending flush event for %s source.", common::convertMediaSourceType(m_type));
8283
// Flush source
8384
GstEvent *flushStart = m_gstWrapper->gstEventNewFlushStart();
8485
if (!m_gstWrapper->gstElementSendEvent(source, flushStart))

serverManager/common/source/HealthcheckService.cpp

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ void HealthcheckService::onPingSent(int serverId, int pingId)
6565
return;
6666
}
6767
m_remainingPings.insert(serverId);
68-
m_failedPings.try_emplace(serverId, 0);
68+
m_failedPings.try_emplace(serverId, std::set<int>{});
6969
}
7070

7171
void HealthcheckService::onPingFailed(int serverId, int pingId)
@@ -86,7 +86,7 @@ void HealthcheckService::onPingFailed(int serverId, int pingId)
8686
{
8787
m_sessionServerAppManager.onSessionServerStateChanged(serverId,
8888
firebolt::rialto::common::SessionServerState::ERROR);
89-
m_failedPings.emplace(serverId, 1);
89+
m_failedPings.emplace(serverId, std::set<int>{pingId});
9090
}
9191
}
9292

@@ -95,15 +95,26 @@ void HealthcheckService::onAckReceived(int serverId, int pingId, bool success)
9595
std::unique_lock<std::mutex> lock{m_mutex};
9696
if (pingId != m_currentPingId)
9797
{
98-
RIALTO_SERVER_MANAGER_LOG_WARN("Unexpected ack received from server id: %d. Current ping id: %d, received ping "
99-
"id: %d",
100-
serverId, m_currentPingId, pingId);
98+
if (success && m_failedPings[serverId].find(pingId) != m_failedPings[serverId].end())
99+
{
100+
RIALTO_SERVER_MANAGER_LOG_WARN("Late ack received for server id: %d, Current ping id: %d, received ping "
101+
"id: %d. Removing from failed pings list",
102+
serverId, m_currentPingId, pingId);
103+
m_failedPings[serverId].erase(pingId);
104+
}
105+
else
106+
{
107+
RIALTO_SERVER_MANAGER_LOG_ERROR("Unexpected ack received from server id: %d. Current ping id: %d, received "
108+
"ping "
109+
"id: %d",
110+
serverId, m_currentPingId, pingId);
111+
}
101112
return;
102113
}
103114
m_remainingPings.erase(serverId);
104115
if (success)
105116
{
106-
m_failedPings[serverId] = 0;
117+
m_failedPings[serverId].clear();
107118
}
108119
else
109120
{
@@ -136,12 +147,13 @@ void HealthcheckService::sendPing()
136147
void HealthcheckService::handleError(int serverId)
137148
{
138149
m_sessionServerAppManager.onSessionServerStateChanged(serverId, firebolt::rialto::common::SessionServerState::ERROR);
139-
unsigned &failedPingsNum{m_failedPings[serverId]};
140-
if (++failedPingsNum >= m_kNumOfFailedPingsBeforeRecovery)
150+
auto &failedPings{m_failedPings[serverId]};
151+
failedPings.insert(m_currentPingId);
152+
if (failedPings.size() >= m_kNumOfFailedPingsBeforeRecovery)
141153
{
142154
RIALTO_SERVER_MANAGER_LOG_WARN(
143155
"Max num of failed pings reached for server with id: %d. Starting recovery action", serverId);
144-
failedPingsNum = 0;
156+
failedPings.clear();
145157
m_sessionServerAppManager.restartServer(serverId);
146158
}
147159
}

serverManager/common/source/HealthcheckService.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ class HealthcheckService : public IHealthcheckService
5353
std::mutex m_mutex;
5454
int m_currentPingId;
5555
std::set<int> m_remainingPings;
56-
std::map<int, unsigned> m_failedPings;
56+
std::map<int, std::set<int>> m_failedPings;
5757
};
5858
} // namespace rialto::servermanager::common
5959

tests/unittests/serverManager/unittests/common/HealthcheckServiceTests.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,3 +285,22 @@ TEST_F(HealthcheckServiceTests, WillFailToFailPingWithWrongId)
285285
// There should be no error indication here.
286286
triggerOnPingFailed(pingId + 1);
287287
}
288+
289+
TEST_F(HealthcheckServiceTests, WillSkipRestartingServerWhenAcksAreReceivedLater)
290+
{
291+
int pingId{-1};
292+
timerWillBeCreated();
293+
createSut();
294+
pingWillBeSent(pingId);
295+
triggerPingTimeout();
296+
const int firstPingId{pingId};
297+
triggerOnPingSent(pingId);
298+
errorIndicationWillBeSent();
299+
pingWillBeSent(pingId);
300+
triggerPingTimeout();
301+
triggerOnAckReceived(kServerId, firstPingId, kSuccess);
302+
triggerOnPingSent(pingId);
303+
errorIndicationWillBeSent();
304+
pingWillBeSent(pingId);
305+
triggerPingTimeout();
306+
}

0 commit comments

Comments
 (0)