Skip to content

Commit 9dceae4

Browse files
MS maintenance improvements (#10417)
* Update last agents during ms maintenance, and some code improvements
* Send 503 (Service Unavailable) response status when maintenance or shutdown is initiated [Any load balancer in the clustered environment can avoid routing requests to this MS node]
* Migrate systemvm agents before routing host agents, and some code improvements
* Added events for ms maintenance and shutdown operations
* Added the following ms maintenance and shutdown improvements
  - block new agent connections during prepare for maintenance of ms
  - maintain avoid ms list
  - propagate updated management servers list and lb algorithm in host and indirect.agent.lb.algorithm settings respectively, to systemvm (non-routing) agents
  - updated setup ms list and migrate agent connections to executor service
  - migrate agent connection through executor, and send the answer to the ms host that initiated the migration
  - re-initialize ssl handshake executor if it is shutdown
  - don't allow prepare for maintenance or shutdown when other management server nodes are in preparing states
  - don't allow trigger shutdown when management server is up and other management server nodes are in preparing states
  - stop agent connections monitor on ms maintenance
  - update avoid ms list in ready command
  - updated connected host from the client connection
  - update last agents in ms metrics from the database
  - updated some agent config descriptions
  - update last management server in the hosts during shutdown
  - added agents and lastagents in management server response
  - updated management server maintenance & shutdown unit tests
  - some code improvements
* refactored code / addressed comments
* removed shutdown testcase (maybe, calling System.exit)
* Revert "removed shutdown testcase (maybe, calling System.exit)" This reverts commit e14b071.
* avoid system.exit during shutdown test
* code improvements
* testcase fix
* Fix cutoff time in agent connections monitor thread
1 parent ea36568 commit 9dceae4

File tree

35 files changed

+1137
-221
lines changed

35 files changed

+1137
-221
lines changed

agent/src/main/java/com/cloud/agent/Agent.java

Lines changed: 50 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -342,7 +342,7 @@ public void start() {
342342
logger.info("Attempted to connect to the server, but received an unexpected exception, trying again...", e);
343343
}
344344
}
345-
shell.updateConnectedHost();
345+
shell.updateConnectedHost(((NioClient)connection).getHost());
346346
scavengeOldAgentObjects();
347347
}
348348

@@ -617,15 +617,11 @@ public Task create(final Task.Type type, final Link link, final byte[] data) {
617617
}
618618

619619
protected void reconnect(final Link link) {
620-
reconnect(link, null, null, false);
620+
reconnect(link, null, false);
621621
}
622622

623-
protected void reconnect(final Link link, String preferredHost, List<String> avoidHostList, boolean forTransfer) {
623+
protected void reconnect(final Link link, String preferredMSHost, boolean forTransfer) {
624624
if (!(forTransfer || reconnectAllowed)) {
625-
return;
626-
}
627-
628-
if (!reconnectAllowed) {
629625
logger.debug("Reconnect requested but it is not allowed {}", () -> getLinkLog(link));
630626
return;
631627
}
@@ -637,19 +633,26 @@ protected void reconnect(final Link link, String preferredHost, List<String> avo
637633
serverResource.disconnected();
638634
logger.info("Lost connection to host: {}. Attempting reconnection while we still have {} commands in progress.", shell.getConnectedHost(), commandsInProgress.get());
639635
stopAndCleanupConnection(true);
636+
String host = preferredMSHost;
637+
if (org.apache.commons.lang3.StringUtils.isBlank(host)) {
638+
host = shell.getNextHost();
639+
}
640+
List<String> avoidMSHostList = shell.getAvoidHosts();
640641
do {
641-
final String host = shell.getNextHost();
642-
connection = new NioClient(getAgentName(), host, shell.getPort(), shell.getWorkers(), shell.getSslHandshakeTimeout(), this);
643-
logger.info("Reconnecting to host: {}", host);
644-
try {
645-
connection.start();
646-
} catch (final NioConnectionException e) {
647-
logger.info("Attempted to re-connect to the server, but received an unexpected exception, trying again...", e);
648-
stopAndCleanupConnection(false);
642+
if (CollectionUtils.isEmpty(avoidMSHostList) || !avoidMSHostList.contains(host)) {
643+
connection = new NioClient(getAgentName(), host, shell.getPort(), shell.getWorkers(), shell.getSslHandshakeTimeout(), this);
644+
logger.info("Reconnecting to host: {}", host);
645+
try {
646+
connection.start();
647+
} catch (final NioConnectionException e) {
648+
logger.info("Attempted to re-connect to the server, but received an unexpected exception, trying again...", e);
649+
stopAndCleanupConnection(false);
650+
}
649651
}
650652
shell.getBackoffAlgorithm().waitBeforeRetry();
653+
host = shell.getNextHost();
651654
} while (!connection.isStartup());
652-
shell.updateConnectedHost();
655+
shell.updateConnectedHost(((NioClient)connection).getHost());
653656
logger.info("Connected to the host: {}", shell.getConnectedHost());
654657
}
655658

@@ -922,7 +925,7 @@ private Answer setupAgentCertificate(final SetupCertificateCommand cmd) {
922925
return new SetupCertificateAnswer(true);
923926
}
924927

925-
private void processManagementServerList(final List<String> msList, final String lbAlgorithm, final Long lbCheckInterval) {
928+
private void processManagementServerList(final List<String> msList, final List<String> avoidMsList, final String lbAlgorithm, final Long lbCheckInterval) {
926929
if (CollectionUtils.isNotEmpty(msList) && StringUtils.isNotEmpty(lbAlgorithm)) {
927930
try {
928931
final String newMSHosts = String.format("%s%s%s", com.cloud.utils.StringUtils.toCSVList(msList), IAgentShell.hostLbAlgorithmSeparator, lbAlgorithm);
@@ -934,6 +937,7 @@ private void processManagementServerList(final List<String> msList, final String
934937
throw new CloudRuntimeException("Could not persist received management servers list", e);
935938
}
936939
}
940+
shell.setAvoidHosts(avoidMsList);
937941
if ("shuffle".equals(lbAlgorithm)) {
938942
scheduleHostLBCheckerTask(0);
939943
} else {
@@ -942,16 +946,18 @@ private void processManagementServerList(final List<String> msList, final String
942946
}
943947

944948
private Answer setupManagementServerList(final SetupMSListCommand cmd) {
945-
processManagementServerList(cmd.getMsList(), cmd.getLbAlgorithm(), cmd.getLbCheckInterval());
949+
processManagementServerList(cmd.getMsList(), cmd.getAvoidMsList(), cmd.getLbAlgorithm(), cmd.getLbCheckInterval());
946950
return new SetupMSListAnswer(true);
947951
}
948952

949953
private Answer migrateAgentToOtherMS(final MigrateAgentConnectionCommand cmd) {
950954
try {
951955
if (CollectionUtils.isNotEmpty(cmd.getMsList())) {
952-
processManagementServerList(cmd.getMsList(), cmd.getLbAlgorithm(), cmd.getLbCheckInterval());
956+
processManagementServerList(cmd.getMsList(), cmd.getAvoidMsList(), cmd.getLbAlgorithm(), cmd.getLbCheckInterval());
953957
}
954-
migrateAgentConnection(cmd.getAvoidMsList());
958+
Executors.newSingleThreadScheduledExecutor(new NamedThreadFactory("MigrateAgentConnection-Job")).schedule(() -> {
959+
migrateAgentConnection(cmd.getAvoidMsList());
960+
}, 3, TimeUnit.SECONDS);
955961
} catch (Exception e) {
956962
String errMsg = "Migrate agent connection failed, due to " + e.getMessage();
957963
logger.debug(errMsg, e);
@@ -972,25 +978,26 @@ private void migrateAgentConnection(List<String> avoidMsList) {
972978
throw new CloudRuntimeException("No other Management Server hosts to migrate");
973979
}
974980

975-
String preferredHost = null;
981+
String preferredMSHost = null;
976982
for (String msHost : msHostsList) {
977983
try (final Socket socket = new Socket()) {
978984
socket.connect(new InetSocketAddress(msHost, shell.getPort()), 5000);
979-
preferredHost = msHost;
985+
preferredMSHost = msHost;
980986
break;
981987
} catch (final IOException e) {
982988
throw new CloudRuntimeException("Management server host: " + msHost + " is not reachable, to migrate connection");
983989
}
984990
}
985991

986-
if (preferredHost == null) {
992+
if (preferredMSHost == null) {
987993
throw new CloudRuntimeException("Management server host(s) are not reachable, to migrate connection");
988994
}
989995

990-
logger.debug("Management server host " + preferredHost + " is found to be reachable, trying to reconnect");
996+
logger.debug("Management server host " + preferredMSHost + " is found to be reachable, trying to reconnect");
991997
shell.resetHostCounter();
998+
shell.setAvoidHosts(avoidMsList);
992999
shell.setConnectionTransfer(true);
993-
reconnect(link, preferredHost, avoidMsList, true);
1000+
reconnect(link, preferredMSHost, true);
9941001
}
9951002

9961003
public void processResponse(final Response response, final Link link) {
@@ -1003,14 +1010,21 @@ public void processResponse(final Response response, final Link link) {
10031010
for (final IAgentControlListener listener : controlListeners) {
10041011
listener.processControlResponse(response, (AgentControlAnswer)answer);
10051012
}
1006-
} else if (answer instanceof PingAnswer && (((PingAnswer) answer).isSendStartup()) && reconnectAllowed) {
1007-
logger.info("Management server requested startup command to reinitialize the agent");
1008-
sendStartup(link);
1013+
} else if (answer instanceof PingAnswer) {
1014+
processPingAnswer((PingAnswer) answer);
10091015
} else {
10101016
updateLastPingResponseTime();
10111017
}
10121018
}
10131019

1020+
private void processPingAnswer(final PingAnswer answer) {
1021+
if ((answer.isSendStartup()) && reconnectAllowed) {
1022+
logger.info("Management server requested startup command to reinitialize the agent");
1023+
sendStartup(link);
1024+
}
1025+
shell.setAvoidHosts(answer.getAvoidMsList());
1026+
}
1027+
10141028
public void processReadyCommand(final Command cmd) {
10151029
final ReadyCommand ready = (ReadyCommand)cmd;
10161030
// Set human readable sizes;
@@ -1027,7 +1041,7 @@ public void processReadyCommand(final Command cmd) {
10271041
}
10281042

10291043
verifyAgentArch(ready.getArch());
1030-
processManagementServerList(ready.getMsHostList(), ready.getLbAlgorithm(), ready.getLbCheckInterval());
1044+
processManagementServerList(ready.getMsHostList(), ready.getAvoidMsHostList(), ready.getLbAlgorithm(), ready.getLbCheckInterval());
10311045

10321046
logger.info("Ready command is processed for agent [id: {}, uuid: {}, name: {}]", getId(), getUuid(), getName());
10331047
}
@@ -1374,26 +1388,26 @@ protected void runInContext() {
13741388
if (msList == null || msList.length < 1) {
13751389
return;
13761390
}
1377-
final String preferredHost = msList[0];
1391+
final String preferredMSHost = msList[0];
13781392
final String connectedHost = shell.getConnectedHost();
13791393
logger.debug("Running preferred host checker task, connected host={}, preferred host={}",
1380-
connectedHost, preferredHost);
1381-
if (preferredHost == null || preferredHost.equals(connectedHost) || link == null) {
1394+
connectedHost, preferredMSHost);
1395+
if (preferredMSHost == null || preferredMSHost.equals(connectedHost) || link == null) {
13821396
return;
13831397
}
13841398
boolean isHostUp = false;
13851399
try (final Socket socket = new Socket()) {
1386-
socket.connect(new InetSocketAddress(preferredHost, shell.getPort()), 5000);
1400+
socket.connect(new InetSocketAddress(preferredMSHost, shell.getPort()), 5000);
13871401
isHostUp = true;
13881402
} catch (final IOException e) {
1389-
logger.debug("Host: {} is not reachable", preferredHost);
1403+
logger.debug("Host: {} is not reachable", preferredMSHost);
13901404
}
13911405
if (isHostUp && link != null && commandsInProgress.get() == 0) {
13921406
if (logger.isDebugEnabled()) {
1393-
logger.debug("Preferred host {} is found to be reachable, trying to reconnect", preferredHost);
1407+
logger.debug("Preferred host {} is found to be reachable, trying to reconnect", preferredMSHost);
13941408
}
13951409
shell.resetHostCounter();
1396-
reconnect(link);
1410+
reconnect(link, preferredMSHost, false);
13971411
}
13981412
} catch (Throwable t) {
13991413
logger.error("Error caught while attempting to connect to preferred host", t);

agent/src/main/java/com/cloud/agent/AgentShell.java

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ public class AgentShell implements IAgentShell, Daemon {
6666
private String _zone;
6767
private String _pod;
6868
private String _host;
69+
private List<String> _avoidHosts;
6970
private String _privateIp;
7071
private int _port;
7172
private int _proxyPort;
@@ -76,7 +77,6 @@ public class AgentShell implements IAgentShell, Daemon {
7677
private volatile boolean _exit = false;
7778
private int _pingRetries;
7879
private final List<Agent> _agents = new ArrayList<Agent>();
79-
private String hostToConnect;
8080
private String connectedHost;
8181
private Long preferredHostCheckInterval;
8282
private boolean connectionTransfer = false;
@@ -121,7 +121,7 @@ public String getNextHost() {
121121
if (_hostCounter >= hosts.length) {
122122
_hostCounter = 0;
123123
}
124-
hostToConnect = hosts[_hostCounter % hosts.length];
124+
String hostToConnect = hosts[_hostCounter % hosts.length];
125125
_hostCounter++;
126126
return hostToConnect;
127127
}
@@ -143,11 +143,10 @@ public long getLbCheckerInterval(final Long receivedLbInterval) {
143143
}
144144

145145
@Override
146-
public void updateConnectedHost() {
147-
connectedHost = hostToConnect;
146+
public void updateConnectedHost(String connectedHost) {
147+
this.connectedHost = connectedHost;
148148
}
149149

150-
151150
@Override
152151
public void resetHostCounter() {
153152
_hostCounter = 0;
@@ -166,6 +165,16 @@ public void setHosts(final String host) {
166165
}
167166
}
168167

168+
@Override
169+
public void setAvoidHosts(List<String> avoidHosts) {
170+
_avoidHosts = avoidHosts;
171+
}
172+
173+
@Override
174+
public List<String> getAvoidHosts() {
175+
return _avoidHosts;
176+
}
177+
169178
@Override
170179
public String getPrivateIp() {
171180
return _privateIp;

agent/src/main/java/com/cloud/agent/IAgentShell.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
// under the License.
1717
package com.cloud.agent;
1818

19+
import java.util.List;
1920
import java.util.Map;
2021
import java.util.Properties;
2122

@@ -63,9 +64,13 @@ public interface IAgentShell {
6364

6465
String[] getHosts();
6566

67+
void setAvoidHosts(List<String> hosts);
68+
69+
List<String> getAvoidHosts();
70+
6671
long getLbCheckerInterval(Long receivedLbInterval);
6772

68-
void updateConnectedHost();
73+
void updateConnectedHost(String connectedHost);
6974

7075
String getConnectedHost();
7176

agent/src/main/java/com/cloud/agent/properties/AgentProperties.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -816,7 +816,7 @@ public Property<Integer> getWorkers() {
816816
* Data type: Integer.<br>
817817
* Default value: <code>null</code>
818818
*/
819-
public static final Property<Integer> SSL_HANDSHAKE_TIMEOUT = new Property<>("ssl.handshake.timeout", null, Integer.class);
819+
public static final Property<Integer> SSL_HANDSHAKE_TIMEOUT = new Property<>("ssl.handshake.timeout", 30, Integer.class);
820820

821821
public static class Property <T>{
822822
private String name;

agent/src/test/java/com/cloud/agent/AgentShellTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -358,7 +358,7 @@ public void updateAndGetConnectedHost() {
358358
AgentShell shell = new AgentShell();
359359
shell.setHosts("test");
360360
shell.getNextHost();
361-
shell.updateConnectedHost();
361+
shell.updateConnectedHost("test");
362362

363363
Assert.assertEquals(expected, shell.getConnectedHost());
364364
}

api/src/main/java/com/cloud/event/EventTypes.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -739,6 +739,13 @@ public class EventTypes {
739739
//Purge resources
740740
public static final String EVENT_PURGE_EXPUNGED_RESOURCES = "PURGE.EXPUNGED.RESOURCES";
741741

742+
// Management Server
743+
public static final String EVENT_MS_MAINTENANCE_PREPARE = "MS.MAINTENANCE.PREPARE";
744+
public static final String EVENT_MS_MAINTENANCE_CANCEL = "MS.MAINTENANCE.CANCEL";
745+
public static final String EVENT_MS_SHUTDOWN_PREPARE = "MS.SHUTDOWN.PREPARE";
746+
public static final String EVENT_MS_SHUTDOWN_CANCEL = "MS.SHUTDOWN.CANCEL";
747+
public static final String EVENT_MS_SHUTDOWN = "MS.SHUTDOWN";
748+
742749
// OBJECT STORE
743750
public static final String EVENT_OBJECT_STORE_CREATE = "OBJECT.STORE.CREATE";
744751
public static final String EVENT_OBJECT_STORE_DELETE = "OBJECT.STORE.DELETE";
@@ -1233,6 +1240,12 @@ public class EventTypes {
12331240
entityEventDetails.put(EVENT_UPDATE_IMAGE_STORE_ACCESS_STATE, ImageStore.class);
12341241
entityEventDetails.put(EVENT_LIVE_PATCH_SYSTEMVM, "SystemVMs");
12351242

1243+
entityEventDetails.put(EVENT_MS_MAINTENANCE_PREPARE, "ManagementServer");
1244+
entityEventDetails.put(EVENT_MS_MAINTENANCE_CANCEL, "ManagementServer");
1245+
entityEventDetails.put(EVENT_MS_SHUTDOWN_PREPARE, "ManagementServer");
1246+
entityEventDetails.put(EVENT_MS_SHUTDOWN_CANCEL, "ManagementServer");
1247+
entityEventDetails.put(EVENT_MS_SHUTDOWN, "ManagementServer");
1248+
12361249
//Object Store
12371250
entityEventDetails.put(EVENT_OBJECT_STORE_CREATE, ObjectStore.class);
12381251
entityEventDetails.put(EVENT_OBJECT_STORE_UPDATE, ObjectStore.class);

api/src/main/java/org/apache/cloudstack/api/ApiConstants.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1150,6 +1150,7 @@ public class ApiConstants {
11501150
public static final String PENDING_JOBS_COUNT = "pendingjobscount";
11511151
public static final String AGENTS_COUNT = "agentscount";
11521152
public static final String AGENTS = "agents";
1153+
public static final String LAST_AGENTS = "lastagents";
11531154

11541155
public static final String PUBLIC_MTU = "publicmtu";
11551156
public static final String PRIVATE_MTU = "privatemtu";

api/src/main/java/org/apache/cloudstack/api/ApiErrorCode.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ public enum ApiErrorCode {
3030
UNSUPPORTED_ACTION_ERROR(432),
3131
API_LIMIT_EXCEED(429),
3232

33+
SERVICE_UNAVAILABLE(503),
3334
INTERNAL_ERROR(530),
3435
ACCOUNT_ERROR(531),
3536
ACCOUNT_RESOURCE_LIMIT_ERROR(532),

api/src/main/java/org/apache/cloudstack/api/response/ManagementServerResponse.java

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,14 @@ public class ManagementServerResponse extends BaseResponse {
8282
@Param(description = "the Management Server Peers")
8383
private List<PeerManagementServerNodeResponse> peers;
8484

85+
@SerializedName(ApiConstants.LAST_AGENTS)
86+
@Param(description = "the last agents this Management Server is responsible for, before shutdown or preparing for maintenance", since = "4.21.0.0")
87+
private List<String> lastAgents;
88+
89+
@SerializedName(ApiConstants.AGENTS)
90+
@Param(description = "the agents this Management Server is responsible for", since = "4.21.0.0")
91+
private List<String> agents;
92+
8593
@SerializedName(ApiConstants.AGENTS_COUNT)
8694
@Param(description = "the number of host agents this Management Server is responsible for", since = "4.21.0.0")
8795
private Long agentsCount;
@@ -134,6 +142,14 @@ public String getIpAddress() {
134142
return ipAddress;
135143
}
136144

145+
public List<String> getLastAgents() {
146+
return lastAgents;
147+
}
148+
149+
public List<String> getAgents() {
150+
return agents;
151+
}
152+
137153
public Long getAgentsCount() {
138154
return this.agentsCount;
139155
}
@@ -190,6 +206,14 @@ public void setIpAddress(String ipAddress) {
190206
this.ipAddress = ipAddress;
191207
}
192208

209+
public void setLastAgents(List<String> lastAgents) {
210+
this.lastAgents = lastAgents;
211+
}
212+
213+
public void setAgents(List<String> agents) {
214+
this.agents = agents;
215+
}
216+
193217
public void setAgentsCount(Long agentsCount) {
194218
this.agentsCount = agentsCount;
195219
}

0 commit comments

Comments
 (0)