forked from autotest/tp-libvirt
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmultivm_stress.py
More file actions
232 lines (200 loc) · 9.08 KB
/
multivm_stress.py
File metadata and controls
232 lines (200 loc) · 9.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
import logging as log
import time
from virttest import utils_stress
from virttest import error_context
from virttest import utils_test
from virttest import virsh
from virttest.libvirt_xml import vm_xml
# The name ``logging`` is bound to a logger object here, not to the stdlib
# module (which this file imports as ``log``). Reusing the module's name is
# not ideal; it is kept as a workaround so the existing ``logging.debug(...)``
# calls in this file do not have to be changed.
logging = log.getLogger("avocado.%s" % __name__)
@error_context.context_aware
def run(test, params, env):
    """
    Stress several VMs and/or the host, watch the guests while the stress
    runs, then run the configured stress events on the guests that stayed
    stable.

    Values read from ``params``:

    * ``guest_stress`` ("yes"/"no", default "no"): restart every VM with
      ``on_crash`` set to "preserve", load "stress_in_vms" and monitor the
      guests for ``stress_time`` seconds.
    * ``host_stress`` ("yes"/"no", default "no"): load "stress_on_host".
      When ``host_stress_args`` is set it is copied into ``stress_args``
      first.
    * ``stress_events`` (default ""): when non-empty, the events are run
      through ``utils_stress.VMStressEvents``. When it does not contain
      "reboot", guest uptime is compared with a baseline to detect
      unexpected reboots.
    * ``stress_time`` (default "30"): stress duration in seconds.
    * ``debug_dir`` (default "/home/"): prefix that is string-concatenated
      with "<vm name>-core" to build the ``virsh dump`` target, so it has to
      end with a path separator.
    * ``dump_options`` (default "--memory-only --bypass-cache"): options
      passed to ``virsh.dump``.

    The outcome is reported through ``test.fail`` (stress could not be
    loaded, every guest became unstable, or any failure message was
    collected) and ``test.error`` (guest dmesg could not be read).

    :param test: kvm test object
    :param params: Dictionary with the test parameters
    :param env: Dictionary with test environment.
    """
    guest_stress = params.get("guest_stress", "no") == "yes"
    host_stress = params.get("host_stress", "no") == "yes"
    stress_events = params.get("stress_events", "")
    stress_time = params.get("stress_time", "30")
    debug_dir = params.get("debug_dir", "/home/")
    dump_options = params.get("dump_options", "--memory-only --bypass-cache")
    vms = env.get_all_vms()
    vms_uptime_init = {}

    if guest_stress:
        # change the on_crash value to "preserve" when guest crashes
        for vm in vms:
            logging.debug("Setting on_crash to preserve in %s", vm.name)
            vmxml = vm_xml.VMXML.new_from_inactive_dumpxml(vm.name)
            if vm.is_alive():
                vm.destroy(gracefully=False)
            vmxml.on_crash = "preserve"
            vmxml.sync()
            vm.start()

    # Record the uptime baseline only after the restart above: a baseline
    # taken before it would be larger than the uptime of a freshly started
    # guest and would be reported as an unexpected reboot at the end.
    if "reboot" not in stress_events:
        for vm in vms:
            vms_uptime_init[vm.name] = vm.uptime()

    if guest_stress:
        try:
            utils_test.load_stress("stress_in_vms", params=params, vms=vms)
        except Exception as err:
            test.fail("Error running stress in vms: %s" % str(err))

    if host_stress:
        if params.get("host_stress_args", ""):
            params["stress_args"] = params.get("host_stress_args")
        try:
            utils_test.load_stress("stress_on_host", params=params)
        except Exception as err:
            test.fail("Error running stress in host: %s" % str(err))

    # Budget in seconds for the periodic checks. It is reduced by login-retry
    # penalties and by the pauses between guests; the 600 second waits
    # themselves are not charged against it.
    stress_timer = int(stress_time)
    fail = False
    failed_vms = []
    login_error_vms = []
    unexpected_reboot_vms = []
    error_message = ""

    # NOTE(review): the nesting in this function was reconstructed from a copy
    # that had lost its indentation. Confirm that everything down to the
    # stable-guest filtering is meant to run only under guest_stress, and that
    # the pause between guests applies only to guests that are running.
    if guest_stress:
        def check_call_traces(vm):
            """
            Log in to ``vm`` and look for kernel trouble in its dmesg.

            A guest is recorded in ``failed_vms`` when "Call Trace" appears
            in ``dmesg`` or ``dmesg -l emerg,alert,crit`` prints anything.

            Login is attempted up to three times with a 30 second pause in
            between; a guest already listed in ``login_error_vms`` gets a
            single attempt. Every failed attempt charges 150 seconds to
            ``stress_timer`` and every pause another 30. A successful login
            removes the guest from ``login_error_vms``.

            :param vm: VM object to inspect
            :return: True when a trace was found, False otherwise (including
                     when the login failed)
            """
            nonlocal stress_timer
            found_trace = False
            try:
                retry_login = True
                retry_times = 0
                while retry_login:
                    try:
                        retry_login = False
                        session = vm.wait_for_login(timeout=100)
                        if vm in login_error_vms:
                            login_error_vms.remove(vm)
                    except Exception:
                        stress_timer -= 150
                        if vm in login_error_vms:
                            return False
                        retry_login = True
                        retry_times += 1
                        if retry_times == 3:
                            logging.debug("Error in logging into %s", vm.name)
                            if vm not in login_error_vms:
                                login_error_vms.append(vm)
                            return False
                        time.sleep(30)
                        stress_timer -= 30

                # Close the session even when one of the commands raises, so
                # a failing guest does not leak a login session.
                try:
                    dmesg = session.cmd("dmesg")
                    dmesg_level = session.cmd("dmesg -l emerg,alert,crit")
                finally:
                    session.close()

                if "Call Trace" in dmesg or dmesg_level:
                    logging.debug("Call trace found in %s", vm.name)
                    if vm not in failed_vms:
                        failed_vms.append(vm)
                    found_trace = True
            except Exception as err:
                test.error("Error getting dmesg of %s due to %s" % (vm.name, str(err)))
            return found_trace

        def check_vms(pause, charge_timer):
            """
            Check the domain state and the dmesg of every VM once.

            Guests that are not "running" are recorded in ``failed_vms``
            without being logged into. Running guests are inspected with
            ``check_call_traces`` followed by a ``pause`` second sleep.

            :param pause: seconds to sleep after inspecting a running guest
            :param charge_timer: when True, subtract ``pause`` from
                                 ``stress_timer`` after each sleep
            :return: True when any guest is not running or shows a trace
            """
            nonlocal stress_timer
            unhealthy = False
            for vm in vms:
                state = vm.state()
                if state != "running":
                    logging.debug("%s state is %s", vm.name, state)
                    if vm not in failed_vms:
                        failed_vms.append(vm)
                    unhealthy = True
                    continue
                if check_call_traces(vm):
                    unhealthy = True
                time.sleep(pause)
                if charge_timer:
                    stress_timer -= pause
            return unhealthy

        # run stress for stress_time seconds
        logging.debug("Sleeping for %s seconds waiting for stress completion", stress_time)
        stress_time = int(stress_time)

        if stress_time < 600:
            # short run: a single check once stress_time has elapsed
            time.sleep(stress_time)
            fail = check_vms(pause=2, charge_timer=False)
        else:
            # long run: check the guests every 600 seconds (10 minutes), then
            # once more after the remainder of stress_time
            all_failed = False
            number_of_checks, delta_time = divmod(stress_time, 600)
            for _ in range(number_of_checks):
                if len(failed_vms) == len(vms) or len(login_error_vms) == len(vms):
                    # nothing left to watch
                    all_failed = True
                    break
                if stress_timer <= 0:
                    break
                time.sleep(600)
                if check_vms(pause=3, charge_timer=True):
                    fail = True
            if delta_time > 0 and stress_timer > 0 and not all_failed:
                time.sleep(delta_time)
                if check_vms(pause=3, charge_timer=True):
                    fail = True

        # virsh dump the failed vms into debug_dir
        if fail:
            for vm in failed_vms:
                if vm.state() != "shut off":
                    logging.debug("Dumping %s to debug_dir %s", vm.name, debug_dir)
                    virsh.dump(vm.name, debug_dir + vm.name + "-core", dump_options,
                               ignore_status=False, debug=True)
                    logging.debug("Successfully dumped %s as %s-core", vm.name, vm.name)
                else:
                    logging.debug("Cannot dump %s as it is in shut off state", vm.name)
            failed_vms_string = ", ".join(vm.name for vm in failed_vms)
            error_message = "Failure in " + failed_vms_string + " while running stress. "

        if login_error_vms:
            login_error_vms_string = ", ".join(vm.name for vm in login_error_vms)
            error_message += "Login error in " + login_error_vms_string + " while running stress. "

        if len(failed_vms) == len(vms) or len(login_error_vms) == len(vms):
            error_message += "All vms in unstable state while running stress. Couldn't run STRESS EVENTS"
            test.fail(error_message)

        # run STRESS EVENTS in the remaining stable guests
        if len(failed_vms) < len(vms) and len(login_error_vms) < len(vms):
            # vms is filtered in place; the same list is handed to the stress
            # events and to unload_stress below
            for vm in failed_vms + login_error_vms:
                if vm in vms:
                    vms.remove(vm)
            if not vms:
                error_message += "All vms in unstable state while running stress. Couldn't run STRESS EVENTS"
                test.fail(error_message)

    new_vms = ", ".join(vm.name for vm in vms)
    try:
        if stress_events:
            logging.debug("Running stress_events in %s", new_vms)
            stress_event = utils_stress.VMStressEvents(params, env, vms)
            stress_event.run_threads()
            stress_event.wait_for_threads()
        if guest_stress:
            utils_test.unload_stress("stress_in_vms", params=params, vms=vms)
        if host_stress:
            utils_test.unload_stress("stress_on_host", params=params)
        if "reboot" not in stress_events:
            # an uptime below the baseline means the guest rebooted
            for vm in vms:
                if vm.uptime() < vms_uptime_init[vm.name]:
                    logging.debug("Unexpected reboot of VM: %s between test", vm.name)
                    unexpected_reboot_vms.append(vm)
            if unexpected_reboot_vms:
                unexpected_reboot_vms_string = ", ".join(vm.name for vm in unexpected_reboot_vms)
                error_message += "Unexpected reboot of guest(s) " + unexpected_reboot_vms_string + ". "
    except Exception as err:
        # collected rather than raised so that the messages gathered above
        # are reported together with this one
        error_message += "Failure running STRESS EVENTS in " + new_vms + " due to " + str(err)

    # check the test status
    if error_message:
        test.fail(error_message)