@@ -68,9 +68,11 @@ def collect_testmodules():
     print("Test module discovery failed.")
     exit(return_code)
   for line in stdout.split("\n"):
-    match = re.match("<Module (.*)>", line)
+    match = re.match("<Module (.*)>", line.strip())
     if match:
       test_file = match.group(1)
+      if "/" not in test_file:
+        test_file = os.path.join("tests", test_file)
       all_test_files.append(test_file)
   print("---------- collected test modules ----------")
   print("Found %d test modules." % (len(all_test_files)))
@@ -79,7 +81,7 @@ def collect_testmodules():
   return all_test_files
 
 
-def run_test(testmodule, gpu_tokens):
+def run_test(testmodule, gpu_tokens, continue_on_fail):
   global LAST_CODE
   with GPU_LOCK:
     if LAST_CODE != 0:
@@ -90,39 +92,43 @@ def run_test(testmodule, gpu_tokens):
       "XLA_PYTHON_CLIENT_ALLOCATOR": "default",
   }
   testfile = extract_filename(testmodule)
-  cmd = ["python3", "-m", "pytest", '--html={}/{}_log.html'.format(base_dir, testfile), "--reruns", "3", "-x", testmodule]
+  if continue_on_fail:
+    cmd = ["python3", "-m", "pytest", '--html={}/{}_log.html'.format(base_dir, testfile), "--reruns", "3", "-v", testmodule]
+  else:
+    cmd = ["python3", "-m", "pytest", '--html={}/{}_log.html'.format(base_dir, testfile), "--reruns", "3", "-x", "-v", testmodule]
   return_code, stderr, stdout = run_shell_command(cmd, env_vars=env_vars)
   with GPU_LOCK:
     gpu_tokens.append(target_gpu)
     if LAST_CODE == 0:
       print("Running tests in module %s on GPU %d:" % (testmodule, target_gpu))
       print(stdout)
       print(stderr)
-      LAST_CODE = return_code
+      if continue_on_fail == False:
+        LAST_CODE = return_code
   return
 
 
-def run_parallel(all_testmodules, p):
-  print("Running tests with parallelism=", p)
+def run_parallel(all_testmodules, p, c):
+  print(f"Running tests with parallelism={p}")
   available_gpu_tokens = list(range(p))
   executor = ThreadPoolExecutor(max_workers=p)
   # walking through test modules
   for testmodule in all_testmodules:
-    executor.submit(run_test, testmodule, available_gpu_tokens)
+    executor.submit(run_test, testmodule, available_gpu_tokens, c)
   # waiting for all modules to finish
   executor.shutdown(wait=True)  # wait for all jobs to finish
   return
 
 
 def find_num_gpus():
-  cmd = ["lspci|grep 'controller'|grep 'AMD/ATI'|wc -l"]
+  cmd = ["lspci|grep 'controller\|accel'|grep 'AMD/ATI'|wc -l"]
   _, _, stdout = run_shell_command(cmd, shell=True)
   return int(stdout)
 
 
 def main(args):
   all_testmodules = collect_testmodules()
-  run_parallel(all_testmodules, args.parallel)
+  run_parallel(all_testmodules, args.parallel, args.continue_on_fail)
   generate_final_report()
   exit(LAST_CODE)
 
@@ -134,7 +140,13 @@ def main(args):
                       "--parallel",
                       type=int,
                       help="number of tests to run in parallel")
+  parser.add_argument("-c",
+                      "--continue_on_fail",
+                      action='store_true',
+                      help="continue on failure")
   args = parser.parse_args()
+  if args.continue_on_fail:
+    print("continue on fail is set")
   if args.parallel is None:
     sys_gpu_count = find_num_gpus()
     args.parallel = sys_gpu_count
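
For reference, a minimal standalone sketch of what the new -c/--continue_on_fail flag changes (not part of this commit; the helper name build_pytest_cmd and the example paths are illustrative): when the flag is set, pytest's fail-fast -x option is dropped and a failing module no longer sets LAST_CODE, so the remaining test modules keep running.

import argparse

def build_pytest_cmd(testmodule, report_dir, testfile, continue_on_fail):
  # Mirror the command built in run_test(): always produce an HTML report
  # and rerun flaky tests; add -x (stop at first failure) only when the
  # run should not continue on failure.
  cmd = ["python3", "-m", "pytest",
         "--html={}/{}_log.html".format(report_dir, testfile),
         "--reruns", "3", "-v", testmodule]
  if not continue_on_fail:
    cmd.insert(-1, "-x")  # insert before the trailing testmodule argument
  return cmd

if __name__ == "__main__":
  parser = argparse.ArgumentParser()
  parser.add_argument("-c", "--continue_on_fail", action='store_true',
                      help="continue on failure")
  args = parser.parse_args()
  print(build_pytest_cmd("tests/example_test.py", ".", "example_test",
                         args.continue_on_fail))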