2
2
import logging
3
3
4
4
from kubeflow .katib import KatibClient , search
5
+ from kubeflow .katib .types .types import TrainerResources
5
6
from kubernetes import client
6
7
from verify import verify_experiment_results
7
8
@@ -19,21 +20,19 @@ def run_e2e_experiment_create_by_tune(
19
20
):
20
21
# Create Katib Experiment and wait until it is finished.
21
22
logging .debug ("Creating Experiment: {}/{}" .format (exp_namespace , exp_name ))
22
-
23
+
23
24
# Use the test case from get-started tutorial.
24
25
# https://www.kubeflow.org/docs/components/katib/getting-started/#getting-started-with-katib-python-sdk
25
26
# [1] Create an objective function.
26
27
def objective (parameters ):
27
28
import time
29
+
28
30
time .sleep (5 )
29
31
result = 4 * int (parameters ["a" ]) - float (parameters ["b" ]) ** 2
30
32
print (f"result={ result } " )
31
-
33
+
32
34
# [2] Create hyperparameter search space.
33
- parameters = {
34
- "a" : search .int (min = 10 , max = 20 ),
35
- "b" : search .double (min = 0.1 , max = 0.2 )
36
- }
35
+ parameters = {"a" : search .int (min = 10 , max = 20 ), "b" : search .double (min = 0.1 , max = 0.2 )}
37
36
38
37
# [3] Create Katib Experiment with 4 Trials and 2 CPUs per Trial.
39
38
# And Wait until Experiment reaches Succeeded condition.
@@ -58,13 +57,80 @@ def objective(parameters):
58
57
logging .debug (katib_client .get_suggestion (exp_name , exp_namespace ))
59
58
60
59
60
+ def run_e2e_experiment_create_by_tune_pytorchjob (
61
+ katib_client : KatibClient ,
62
+ exp_name : str ,
63
+ exp_namespace : str ,
64
+ ):
65
+ # Create Katib Experiment and wait until it is finished.
66
+ logging .debug ("Creating Experiment: {}/{}" .format (exp_namespace , exp_name ))
67
+
68
+ # Verify the PyTorchJob distributed.
69
+ def objective (parameters ):
70
+ import os
71
+ import time
72
+
73
+ import torch .distributed as dist
74
+
75
+ # Setup PyTorch distributed.
76
+ dist .init_process_group (backend = "gloo" )
77
+
78
+ print (
79
+ "PyTorch Dist. WORLD_SIZE: {}, RANK: {}, LOCAL_RANK: {}" .format (
80
+ dist .get_world_size (), dist .get_rank (), os .getenv ("LOCAL_RANK" )
81
+ )
82
+ )
83
+
84
+ time .sleep (5 )
85
+ # Only get results from the process with RANK=0.
86
+ if dist .get_rank () == 0 :
87
+ result = 4 * int (parameters ["a" ]) - float (parameters ["b" ]) ** 2
88
+ print (f"result={ result } " )
89
+ dist .destroy_process_group ()
90
+
91
+ # Create Katib Experiment with 3 Trials. Every Trial runs PyTorchJob with 2 workers.
92
+ katib_client .tune (
93
+ name = exp_name ,
94
+ namespace = exp_namespace ,
95
+ objective = objective ,
96
+ parameters = {
97
+ "a" : search .int (min = 10 , max = 20 ),
98
+ "b" : search .double (min = 0.1 , max = 0.2 ),
99
+ },
100
+ objective_metric_name = "result" ,
101
+ max_trial_count = 3 ,
102
+ parallel_trial_count = 2 ,
103
+ resources_per_trial = TrainerResources (
104
+ num_workers = 2 ,
105
+ num_procs_per_worker = 2 ,
106
+ resources_per_worker = {"cpu" : "2" },
107
+ ),
108
+ )
109
+
110
+ experiment = katib_client .wait_for_experiment_condition (
111
+ exp_name , exp_namespace , timeout = EXPERIMENT_TIMEOUT
112
+ )
113
+
114
+ # Verify the Experiment results.
115
+ verify_experiment_results (katib_client , experiment , exp_name , exp_namespace )
116
+
117
+ # Print the Experiment and Suggestion.
118
+ logging .debug (katib_client .get_experiment (exp_name , exp_namespace ))
119
+ logging .debug (katib_client .get_suggestion (exp_name , exp_namespace ))
120
+
121
+
61
122
if __name__ == "__main__" :
62
123
parser = argparse .ArgumentParser ()
63
124
parser .add_argument (
64
- "--namespace" , type = str , required = True , help = "Namespace for the Katib E2E test" ,
125
+ "--namespace" ,
126
+ type = str ,
127
+ required = True ,
128
+ help = "Namespace for the Katib E2E test" ,
65
129
)
66
130
parser .add_argument (
67
- "--verbose" , action = "store_true" , help = "Verbose output for the Katib E2E test" ,
131
+ "--verbose" ,
132
+ action = "store_true" ,
133
+ help = "Verbose output for the Katib E2E test" ,
68
134
)
69
135
args = parser .parse_args ()
70
136
@@ -74,20 +140,49 @@ def objective(parameters):
74
140
katib_client = KatibClient ()
75
141
76
142
namespace_labels = client .CoreV1Api ().read_namespace (args .namespace ).metadata .labels
77
- if 'katib.kubeflow.org/metrics-collector-injection' not in namespace_labels :
78
- namespace_labels ['katib.kubeflow.org/metrics-collector-injection' ] = 'enabled'
79
- client .CoreV1Api ().patch_namespace (args .namespace , {'metadata' : {'labels' : namespace_labels }})
143
+ if "katib.kubeflow.org/metrics-collector-injection" not in namespace_labels :
144
+ namespace_labels ["katib.kubeflow.org/metrics-collector-injection" ] = "enabled"
145
+ client .CoreV1Api ().patch_namespace (
146
+ args .namespace , {"metadata" : {"labels" : namespace_labels }}
147
+ )
80
148
81
149
# Test with run_e2e_experiment_create_by_tune
82
150
exp_name = "tune-example"
83
151
exp_namespace = args .namespace
84
152
try :
85
153
run_e2e_experiment_create_by_tune (katib_client , exp_name , exp_namespace )
86
154
logging .info ("---------------------------------------------------------------" )
87
- logging .info (f"E2E is succeeded for Experiment created by tune: { exp_namespace } /{ exp_name } " )
155
+ logging .info (
156
+ f"E2E is succeeded for Experiment created by tune: { exp_namespace } /{ exp_name } "
157
+ )
158
+ except Exception as e :
159
+ logging .info ("---------------------------------------------------------------" )
160
+ logging .info (
161
+ f"E2E is failed for Experiment created by tune: { exp_namespace } /{ exp_name } "
162
+ )
163
+ raise e
164
+ finally :
165
+ # Delete the Experiment.
166
+ logging .info ("---------------------------------------------------------------" )
167
+ logging .info ("---------------------------------------------------------------" )
168
+ katib_client .delete_experiment (exp_name , exp_namespace )
169
+
170
+ # Test with run_e2e_experiment_create_by_tune_pytorchjob
171
+ exp_name = "tune-example-pytorchjob"
172
+ exp_namespace = args .namespace
173
+ try :
174
+ run_e2e_experiment_create_by_tune_pytorchjob (
175
+ katib_client , exp_name , exp_namespace
176
+ )
177
+ logging .info ("---------------------------------------------------------------" )
178
+ logging .info (
179
+ f"E2E is succeeded for Experiment created by tune with PyTorchJob: { exp_namespace } /{ exp_name } "
180
+ )
88
181
except Exception as e :
89
182
logging .info ("---------------------------------------------------------------" )
90
- logging .info (f"E2E is failed for Experiment created by tune: { exp_namespace } /{ exp_name } " )
183
+ logging .info (
184
+ f"E2E is failed for Experiment created by tune with PyTorchJob: { exp_namespace } /{ exp_name } "
185
+ )
91
186
raise e
92
187
finally :
93
188
# Delete the Experiment.
0 commit comments