22import  logging 
33
44from  kubeflow .katib  import  KatibClient , search 
5+ from  kubeflow .katib .types .types  import  TrainerResources 
56from  kubernetes  import  client 
67from  verify  import  verify_experiment_results 
78
@@ -19,21 +20,19 @@ def run_e2e_experiment_create_by_tune(
1920):
2021    # Create Katib Experiment and wait until it is finished. 
2122    logging .debug ("Creating Experiment: {}/{}" .format (exp_namespace , exp_name ))
22-      
23+ 
2324    # Use the test case from get-started tutorial. 
2425    # https://www.kubeflow.org/docs/components/katib/getting-started/#getting-started-with-katib-python-sdk 
2526    # [1] Create an objective function. 
2627    def  objective (parameters ):
2728        import  time 
29+ 
2830        time .sleep (5 )
2931        result  =  4  *  int (parameters ["a" ]) -  float (parameters ["b" ]) **  2 
3032        print (f"result={ result }  )
31-      
33+ 
3234    # [2] Create hyperparameter search space. 
33-     parameters  =  {
34-         "a" : search .int (min = 10 , max = 20 ),
35-         "b" : search .double (min = 0.1 , max = 0.2 )
36-     }
35+     parameters  =  {"a" : search .int (min = 10 , max = 20 ), "b" : search .double (min = 0.1 , max = 0.2 )}
3736
3837    # [3] Create Katib Experiment with 4 Trials and 2 CPUs per Trial. 
3938    # And Wait until Experiment reaches Succeeded condition. 
@@ -58,13 +57,80 @@ def objective(parameters):
5857    logging .debug (katib_client .get_suggestion (exp_name , exp_namespace ))
5958
6059
60+ def  run_e2e_experiment_create_by_tune_pytorchjob (
61+     katib_client : KatibClient ,
62+     exp_name : str ,
63+     exp_namespace : str ,
64+ ):
65+     # Create Katib Experiment and wait until it is finished. 
66+     logging .debug ("Creating Experiment: {}/{}" .format (exp_namespace , exp_name ))
67+ 
68+     # Verify the PyTorchJob distributed. 
69+     def  objective (parameters ):
70+         import  os 
71+         import  time 
72+ 
73+         import  torch .distributed  as  dist 
74+ 
75+         # Setup PyTorch distributed. 
76+         dist .init_process_group (backend = "gloo" )
77+ 
78+         print (
79+             "PyTorch Dist. WORLD_SIZE: {}, RANK: {}, LOCAL_RANK: {}" .format (
80+                 dist .get_world_size (), dist .get_rank (), os .getenv ("LOCAL_RANK" )
81+             )
82+         )
83+ 
84+         time .sleep (5 )
85+         # Only get results from the process with RANK=0. 
86+         if  dist .get_rank () ==  0 :
87+             result  =  4  *  int (parameters ["a" ]) -  float (parameters ["b" ]) **  2 
88+             print (f"result={ result }  )
89+         dist .destroy_process_group ()
90+ 
91+     # Create Katib Experiment with 3 Trials. Every Trial runs PyTorchJob with 2 workers. 
92+     katib_client .tune (
93+         name = exp_name ,
94+         namespace = exp_namespace ,
95+         objective = objective ,
96+         parameters = {
97+             "a" : search .int (min = 10 , max = 20 ),
98+             "b" : search .double (min = 0.1 , max = 0.2 ),
99+         },
100+         objective_metric_name = "result" ,
101+         max_trial_count = 3 ,
102+         parallel_trial_count = 2 ,
103+         resources_per_trial = TrainerResources (
104+             num_workers = 2 ,
105+             num_procs_per_worker = 2 ,
106+             resources_per_worker = {"cpu" : "2" },
107+         ),
108+     )
109+ 
110+     experiment  =  katib_client .wait_for_experiment_condition (
111+         exp_name , exp_namespace , timeout = EXPERIMENT_TIMEOUT 
112+     )
113+ 
114+     # Verify the Experiment results. 
115+     verify_experiment_results (katib_client , experiment , exp_name , exp_namespace )
116+ 
117+     # Print the Experiment and Suggestion. 
118+     logging .debug (katib_client .get_experiment (exp_name , exp_namespace ))
119+     logging .debug (katib_client .get_suggestion (exp_name , exp_namespace ))
120+ 
121+ 
61122if  __name__  ==  "__main__" :
62123    parser  =  argparse .ArgumentParser ()
63124    parser .add_argument (
64-         "--namespace" , type = str , required = True , help = "Namespace for the Katib E2E test" ,
125+         "--namespace" ,
126+         type = str ,
127+         required = True ,
128+         help = "Namespace for the Katib E2E test" ,
65129    )
66130    parser .add_argument (
67-         "--verbose" , action = "store_true" , help = "Verbose output for the Katib E2E test" ,
131+         "--verbose" ,
132+         action = "store_true" ,
133+         help = "Verbose output for the Katib E2E test" ,
68134    )
69135    args  =  parser .parse_args ()
70136
@@ -74,20 +140,49 @@ def objective(parameters):
74140    katib_client  =  KatibClient ()
75141
76142    namespace_labels  =  client .CoreV1Api ().read_namespace (args .namespace ).metadata .labels 
77-     if  'katib.kubeflow.org/metrics-collector-injection'  not  in namespace_labels :
78-         namespace_labels ['katib.kubeflow.org/metrics-collector-injection' ] =  'enabled' 
79-         client .CoreV1Api ().patch_namespace (args .namespace , {'metadata' : {'labels' : namespace_labels }})
143+     if  "katib.kubeflow.org/metrics-collector-injection"  not  in namespace_labels :
144+         namespace_labels ["katib.kubeflow.org/metrics-collector-injection" ] =  "enabled" 
145+         client .CoreV1Api ().patch_namespace (
146+             args .namespace , {"metadata" : {"labels" : namespace_labels }}
147+         )
80148
81149    # Test with run_e2e_experiment_create_by_tune 
82150    exp_name  =  "tune-example" 
83151    exp_namespace  =  args .namespace 
84152    try :
85153        run_e2e_experiment_create_by_tune (katib_client , exp_name , exp_namespace )
86154        logging .info ("---------------------------------------------------------------" )
87-         logging .info (f"E2E is succeeded for Experiment created by tune: { exp_namespace } { exp_name }  )
155+         logging .info (
156+             f"E2E is succeeded for Experiment created by tune: { exp_namespace } { exp_name }  
157+         )
158+     except  Exception  as  e :
159+         logging .info ("---------------------------------------------------------------" )
160+         logging .info (
161+             f"E2E is failed for Experiment created by tune: { exp_namespace } { exp_name }  
162+         )
163+         raise  e 
164+     finally :
165+         # Delete the Experiment. 
166+         logging .info ("---------------------------------------------------------------" )
167+         logging .info ("---------------------------------------------------------------" )
168+         katib_client .delete_experiment (exp_name , exp_namespace )
169+ 
170+     # Test with run_e2e_experiment_create_by_tune_pytorchjob 
171+     exp_name  =  "tune-example-pytorchjob" 
172+     exp_namespace  =  args .namespace 
173+     try :
174+         run_e2e_experiment_create_by_tune_pytorchjob (
175+             katib_client , exp_name , exp_namespace 
176+         )
177+         logging .info ("---------------------------------------------------------------" )
178+         logging .info (
179+             f"E2E is succeeded for Experiment created by tune with PyTorchJob: { exp_namespace } { exp_name }  
180+         )
88181    except  Exception  as  e :
89182        logging .info ("---------------------------------------------------------------" )
90-         logging .info (f"E2E is failed for Experiment created by tune: { exp_namespace } { exp_name }  )
183+         logging .info (
184+             f"E2E is failed for Experiment created by tune with PyTorchJob: { exp_namespace } { exp_name }  
185+         )
91186        raise  e 
92187    finally :
93188        # Delete the Experiment. 
0 commit comments