Skip to content

Commit 238fda3

Browse files
committed
Add e2e test for PyTorchJob as Trial
Signed-off-by: Andrey Velichkevich <[email protected]>
1 parent e04213f commit 238fda3

File tree

2 files changed

+109
-14
lines changed

2 files changed

+109
-14
lines changed

test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py

Lines changed: 108 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import logging
33

44
from kubeflow.katib import KatibClient, search
5+
from kubeflow.katib.types.types import TrainerResources
56
from kubernetes import client
67
from verify import verify_experiment_results
78

@@ -19,21 +20,19 @@ def run_e2e_experiment_create_by_tune(
1920
):
2021
# Create Katib Experiment and wait until it is finished.
2122
logging.debug("Creating Experiment: {}/{}".format(exp_namespace, exp_name))
22-
23+
2324
# Use the test case from get-started tutorial.
2425
# https://www.kubeflow.org/docs/components/katib/getting-started/#getting-started-with-katib-python-sdk
2526
# [1] Create an objective function.
2627
def objective(parameters):
2728
import time
29+
2830
time.sleep(5)
2931
result = 4 * int(parameters["a"]) - float(parameters["b"]) ** 2
3032
print(f"result={result}")
31-
33+
3234
# [2] Create hyperparameter search space.
33-
parameters = {
34-
"a": search.int(min=10, max=20),
35-
"b": search.double(min=0.1, max=0.2)
36-
}
35+
parameters = {"a": search.int(min=10, max=20), "b": search.double(min=0.1, max=0.2)}
3736

3837
# [3] Create Katib Experiment with 4 Trials and 2 CPUs per Trial.
3938
# And Wait until Experiment reaches Succeeded condition.
@@ -58,13 +57,80 @@ def objective(parameters):
5857
logging.debug(katib_client.get_suggestion(exp_name, exp_namespace))
5958

6059

60+
def run_e2e_experiment_create_by_tune_pytorchjob(
61+
katib_client: KatibClient,
62+
exp_name: str,
63+
exp_namespace: str,
64+
):
65+
# Create Katib Experiment and wait until it is finished.
66+
logging.debug("Creating Experiment: {}/{}".format(exp_namespace, exp_name))
67+
68+
# Verify the PyTorchJob distributed.
69+
def objective(parameters):
70+
import os
71+
import time
72+
73+
import torch.distributed as dist
74+
75+
# Setup PyTorch distributed.
76+
dist.init_process_group(backend="gloo")
77+
78+
print(
79+
"PyTorch Dist. WORLD_SIZE: {}, RANK: {}, LOCAL_RANK: {}".format(
80+
dist.get_world_size(), dist.get_rank(), os.getenv("LOCAL_RANK")
81+
)
82+
)
83+
84+
time.sleep(5)
85+
# Only get results from the process with RANK=0.
86+
if dist.get_rank() == 0:
87+
result = 4 * int(parameters["a"]) - float(parameters["b"]) ** 2
88+
print(f"result={result}")
89+
dist.destroy_process_group()
90+
91+
# Create Katib Experiment with 3 Trials. Every Trial runs PyTorchJob with 2 workers.
92+
katib_client.tune(
93+
name=exp_name,
94+
namespace=exp_namespace,
95+
objective=objective,
96+
parameters={
97+
"a": search.int(min=10, max=20),
98+
"b": search.double(min=0.1, max=0.2),
99+
},
100+
objective_metric_name="result",
101+
max_trial_count=3,
102+
parallel_trial_count=2,
103+
resources_per_trial=TrainerResources(
104+
num_workers=2,
105+
num_procs_per_worker=2,
106+
resources_per_worker={"cpu": "2"},
107+
),
108+
)
109+
110+
experiment = katib_client.wait_for_experiment_condition(
111+
exp_name, exp_namespace, timeout=EXPERIMENT_TIMEOUT
112+
)
113+
114+
# Verify the Experiment results.
115+
verify_experiment_results(katib_client, experiment, exp_name, exp_namespace)
116+
117+
# Print the Experiment and Suggestion.
118+
logging.debug(katib_client.get_experiment(exp_name, exp_namespace))
119+
logging.debug(katib_client.get_suggestion(exp_name, exp_namespace))
120+
121+
61122
if __name__ == "__main__":
62123
parser = argparse.ArgumentParser()
63124
parser.add_argument(
64-
"--namespace", type=str, required=True, help="Namespace for the Katib E2E test",
125+
"--namespace",
126+
type=str,
127+
required=True,
128+
help="Namespace for the Katib E2E test",
65129
)
66130
parser.add_argument(
67-
"--verbose", action="store_true", help="Verbose output for the Katib E2E test",
131+
"--verbose",
132+
action="store_true",
133+
help="Verbose output for the Katib E2E test",
68134
)
69135
args = parser.parse_args()
70136

@@ -74,20 +140,49 @@ def objective(parameters):
74140
katib_client = KatibClient()
75141

76142
namespace_labels = client.CoreV1Api().read_namespace(args.namespace).metadata.labels
77-
if 'katib.kubeflow.org/metrics-collector-injection' not in namespace_labels:
78-
namespace_labels['katib.kubeflow.org/metrics-collector-injection'] = 'enabled'
79-
client.CoreV1Api().patch_namespace(args.namespace, {'metadata': {'labels': namespace_labels}})
143+
if "katib.kubeflow.org/metrics-collector-injection" not in namespace_labels:
144+
namespace_labels["katib.kubeflow.org/metrics-collector-injection"] = "enabled"
145+
client.CoreV1Api().patch_namespace(
146+
args.namespace, {"metadata": {"labels": namespace_labels}}
147+
)
80148

81149
# Test with run_e2e_experiment_create_by_tune
82150
exp_name = "tune-example"
83151
exp_namespace = args.namespace
84152
try:
85153
run_e2e_experiment_create_by_tune(katib_client, exp_name, exp_namespace)
86154
logging.info("---------------------------------------------------------------")
87-
logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name}")
155+
logging.info(
156+
f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name}"
157+
)
158+
except Exception as e:
159+
logging.info("---------------------------------------------------------------")
160+
logging.info(
161+
f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}"
162+
)
163+
raise e
164+
finally:
165+
# Delete the Experiment.
166+
logging.info("---------------------------------------------------------------")
167+
logging.info("---------------------------------------------------------------")
168+
katib_client.delete_experiment(exp_name, exp_namespace)
169+
170+
# Test with run_e2e_experiment_create_by_tune_pytorchjob
171+
exp_name = "tune-example-pytorchjob"
172+
exp_namespace = args.namespace
173+
try:
174+
run_e2e_experiment_create_by_tune_pytorchjob(
175+
katib_client, exp_name, exp_namespace
176+
)
177+
logging.info("---------------------------------------------------------------")
178+
logging.info(
179+
f"E2E is succeeded for Experiment created by tune with PyTorchJob: {exp_namespace}/{exp_name}"
180+
)
88181
except Exception as e:
89182
logging.info("---------------------------------------------------------------")
90-
logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}")
183+
logging.info(
184+
f"E2E is failed for Experiment created by tune with PyTorchJob: {exp_namespace}/{exp_name}"
185+
)
91186
raise e
92187
finally:
93188
# Delete the Experiment.

test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ DEPLOY_TRAINING_OPERATOR=${2:-false}
2525
WITH_DATABASE_TYPE=${3:-mysql}
2626

2727
E2E_TEST_IMAGE_TAG="e2e-test"
28-
TRAINING_OPERATOR_VERSION="v1.6.0-rc.0"
28+
TRAINING_OPERATOR_VERSION="v1.9.0"
2929

3030
echo "Start to install Katib"
3131

0 commit comments

Comments
 (0)