Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Add unit test for tune api #2410

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
2a882d7
update tune api for llm hyperparameters optimization
helenxie-bit Jul 21, 2024
0c3e067
resolve conflict
helenxie-bit Jul 21, 2024
158c8f3
resolve conflict
helenxie-bit Jul 21, 2024
f4a0d4e
fix the problem of dependency
helenxie-bit Jul 21, 2024
7e7dd56
fix the format of import statement
helenxie-bit Jul 21, 2024
62ad385
adjust the blank lines
helenxie-bit Jul 21, 2024
3f36740
delete the trainer to reuse it in Training Operator
helenxie-bit Jul 22, 2024
9d20253
update constants
helenxie-bit Jul 22, 2024
dfbe793
update metrics format
helenxie-bit Jul 25, 2024
290a249
update the type of and
helenxie-bit Jul 29, 2024
aba2606
update the message of 'ImportError'
helenxie-bit Jul 29, 2024
eaf0193
add TODO of PVC creation
helenxie-bit Jul 29, 2024
62355a2
update the name of pvc
helenxie-bit Jul 29, 2024
7b2b40e
reuse constants from Training Operator
helenxie-bit Jul 29, 2024
acd1dcf
keep 'parameters' and update validation
helenxie-bit Jul 30, 2024
10b057d
update for test
helenxie-bit Jul 31, 2024
5a87eb0
reuse 'get_container_spec' and 'get_pod_template_spec' from Training …
helenxie-bit Aug 7, 2024
8387e67
resolve conflicts
helenxie-bit Aug 7, 2024
71605b4
format with black
helenxie-bit Aug 7, 2024
35acedb
fix Lint error
helenxie-bit Aug 7, 2024
af534b3
fix Lint errors
helenxie-bit Aug 7, 2024
c7f6e10
delete types
helenxie-bit Aug 7, 2024
9fdbdb7
fix format
helenxie-bit Aug 7, 2024
ddd5153
update format
helenxie-bit Aug 7, 2024
b31e820
update format
helenxie-bit Aug 7, 2024
dad3831
fix e2e test error
helenxie-bit Aug 7, 2024
1afe56d
add TODO
helenxie-bit Aug 8, 2024
ad7bce8
format with max line length
helenxie-bit Aug 8, 2024
7e58c94
format docstring
helenxie-bit Aug 8, 2024
61dc8ca
update format
helenxie-bit Aug 8, 2024
ba0d7d1
add helper functions
helenxie-bit Aug 8, 2024
2a1b008
update format
helenxie-bit Aug 8, 2024
b368521
update format
helenxie-bit Aug 8, 2024
3ccbdf9
run test again
helenxie-bit Aug 12, 2024
64e34e0
run test again
helenxie-bit Aug 12, 2024
dde724c
run test again
helenxie-bit Aug 12, 2024
1cccd4a
fix dict substitution in training_parameters
helenxie-bit Aug 14, 2024
510661d
fix typo
helenxie-bit Aug 17, 2024
f03c5ba
Merge remote-tracking branch 'origin/master' into helenxie/update_tun…
helenxie-bit Aug 18, 2024
f6b15a2
resolve conflicts and add check for case of no parameters
helenxie-bit Aug 18, 2024
6a3e046
fix format
helenxie-bit Aug 18, 2024
25541b9
fix format
helenxie-bit Aug 18, 2024
99e74d1
fix format
helenxie-bit Aug 18, 2024
96cf99c
fix flake8 error
helenxie-bit Aug 18, 2024
c568806
fix format
helenxie-bit Aug 18, 2024
6f65253
fix format
helenxie-bit Aug 18, 2024
ad17ac9
fix format
helenxie-bit Aug 18, 2024
9a1e2df
fix format
helenxie-bit Aug 18, 2024
421aaa6
add pytorchjob for tune api
helenxie-bit Aug 19, 2024
bab4d92
fix format
helenxie-bit Aug 19, 2024
f11051d
add 'types' module
helenxie-bit Aug 19, 2024
96768bc
add unit test for tune api
helenxie-bit Aug 19, 2024
3edfb49
fix format
helenxie-bit Aug 19, 2024
6662ef7
fix format
helenxie-bit Aug 19, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
737 changes: 598 additions & 139 deletions sdk/python/v1beta1/kubeflow/katib/api/katib_client.py

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions sdk/python/v1beta1/kubeflow/katib/constants/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@


DEFAULT_PRIMARY_CONTAINER_NAME = "training-container"
PYTORCHJOB_PRIMARY_CONTAINER_NAME = "pytorch"

# Label to identify Experiment's resources.
EXPERIMENT_LABEL = "katib.kubeflow.org/experiment"
Expand Down Expand Up @@ -60,3 +61,8 @@
BASE_IMAGE_MXNET = "docker.io/mxnet/python:1.9.1_native_py3"

DEFAULT_DB_MANAGER_ADDRESS = "katib-db-manager.kubeflow:6789"

# The default size of the PVC used for dataset and model storage.
PVC_DEFAULT_SIZE = "10Gi"
# The default value for PVC access modes.
PVC_DEFAULT_ACCESS_MODES = ["ReadWriteOnce", "ReadOnlyMany"]
7 changes: 7 additions & 0 deletions sdk/python/v1beta1/kubeflow/katib/types/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from __future__ import absolute_import

# Import types into type package.
from kubeflow.katib.types.trainer_resources import TrainerResources

# Import Kubernetes models.
from kubernetes.client import *
147 changes: 147 additions & 0 deletions sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
import pprint

from kubeflow.katib.configuration import Configuration
import six


class TrainerResources(object):
    """Resource configuration for distributed training.

    Holds the number of workers, the number of processes per worker, and the
    compute resources allocated to each worker. Mirrors the conventions of the
    OpenAPI-generated Katib models so it can be serialized alongside them.
    """

    # Public attributes included in to_dict() and equality checks.
    # local_vars_configuration is deliberately excluded: it is a runtime
    # Configuration object, not part of the resource specification.
    attribute_names = ("num_workers", "num_procs_per_worker", "resources_per_worker")

    def __init__(
        self,
        num_workers=None,
        num_procs_per_worker=None,
        resources_per_worker=None,
        local_vars_configuration=None,
    ):
        if local_vars_configuration is None:
            local_vars_configuration = Configuration()
        self.local_vars_configuration = local_vars_configuration

        self._num_workers = None
        self._num_procs_per_worker = None
        self._resources_per_worker = None

        if num_workers is not None:
            self.num_workers = num_workers
        if num_procs_per_worker is not None:
            self.num_procs_per_worker = num_procs_per_worker
        if resources_per_worker is not None:
            self.resources_per_worker = resources_per_worker

    @property
    def num_workers(self):
        """Gets the number of workers of distributed training.

        :return: The number of workers of distributed training.
        :rtype: int
        """
        return self._num_workers

    @num_workers.setter
    def num_workers(self, num_workers):
        """Sets the number of workers of distributed training.

        :param num_workers: The number of workers of distributed training.
        :type: int
        """
        self._num_workers = num_workers

    @property
    def num_procs_per_worker(self):
        """Gets the number of processes per worker of distributed training.

        :return: The number of processes per worker of distributed training.
        :rtype: int
        """
        return self._num_procs_per_worker

    @num_procs_per_worker.setter
    def num_procs_per_worker(self, num_procs_per_worker):
        """Sets the number of processes per worker of distributed training.

        :param num_procs_per_worker: The number of processes per worker of distributed training.
        :type: int
        """
        self._num_procs_per_worker = num_procs_per_worker

    @property
    def resources_per_worker(self):
        """Gets the resources per worker of distributed training.

        :return: The resources per worker of distributed training.
        :rtype: dict or V1ResourceRequirements
        """
        return self._resources_per_worker

    @resources_per_worker.setter
    def resources_per_worker(self, resources_per_worker):
        """Sets the resources per worker of distributed training.

        :param resources_per_worker: The resources per worker of distributed training.
        :type: dict or V1ResourceRequirements
        """
        self._resources_per_worker = resources_per_worker

    @staticmethod
    def _serialize(value):
        """Recursively convert a value into plain Python containers."""
        if isinstance(value, list):
            return [TrainerResources._serialize(item) for item in value]
        if hasattr(value, "to_dict"):
            return value.to_dict()
        if isinstance(value, dict):
            return {k: TrainerResources._serialize(v) for k, v in value.items()}
        return value

    def to_dict(self):
        """Returns the resources properties as a dict.

        Keys are the public attribute names (e.g. ``num_workers``). The
        previous implementation iterated ``self.__dict__``, which leaked the
        private ``_``-prefixed keys and the ``local_vars_configuration``
        object into the result (and broke ``__eq__`` between otherwise-equal
        instances holding distinct Configuration objects).
        """
        return {
            attr: self._serialize(getattr(self, attr))
            for attr in self.attribute_names
        }

    def to_str(self):
        """Returns the string representation of the model"""
        return pprint.pformat(self.to_dict())

    def __repr__(self):
        """For `print` and `pprint`"""
        return self.to_str()

    def __eq__(self, other):
        """Returns true if both objects are equal"""
        if not isinstance(other, TrainerResources):
            return False

        return self.to_dict() == other.to_dict()

    def __ne__(self, other):
        """Returns true if both objects are not equal"""
        if not isinstance(other, TrainerResources):
            return True

        return self.to_dict() != other.to_dict()
182 changes: 180 additions & 2 deletions sdk/python/v1beta1/kubeflow/katib/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import json
import os
import textwrap
from typing import Any, Callable
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

from kubeflow.katib import models
from kubeflow.katib.constants import constants
Expand Down Expand Up @@ -85,7 +85,6 @@ def validate_metrics_value(value: Any):


def validate_objective_function(objective: Callable):

# Check if objective function is callable.
if not callable(objective):
raise ValueError(
Expand Down Expand Up @@ -129,3 +128,182 @@ class FakeResponse:

def __init__(self, obj):
self.data = json.dumps(obj)


def get_command_using_train_func(
    train_func: Optional[Callable],
    train_func_parameters: Optional[Dict[str, Any]] = None,
    packages_to_install: Optional[List[str]] = None,
    pip_index_url: str = "https://pypi.org/simple",
) -> Tuple[List[str], List[str]]:
    """
    Get container args and command from the given training function and parameters.
    """
    # The training function must be callable so its source can be extracted.
    if not callable(train_func):
        raise ValueError(
            f"Training function must be callable, got function type: {type(train_func)}"
        )

    # Extract the function implementation. The function might be defined in an
    # indented scope (e.g. inside another function), so dedent its source.
    func_source = textwrap.dedent(inspect.getsource(train_func))

    # Append a call so the generated script actually executes the function:
    #   def train(parameters):
    #       print('Start Training...')
    #   train({'lr': 0.01})
    call_arguments = (
        "" if train_func_parameters is None else f"{train_func_parameters}"
    )
    func_source = f"{func_source}\n{train_func.__name__}({call_arguments})\n"

    # Template that writes the function source to a temp file and runs it.
    script_template = textwrap.dedent(
        """
        program_path=$(mktemp -d)
        read -r -d '' SCRIPT << EOM\n
        {func_code}
        EOM
        printf "%s" \"$SCRIPT\" > \"$program_path/ephemeral_script.py\"
        python3 -u \"$program_path/ephemeral_script.py\""""
    )

    # Substitute the function source into the execute script.
    exec_script = script_template.format(func_code=func_source)

    # Prepend pip installation commands when extra packages are requested.
    if packages_to_install is not None:
        exec_script = (
            get_script_for_python_packages(packages_to_install, pip_index_url)
            + exec_script
        )

    # Container command and args that execute the training function.
    return ["bash", "-c"], [exec_script]


def get_container_spec(
    name: str,
    base_image: str,
    train_func: Optional[Callable] = None,
    train_func_parameters: Optional[Dict[str, Any]] = None,
    packages_to_install: Optional[List[str]] = None,
    pip_index_url: str = "https://pypi.org/simple",
    args: Optional[List[str]] = None,
    resources: Union[dict, models.V1ResourceRequirements, None] = None,
    volume_mounts: Optional[List[models.V1VolumeMount]] = None,
    env: Optional[List[models.V1EnvVar]] = None,
    env_from: Optional[List[models.V1EnvFromSource]] = None,
) -> models.V1Container:
    """
    Get container spec for the given parameters.
    """

    # Both the container name and the base image are mandatory.
    if name is None or base_image is None:
        raise ValueError("Container name or base image cannot be none")

    # Start from a minimal container definition.
    container_spec = models.V1Container(
        name=name, image=base_image, args=args, volume_mounts=volume_mounts
    )

    # When a training function is given, its generated launch script replaces
    # the container command and args.
    if train_func is not None:
        container_spec.command, container_spec.args = get_command_using_train_func(
            train_func=train_func,
            train_func_parameters=train_func_parameters,
            packages_to_install=packages_to_install,
            pip_index_url=pip_index_url,
        )

    # A plain dict of resources is normalized into V1ResourceRequirements:
    # keys are lowercased and "gpu" is mapped to the "nvidia.com/gpu" resource.
    if isinstance(resources, dict):
        normalized = {key.lower(): quantity for key, quantity in resources.items()}
        if "gpu" in normalized:
            normalized["nvidia.com/gpu"] = normalized.pop("gpu")

        resources = models.V1ResourceRequirements(
            requests=normalized,
            limits=normalized,
        )

    # Attach resources (possibly None) to the container spec.
    container_spec.resources = resources

    # Attach environment variables when provided.
    if env:
        container_spec.env = env
    if env_from:
        container_spec.env_from = env_from

    return container_spec


def get_pod_template_spec(
    containers: List[models.V1Container],
    init_containers: Optional[List[models.V1Container]] = None,
    volumes: Optional[List[models.V1Volume]] = None,
    restart_policy: Optional[str] = None,
) -> models.V1PodTemplateSpec:
    """
    Get Pod template spec for the given parameters.
    """

    # Build and return the Pod template directly. None-valued parameters are
    # simply omitted from the resulting Pod. Istio sidecar injection is
    # disabled so the training Pod can complete and be collected.
    return models.V1PodTemplateSpec(
        metadata=models.V1ObjectMeta(annotations={"sidecar.istio.io/inject": "false"}),
        spec=models.V1PodSpec(
            init_containers=init_containers,
            containers=containers,
            volumes=volumes,
            restart_policy=restart_policy,
        ),
    )


def get_pvc_spec(
    pvc_name: str,
    namespace: str,
    storage_config: Dict[str, Optional[Union[str, List[str]]]],
):
    """Get the PersistentVolumeClaim spec for dataset and model storage.

    :param pvc_name: Name of the PVC to create.
    :param namespace: Namespace the PVC belongs to.
    :param storage_config: Optional keys ``size``, ``access_modes``, and
        ``storage_class``. Missing ``size``/``access_modes`` fall back to
        ``constants.PVC_DEFAULT_SIZE`` / ``constants.PVC_DEFAULT_ACCESS_MODES``.
    :return: The assembled ``models.V1PersistentVolumeClaim``.
    :raises ValueError: If ``pvc_name`` or ``namespace`` is None.
    """
    if pvc_name is None or namespace is None:
        raise ValueError("One of the required storage config argument is None")

    # Resolve defaults locally instead of writing them back into the caller's
    # storage_config dict (the previous implementation mutated its argument).
    size = storage_config["size"] if "size" in storage_config else constants.PVC_DEFAULT_SIZE
    access_modes = (
        storage_config["access_modes"]
        if "access_modes" in storage_config
        else constants.PVC_DEFAULT_ACCESS_MODES
    )

    pvc_spec = models.V1PersistentVolumeClaim(
        api_version="v1",
        kind="PersistentVolumeClaim",
        # Use a typed V1ObjectMeta, consistent with the other helpers in this
        # module, rather than a raw dict.
        metadata=models.V1ObjectMeta(name=pvc_name, namespace=namespace),
        spec=models.V1PersistentVolumeClaimSpec(
            access_modes=access_modes,
            resources=models.V1ResourceRequirements(requests={"storage": size}),
        ),
    )

    if "storage_class" in storage_config:
        pvc_spec.spec.storage_class_name = storage_config["storage_class"]

    return pvc_spec


class SetEncoder(json.JSONEncoder):
    """JSON encoder that serializes sets as lists and classes by their name."""

    def default(self, obj):
        # Sets are not JSON-serializable; emit them as plain lists.
        if isinstance(obj, set):
            return list(obj)
        # Emit classes (e.g. parameter types) as their bare name.
        if isinstance(obj, type):
            return obj.__name__
        # Defer everything else to the base encoder (raises TypeError).
        return super().default(obj)
3 changes: 3 additions & 0 deletions sdk/python/v1beta1/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,4 +69,7 @@
"Topic :: Software Development :: Libraries :: Python Modules",
],
install_requires=REQUIRES,
extras_require={
"huggingface": ["kubeflow-training[huggingface]"],
},
)
Loading
Loading