Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Add unit test for tune api #2410

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
2a882d7
update tune api for llm hyperparameters optimization
helenxie-bit Jul 21, 2024
0c3e067
resolve conflict
helenxie-bit Jul 21, 2024
158c8f3
resolve conflict
helenxie-bit Jul 21, 2024
f4a0d4e
fix the problem of dependency
helenxie-bit Jul 21, 2024
7e7dd56
fix the format of import statement
helenxie-bit Jul 21, 2024
62ad385
adjust the blank lines
helenxie-bit Jul 21, 2024
3f36740
delete the trainer to reuse it in Training Operator
helenxie-bit Jul 22, 2024
9d20253
update constants
helenxie-bit Jul 22, 2024
dfbe793
update metrics format
helenxie-bit Jul 25, 2024
290a249
update the type of and
helenxie-bit Jul 29, 2024
aba2606
update the message of 'ImportError'
helenxie-bit Jul 29, 2024
eaf0193
add TODO of PVC creation
helenxie-bit Jul 29, 2024
62355a2
update the name of pvc
helenxie-bit Jul 29, 2024
7b2b40e
reuse constants from Training Operator
helenxie-bit Jul 29, 2024
acd1dcf
keep 'parameters' and update validation
helenxie-bit Jul 30, 2024
10b057d
update for test
helenxie-bit Jul 31, 2024
5a87eb0
reuse 'get_container_spec' and 'get_pod_template_spec' from Training …
helenxie-bit Aug 7, 2024
8387e67
resolve conflicts
helenxie-bit Aug 7, 2024
71605b4
format with black
helenxie-bit Aug 7, 2024
35acedb
fix Lint error
helenxie-bit Aug 7, 2024
af534b3
fix Lint errors
helenxie-bit Aug 7, 2024
c7f6e10
delete types
helenxie-bit Aug 7, 2024
9fdbdb7
fix format
helenxie-bit Aug 7, 2024
ddd5153
update format
helenxie-bit Aug 7, 2024
b31e820
update format
helenxie-bit Aug 7, 2024
dad3831
fix e2e test error
helenxie-bit Aug 7, 2024
1afe56d
add TODO
helenxie-bit Aug 8, 2024
ad7bce8
format with max line length
helenxie-bit Aug 8, 2024
7e58c94
format docstring
helenxie-bit Aug 8, 2024
61dc8ca
update format
helenxie-bit Aug 8, 2024
ba0d7d1
add helper functions
helenxie-bit Aug 8, 2024
2a1b008
update format
helenxie-bit Aug 8, 2024
b368521
update format
helenxie-bit Aug 8, 2024
3ccbdf9
run test again
helenxie-bit Aug 12, 2024
64e34e0
run test again
helenxie-bit Aug 12, 2024
dde724c
run test again
helenxie-bit Aug 12, 2024
1cccd4a
fix dict substitution in training_parameters
helenxie-bit Aug 14, 2024
510661d
fix typo
helenxie-bit Aug 17, 2024
f03c5ba
Merge remote-tracking branch 'origin/master' into helenxie/update_tun…
helenxie-bit Aug 18, 2024
f6b15a2
resolve conflicts and add check for case of no parameters
helenxie-bit Aug 18, 2024
6a3e046
fix format
helenxie-bit Aug 18, 2024
25541b9
fix format
helenxie-bit Aug 18, 2024
99e74d1
fix format
helenxie-bit Aug 18, 2024
96cf99c
fix flake8 error
helenxie-bit Aug 18, 2024
c568806
fix format
helenxie-bit Aug 18, 2024
6f65253
fix format
helenxie-bit Aug 18, 2024
ad17ac9
fix format
helenxie-bit Aug 18, 2024
9a1e2df
fix format
helenxie-bit Aug 18, 2024
421aaa6
add pytorchjob for tune api
helenxie-bit Aug 19, 2024
bab4d92
fix format
helenxie-bit Aug 19, 2024
f11051d
add 'types' module
helenxie-bit Aug 19, 2024
96768bc
add unit test for tune api
helenxie-bit Aug 19, 2024
3edfb49
fix format
helenxie-bit Aug 19, 2024
6662ef7
fix format
helenxie-bit Aug 19, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
737 changes: 598 additions & 139 deletions sdk/python/v1beta1/kubeflow/katib/api/katib_client.py

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions sdk/python/v1beta1/kubeflow/katib/constants/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@


DEFAULT_PRIMARY_CONTAINER_NAME = "training-container"
PYTORCHJOB_PRIMARY_CONTAINER_NAME = "pytorch"

# Label to identify Experiment's resources.
EXPERIMENT_LABEL = "katib.kubeflow.org/experiment"
Expand Down Expand Up @@ -60,3 +61,8 @@
BASE_IMAGE_MXNET = "docker.io/mxnet/python:1.9.1_native_py3"

DEFAULT_DB_MANAGER_ADDRESS = "katib-db-manager.kubeflow:6789"

# The default size of the PVC used for dataset and model storage.
PVC_DEFAULT_SIZE = "10Gi"
# The default value for PVC access modes.
PVC_DEFAULT_ACCESS_MODES = ["ReadWriteOnce", "ReadOnlyMany"]
7 changes: 7 additions & 0 deletions sdk/python/v1beta1/kubeflow/katib/types/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from __future__ import absolute_import

# Import types into type package.
from kubeflow.katib.types.trainer_resources import TrainerResources

# Import Kubernetes models.
from kubernetes.client import *
147 changes: 147 additions & 0 deletions sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
import pprint

from kubeflow.katib.configuration import Configuration
import six


class TrainerResources(object):
    """Resource configuration for distributed training.

    Holds the number of workers, the number of processes per worker, and the
    compute resources allocated to each worker. Mirrors the conventions of the
    OpenAPI-generated Katib models so it can be serialized alongside them.
    """

    # Public attributes included in to_dict() and equality checks.
    # local_vars_configuration is deliberately excluded: it is a runtime
    # Configuration object, not part of the resource specification.
    attribute_names = ("num_workers", "num_procs_per_worker", "resources_per_worker")

    def __init__(
        self,
        num_workers=None,
        num_procs_per_worker=None,
        resources_per_worker=None,
        local_vars_configuration=None,
    ):
        if local_vars_configuration is None:
            local_vars_configuration = Configuration()
        self.local_vars_configuration = local_vars_configuration

        self._num_workers = None
        self._num_procs_per_worker = None
        self._resources_per_worker = None

        if num_workers is not None:
            self.num_workers = num_workers
        if num_procs_per_worker is not None:
            self.num_procs_per_worker = num_procs_per_worker
        if resources_per_worker is not None:
            self.resources_per_worker = resources_per_worker

    @property
    def num_workers(self):
        """Gets the number of workers of distributed training.

        :return: The number of workers of distributed training.
        :rtype: int
        """
        return self._num_workers

    @num_workers.setter
    def num_workers(self, num_workers):
        """Sets the number of workers of distributed training.

        :param num_workers: The number of workers of distributed training.
        :type: int
        """
        self._num_workers = num_workers

    @property
    def num_procs_per_worker(self):
        """Gets the number of processes per worker of distributed training.

        :return: The number of processes per worker of distributed training.
        :rtype: int
        """
        return self._num_procs_per_worker

    @num_procs_per_worker.setter
    def num_procs_per_worker(self, num_procs_per_worker):
        """Sets the number of processes per worker of distributed training.

        :param num_procs_per_worker: The number of processes per worker of distributed training.
        :type: int
        """
        self._num_procs_per_worker = num_procs_per_worker

    @property
    def resources_per_worker(self):
        """Gets the resources per worker of distributed training.

        :return: The resources per worker of distributed training.
        :rtype: dict or V1ResourceRequirements
        """
        return self._resources_per_worker

    @resources_per_worker.setter
    def resources_per_worker(self, resources_per_worker):
        """Sets the resources per worker of distributed training.

        :param resources_per_worker: The resources per worker of distributed training.
        :type: dict or V1ResourceRequirements
        """
        self._resources_per_worker = resources_per_worker

    @staticmethod
    def _serialize(value):
        """Recursively convert a value into plain Python containers."""
        if isinstance(value, list):
            return [TrainerResources._serialize(item) for item in value]
        if hasattr(value, "to_dict"):
            return value.to_dict()
        if isinstance(value, dict):
            return {k: TrainerResources._serialize(v) for k, v in value.items()}
        return value

    def to_dict(self):
        """Returns the resources properties as a dict.

        Keys are the public attribute names (e.g. ``num_workers``). The
        previous implementation iterated ``self.__dict__``, which leaked the
        private ``_``-prefixed keys and the ``local_vars_configuration``
        object into the result (and broke ``__eq__`` between otherwise-equal
        instances holding distinct Configuration objects).
        """
        return {
            attr: self._serialize(getattr(self, attr))
            for attr in self.attribute_names
        }

    def to_str(self):
        """Returns the string representation of the model"""
        return pprint.pformat(self.to_dict())

    def __repr__(self):
        """For `print` and `pprint`"""
        return self.to_str()

    def __eq__(self, other):
        """Returns true if both objects are equal"""
        if not isinstance(other, TrainerResources):
            return False

        return self.to_dict() == other.to_dict()

    def __ne__(self, other):
        """Returns true if both objects are not equal"""
        if not isinstance(other, TrainerResources):
            return True

        return self.to_dict() != other.to_dict()
182 changes: 180 additions & 2 deletions sdk/python/v1beta1/kubeflow/katib/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import json
import os
import textwrap
from typing import Any, Callable
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

from kubeflow.katib import models
from kubeflow.katib.constants import constants
Expand Down Expand Up @@ -85,7 +85,6 @@ def validate_metrics_value(value: Any):


def validate_objective_function(objective: Callable):

# Check if objective function is callable.
if not callable(objective):
raise ValueError(
Expand Down Expand Up @@ -129,3 +128,182 @@ class FakeResponse:

def __init__(self, obj):
self.data = json.dumps(obj)


def get_command_using_train_func(
    train_func: Optional[Callable],
    train_func_parameters: Optional[Dict[str, Any]] = None,
    packages_to_install: Optional[List[str]] = None,
    pip_index_url: str = "https://pypi.org/simple",
) -> Tuple[List[str], List[str]]:
    """
    Get container args and command from the given training function and parameters.
    """
    # The training function must be callable so its source can be extracted.
    if not callable(train_func):
        raise ValueError(
            f"Training function must be callable, got function type: {type(train_func)}"
        )

    # Extract the function implementation. The function might be defined in an
    # indented scope (e.g. inside another function), so dedent its source.
    func_source = textwrap.dedent(inspect.getsource(train_func))

    # Append a call so the generated script actually executes the function:
    #   def train(parameters):
    #       print('Start Training...')
    #   train({'lr': 0.01})
    call_arguments = (
        "" if train_func_parameters is None else f"{train_func_parameters}"
    )
    func_source = f"{func_source}\n{train_func.__name__}({call_arguments})\n"

    # Template that writes the function source to a temp file and runs it.
    script_template = textwrap.dedent(
        """
        program_path=$(mktemp -d)
        read -r -d '' SCRIPT << EOM\n
        {func_code}
        EOM
        printf "%s" \"$SCRIPT\" > \"$program_path/ephemeral_script.py\"
        python3 -u \"$program_path/ephemeral_script.py\""""
    )

    # Substitute the function source into the execute script.
    exec_script = script_template.format(func_code=func_source)

    # Prepend pip installation commands when extra packages are requested.
    if packages_to_install is not None:
        exec_script = (
            get_script_for_python_packages(packages_to_install, pip_index_url)
            + exec_script
        )

    # Container command and args that execute the training function.
    return ["bash", "-c"], [exec_script]


def get_container_spec(
    name: str,
    base_image: str,
    train_func: Optional[Callable] = None,
    train_func_parameters: Optional[Dict[str, Any]] = None,
    packages_to_install: Optional[List[str]] = None,
    pip_index_url: str = "https://pypi.org/simple",
    args: Optional[List[str]] = None,
    resources: Union[dict, models.V1ResourceRequirements, None] = None,
    volume_mounts: Optional[List[models.V1VolumeMount]] = None,
    env: Optional[List[models.V1EnvVar]] = None,
    env_from: Optional[List[models.V1EnvFromSource]] = None,
) -> models.V1Container:
    """
    Get container spec for the given parameters.
    """

    # Both the container name and the base image are mandatory.
    if name is None or base_image is None:
        raise ValueError("Container name or base image cannot be none")

    # Start from a minimal container definition.
    container_spec = models.V1Container(
        name=name, image=base_image, args=args, volume_mounts=volume_mounts
    )

    # When a training function is given, its generated launch script replaces
    # the container command and args.
    if train_func is not None:
        container_spec.command, container_spec.args = get_command_using_train_func(
            train_func=train_func,
            train_func_parameters=train_func_parameters,
            packages_to_install=packages_to_install,
            pip_index_url=pip_index_url,
        )

    # A plain dict of resources is normalized into V1ResourceRequirements:
    # keys are lowercased and "gpu" is mapped to the "nvidia.com/gpu" resource.
    if isinstance(resources, dict):
        normalized = {key.lower(): quantity for key, quantity in resources.items()}
        if "gpu" in normalized:
            normalized["nvidia.com/gpu"] = normalized.pop("gpu")

        resources = models.V1ResourceRequirements(
            requests=normalized,
            limits=normalized,
        )

    # Attach resources (possibly None) to the container spec.
    container_spec.resources = resources

    # Attach environment variables when provided.
    if env:
        container_spec.env = env
    if env_from:
        container_spec.env_from = env_from

    return container_spec


def get_pod_template_spec(
    containers: List[models.V1Container],
    init_containers: Optional[List[models.V1Container]] = None,
    volumes: Optional[List[models.V1Volume]] = None,
    restart_policy: Optional[str] = None,
) -> models.V1PodTemplateSpec:
    """
    Get Pod template spec for the given parameters.
    """

    # Build and return the Pod template directly. None-valued parameters are
    # simply omitted from the resulting Pod. Istio sidecar injection is
    # disabled so the training Pod can complete and be collected.
    return models.V1PodTemplateSpec(
        metadata=models.V1ObjectMeta(annotations={"sidecar.istio.io/inject": "false"}),
        spec=models.V1PodSpec(
            init_containers=init_containers,
            containers=containers,
            volumes=volumes,
            restart_policy=restart_policy,
        ),
    )


def get_pvc_spec(
    pvc_name: str,
    namespace: str,
    storage_config: Dict[str, Optional[Union[str, List[str]]]],
):
    """Get the PersistentVolumeClaim spec for dataset and model storage.

    :param pvc_name: Name of the PVC to create.
    :param namespace: Namespace the PVC belongs to.
    :param storage_config: Optional keys ``size``, ``access_modes``, and
        ``storage_class``. Missing ``size``/``access_modes`` fall back to
        ``constants.PVC_DEFAULT_SIZE`` / ``constants.PVC_DEFAULT_ACCESS_MODES``.
    :return: The assembled ``models.V1PersistentVolumeClaim``.
    :raises ValueError: If ``pvc_name`` or ``namespace`` is None.
    """
    if pvc_name is None or namespace is None:
        raise ValueError("One of the required storage config argument is None")

    # Resolve defaults locally instead of writing them back into the caller's
    # storage_config dict (the previous implementation mutated its argument).
    size = storage_config["size"] if "size" in storage_config else constants.PVC_DEFAULT_SIZE
    access_modes = (
        storage_config["access_modes"]
        if "access_modes" in storage_config
        else constants.PVC_DEFAULT_ACCESS_MODES
    )

    pvc_spec = models.V1PersistentVolumeClaim(
        api_version="v1",
        kind="PersistentVolumeClaim",
        # Use a typed V1ObjectMeta, consistent with the other helpers in this
        # module, rather than a raw dict.
        metadata=models.V1ObjectMeta(name=pvc_name, namespace=namespace),
        spec=models.V1PersistentVolumeClaimSpec(
            access_modes=access_modes,
            resources=models.V1ResourceRequirements(requests={"storage": size}),
        ),
    )

    if "storage_class" in storage_config:
        pvc_spec.spec.storage_class_name = storage_config["storage_class"]

    return pvc_spec


class SetEncoder(json.JSONEncoder):
    """JSON encoder that serializes sets as lists and classes by their name."""

    def default(self, obj):
        # Sets are not JSON-serializable; emit them as plain lists.
        if isinstance(obj, set):
            return list(obj)
        # Emit classes (e.g. parameter types) as their bare name.
        if isinstance(obj, type):
            return obj.__name__
        # Defer everything else to the base encoder (raises TypeError).
        return super().default(obj)
3 changes: 3 additions & 0 deletions sdk/python/v1beta1/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,4 +69,7 @@
"Topic :: Software Development :: Libraries :: Python Modules",
],
install_requires=REQUIRES,
extras_require={
"huggingface": ["kubeflow-training[huggingface]"],
},
)
Loading
Loading