From 2a882d7004ce5d0f47d67de4033c89d7dbf28900 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 20 Jul 2024 23:18:41 -0700 Subject: [PATCH 01/84] update tune api for llm hyperparameters optimization Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 381 ++++++++++++++---- .../kubeflow/katib/constants/constants.py | 25 ++ .../v1beta1/kubeflow/katib/utils/utils.py | 43 ++ .../v1beta1/kubeflow/trainer/Dockerfile | 17 + .../kubeflow/trainer/hf_llm_optimization.py | 196 +++++++++ .../v1beta1/kubeflow/trainer/requirements.txt | 4 + sdk/python/v1beta1/setup.py | 3 + 7 files changed, 582 insertions(+), 87 deletions(-) create mode 100644 sdk/python/v1beta1/kubeflow/trainer/Dockerfile create mode 100644 sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py create mode 100644 sdk/python/v1beta1/kubeflow/trainer/requirements.txt diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 8be9e52f6da..e7d6d49bcda 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -17,6 +17,9 @@ import textwrap import time from typing import Any, Callable, Dict, List, Optional, Union +import json +import logging +logger = logging.getLogger(__name__) import grpc import kubeflow.katib.katib_api_pb2 as katib_api_pb2 @@ -59,6 +62,7 @@ def __init__( k8s_client = client.ApiClient(client_configuration) self.custom_api = client.CustomObjectsApi(k8s_client) + self.core_api = client.CoreV1Api(k8s_client) self.api_client = ApiClient() self.namespace = namespace @@ -153,9 +157,16 @@ def tune( self, # TODO (andreyvelich): How to be consistent with other APIs (name) ? 
name: str, - objective: Callable, - parameters: Dict[str, Any], - base_image: str = constants.BASE_IMAGE_TENSORFLOW, + model_provider_parameters: Optional[Any] = None, + dataset_provider_parameters: Optional[Any] = None, + storage_config: Optional[Dict[str, Optional[Union[str, List[str]]]]] = { + "size": constants.PVC_DEFAULT_SIZE, + "storage_class": None, + "access_modes": constants.PVC_DEFAULT_ACCESS_MODES, + }, + objective: Optional[Callable] = None, + base_image: Optional[str] = constants.BASE_IMAGE_TENSORFLOW, + trainer_parameters = None, namespace: Optional[str] = None, env_per_trial: Optional[ Union[Dict[str, str], List[Union[client.V1EnvVar, client.V1EnvFromSource]]] @@ -176,23 +187,39 @@ def tune( packages_to_install: List[str] = None, pip_index_url: str = "https://pypi.org/simple", ): - """Create HyperParameter Tuning Katib Experiment from the objective function. + """Create HyperParameter Tuning Katib Experiment using one of the following options: + - External models and datasets: Specify both `model_provider_parameters` and `dataset_provider_parameters` to download models and datasets from external platforms (currently supports HuggingFace and Amazon S3) using the Storage Initializer. The `trainer_parameters` should be of type `HuggingFaceTrainerParams` to set the hyperparameters search space. This API will automatically define the "Trainer" class in HuggingFace with the provided parameters and utilize `Trainer.train()` from HuggingFace to obtain the metrics for optimizing hyperparameters. + - Custom objective function: Specify the `objective` parameter to define your own objective function. The `base_image` parameter will be used to execute the objective function. `trainer_parameters` should be a dictionary to define the search space for these parameters. Args: name: Name for the Experiment. + model_provider_parameters: Parameters for the model provider in the Storage Initializer. 
+ For example, HuggingFace model name and Transformer type for that model, like: AutoModelForSequenceClassification. This argument must be the type of `kubeflow.storage_initializer.hugging_face.HuggingFaceModelParams`. + dataset_provider_parameters: Parameters for the dataset provider in the Storage Initializer. + For example, name of the HuggingFace dataset or AWS S3 configuration. This argument must be the type of `kubeflow.storage_initializer.hugging_face.HuggingFaceDatasetParams` or `kubeflow.storage_initializer.s3.S3DatasetParams` + storage_config: Configuration for Storage Initializer PVC to download pre-trained model and dataset. + You can configure PVC size and storage class name in this argument. objective: Objective function that Katib uses to train the model. This function must be Callable and it must have only one dict argument. Katib uses this argument to send HyperParameters to the function. The function should not use any code declared outside of the function definition. Import statements must be added inside the function. - parameters: Dict of HyperParameters to tune your Experiment. You - should use Katib SDK to define the search space for these parameters. - - For example: `parameters = {"lr": katib.search.double(min=0.1, max=0.2)}` - - Also, you can use these parameters to define input for your - objective function. base_image: Image to use when executing the objective function. + trainer_parameters: Parameters for configuring the training process, including settings for the hyperparameters search space. + You should use the Katib SDK to define the search space for these parameters. + If you choose to use external models and datasets, it should be of type `HuggingFaceTrainerParams`. For example: + ``` + trainer_parameters = HuggingFaceTrainerParams( + training_parameters = transformers.TrainingArguments( + learning_rate = katib.search.double(min=0.1, max=0.2), + ), + ), + ``` + If you choose a custom objective function, it should be a dictionary. 
For example: + ``` + trainer_parameters = {"lr": katib.search.double(min=0.1, max=0.2)} + ``` + Also, you can use these parameters to define input for training the external models or your custom objective function. namespace: Namespace for the Experiment. env_per_trial: Environment variable(s) to be attached to each trial container. You can specify a dictionary as a mapping object representing the environment @@ -244,6 +271,24 @@ def tune( RuntimeError: Failed to create Katib Experiment. """ + print( + "Thank you for using `tune` API for LLMs hyperparameters optimization. This feature is in alpha stage Kubeflow community is looking for your feedback. Please share your experience via #kubeflow-katib Slack channel or Kubeflow Katib GitHub." + ) + + if ( + ((model_provider_parameters is not None) and (dataset_provider_parameters is not None)) == (objective is not None) + ): + raise ValueError( + "Invalid configuration for creating a Katib Experiment for hyperparameter optimization. " + "You should only specify one of the following options: 1) `model_provider_parameters` and `dataset_provider_parameters`; 2) `objective`." + ) + + if ( + not name + or not trainer_parameters + ): + raise ValueError("One of the required parameters is None") + namespace = namespace or self.namespace # Create Katib Experiment template. @@ -282,66 +327,8 @@ def tune( experiment.spec.parallel_trial_count = parallel_trial_count if max_failed_trial_count is not None: experiment.spec.max_failed_trial_count = max_failed_trial_count - - # Validate objective function. - utils.validate_objective_function(objective) - - # Extract objective function implementation. - objective_code = inspect.getsource(objective) - - # Objective function might be defined in some indented scope - # (e.g. in another function). We need to dedent the function code. - objective_code = textwrap.dedent(objective_code) - - # Iterate over input parameters. 
- input_params = {} - experiment_params = [] - trial_params = [] - for p_name, p_value in parameters.items(): - # If input parameter value is Katib Experiment parameter sample. - if isinstance(p_value, models.V1beta1ParameterSpec): - # Wrap value for the function input. - input_params[p_name] = f"${{trialParameters.{p_name}}}" - - # Add value to the Katib Experiment parameters. - p_value.name = p_name - experiment_params.append(p_value) - - # Add value to the Katib Experiment's Trial parameters. - trial_params.append( - models.V1beta1TrialParameterSpec(name=p_name, reference=p_name) - ) - else: - # Otherwise, add value to the function input. - input_params[p_name] = p_value - - # Wrap objective function to execute it from the file. For example - # def objective(parameters): - # print(f'Parameters are {parameters}') - # objective({'lr': '${trialParameters.lr}', 'epochs': '${trialParameters.epochs}', 'is_dist': False}) - objective_code = f"{objective_code}\n{objective.__name__}({input_params})\n" - - # Prepare execute script template. - exec_script = textwrap.dedent( - """ - program_path=$(mktemp -d) - read -r -d '' SCRIPT << EOM\n - {objective_code} - EOM - printf "%s" "$SCRIPT" > $program_path/ephemeral_objective.py - python3 -u $program_path/ephemeral_objective.py""" - ) - - # Add objective code to the execute script. - exec_script = exec_script.format(objective_code=objective_code) - - # Install Python packages if that is required. - if packages_to_install is not None: - exec_script = ( - utils.get_script_for_python_packages(packages_to_install, pip_index_url) - + exec_script - ) - + + # Add resources to the Katib Experiment. if isinstance(resources_per_trial, dict): if "gpu" in resources_per_trial: resources_per_trial["nvidia.com/gpu"] = resources_per_trial.pop("gpu") @@ -351,6 +338,7 @@ def tune( limits=resources_per_trial, ) + # Add environment variables to the Katib Experiment. 
env = [] env_from = [] if isinstance(env_per_trial, dict): @@ -369,30 +357,249 @@ def tune( f"Incorrect value for env_per_trial: {env_per_trial}" ) - # Create Trial specification. - trial_spec = client.V1Job( - api_version="batch/v1", - kind="Job", - spec=client.V1JobSpec( - template=client.V1PodTemplateSpec( - metadata=models.V1ObjectMeta( - annotations={"sidecar.istio.io/inject": "false"} - ), - spec=client.V1PodSpec( - restart_policy="Never", - containers=[ - client.V1Container( + # Create Container and Pod specifications. + # If users choose to use a custom objective function. + if objective is not None: + # Validate objective function. + utils.validate_objective_function(objective) + + # Extract objective function implementation. + objective_code = inspect.getsource(objective) + + # Objective function might be defined in some indented scope + # (e.g. in another function). We need to dedent the function code. + objective_code = textwrap.dedent(objective_code) + + # Iterate over input parameters. + input_params = {} + experiment_params = [] + trial_params = [] + for p_name, p_value in trainer_parameters.items(): + # If input parameter value is Katib Experiment parameter sample. + if isinstance(p_value, models.V1beta1ParameterSpec): + # Wrap value for the function input. + input_params[p_name] = f"${{trialParameters.{p_name}}}" + + # Add value to the Katib Experiment parameters. + p_value.name = p_name + experiment_params.append(p_value) + + # Add value to the Katib Experiment's Trial parameters. + trial_params.append( + models.V1beta1TrialParameterSpec(name=p_name, reference=p_name) + ) + else: + # Otherwise, add value to the function input. + input_params[p_name] = p_value + + # Wrap objective function to execute it from the file. 
For example + # def objective(parameters): + # print(f'Parameters are {parameters}') + # objective({'lr': '${trialParameters.lr}', 'epochs': '${trialParameters.epochs}', 'is_dist': False}) + objective_code = f"{objective_code}\n{objective.__name__}({input_params})\n" + + # Prepare execute script template. + exec_script = textwrap.dedent( + """ + program_path=$(mktemp -d) + read -r -d '' SCRIPT << EOM\n + {objective_code} + EOM + printf "%s" "$SCRIPT" > $program_path/ephemeral_objective.py + python3 -u $program_path/ephemeral_objective.py""" + ) + + # Add objective code to the execute script. + exec_script = exec_script.format(objective_code=objective_code) + + # Install Python packages if that is required. + if packages_to_install is not None: + exec_script = ( + utils.get_script_for_python_packages(packages_to_install, pip_index_url) + + exec_script + ) + + # create app container spec + container_spec = client.V1Container( name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, image=base_image, command=["bash", "-c"], args=[exec_script], - env=env, - env_from=env_from, + env=env if env else None, + env_from=env_from if env_from else None, resources=resources_per_trial, ) - ], + + pod_spec = client.V1PodTemplateSpec( + metadata=models.V1ObjectMeta( + annotations={"sidecar.istio.io/inject": "false"} + ), + spec=client.V1PodSpec( + restart_policy="Never", + containers=[container_spec], + ), + ) + + # If users choose to use external models and datasets. + else: + try: + import peft + import transformers + from kubeflow.storage_initializer.s3 import S3DatasetParams + from kubeflow.storage_initializer.hugging_face import ( + HuggingFaceModelParams, + HuggingFaceDatasetParams, + HuggingFaceTrainerParams, + ) + except ImportError: + raise ImportError( + "Tune API dependencies not installed. " + + "Run: pip install -U 'kubeflow-training[huggingface]' " + ) + + # Create PVC for the Storage Initializer. 
+ try: + self.core_api.create_namespaced_persistent_volume_claim( + namespace=namespace, + body=utils.get_pvc_spec( + pvc_name=constants.STORAGE_INITIALIZER, + namespace=namespace, + storage_config=storage_config, ), ) + except Exception as e: + pvc_list = self.core_api.list_namespaced_persistent_volume_claim(namespace) + # Check if the PVC with the specified name exists. + for pvc in pvc_list.items: + if pvc.metadata.name == constants.STORAGE_INITIALIZER: + print( + f"PVC '{constants.STORAGE_INITIALIZER}' already exists in namespace " + f"{namespace}." + ) + break + else: + raise RuntimeError(f"failed to create PVC. Error: {e}") + + if isinstance(model_provider_parameters, HuggingFaceModelParams): + mp = "hf" + else: + raise ValueError("Model provider parameters must be an instance of HuggingFaceModelParams.") + + if isinstance(dataset_provider_parameters, S3DatasetParams): + dp = "s3" + elif isinstance(dataset_provider_parameters, HuggingFaceDatasetParams): + dp = "hf" + else: + raise ValueError("Dataset provider parameters must be an instance of S3DatasetParams or HuggingFaceDatasetParams.") + + # Iterate over input parameters. 
+ experiment_params = [] + trial_params = [] + + training_args = trainer_parameters.training_parameters + for p_name, p_value in trainer_parameters.training_parameters.to_dict().items(): + if not hasattr(training_args, p_name): + logger.warning(f"Training parameter {p_name} is not supported by the current transformer.") + continue + if isinstance(p_value, models.V1beta1ParameterSpec): + old_attr = getattr(training_args, p_name, None) + if old_attr is not None: + value = f"${{trialParameters.{p_name}}}" + setattr(training_args, p_name, value) + p_value.name = p_name + experiment_params.append(p_value) + trial_params.append(models.V1beta1TrialParameterSpec(name=p_name, reference=p_name)) + elif p_value is not None: + old_attr = getattr(training_args, p_name, None) + if old_attr is not None: + value = type(old_attr)(p_value) + setattr(training_args, p_name, value) + + lora_config = trainer_parameters.lora_config + for p_name, p_value in trainer_parameters.lora_config.__dict__.items(): + if not hasattr(lora_config, p_name): + logger.warning(f"Training parameter {p_name} is not supported by the current peft.") + continue + if isinstance(p_value, models.V1beta1ParameterSpec): + old_attr = getattr(lora_config, p_name, None) + if old_attr is not None: + value = f"${{trialParameters.{p_name}}}" + setattr(lora_config, p_name, value) + p_value.name = p_name + experiment_params.append(p_value) + trial_params.append(models.V1beta1TrialParameterSpec(name=p_name, reference=p_name)) + elif p_value is not None: + old_attr = getattr(lora_config, p_name, None) + if old_attr is not None: + value = type(old_attr)(p_value) + setattr(lora_config, p_name, value) + + # create init container spec. 
+ init_container_spec = client.V1Container( + name=constants.STORAGE_INITIALIZER, + image=constants.STORAGE_INITIALIZER_IMAGE, + args=[ + "--model_provider", + mp, + "--model_provider_parameters", + json.dumps(model_provider_parameters.__dict__, cls=utils.SetEncoder), + "--dataset_provider", + dp, + "--dataset_provider_parameters", + json.dumps(dataset_provider_parameters.__dict__), + ], + volume_mounts=[constants.STORAGE_INITIALIZER_VOLUME_MOUNT], + ) + + from kubeflow.storage_initializer.constants import ( + VOLUME_PATH_DATASET, + VOLUME_PATH_MODEL, + ) + + lora_config = json.dumps(lora_config.__dict__, cls=utils.SetEncoder) + training_args = json.dumps(training_args.to_dict()) + # create app container spec. + container_spec = client.V1Container( + name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, + image=constants.TRAINER_TRANSFORMER_IMAGE, + args=[ + "--model_uri", + model_provider_parameters.model_uri, + "--transformer_type", + model_provider_parameters.transformer_type.__name__, + "--model_dir", + VOLUME_PATH_MODEL, + "--dataset_dir", + VOLUME_PATH_DATASET, + "--lora_config", + f"'{lora_config}'", + "--training_parameters", + f"'{training_args}'", + ], + volume_mounts=[constants.STORAGE_INITIALIZER_VOLUME_MOUNT], + env=env if env else None, + env_from=env_from if env_from else None, + resources=resources_per_trial, + ) + + pod_spec = client.V1PodTemplateSpec( + metadata=models.V1ObjectMeta( + annotations={"sidecar.istio.io/inject": "false"} + ), + spec=client.V1PodSpec( + restart_policy="Never", + containers=[container_spec], + init_containers=[init_container_spec], + volumes=[constants.STORAGE_INITIALIZER_VOLUME], + ), + ) + + # Create Trial specification. 
+ trial_spec = client.V1Job( + api_version="batch/v1", + kind="Job", + spec=client.V1JobSpec( + template=pod_spec, ), ) diff --git a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py index 9af281524cd..1fec6068d47 100644 --- a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py +++ b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py @@ -13,6 +13,8 @@ # limitations under the License. import os +from kubernetes import client +from kubeflow.storage_initializer.constants import INIT_CONTAINER_MOUNT_PATH # How long to wait in seconds for requests to the Kubernetes or gRPC API Server. DEFAULT_TIMEOUT = 120 @@ -56,3 +58,26 @@ BASE_IMAGE_MXNET = "docker.io/mxnet/python:1.9.1_native_py3" DEFAULT_DB_MANAGER_ADDRESS = "katib-db-manager.kubeflow:6789" + +# Constants for Tune API. +STORAGE_INITIALIZER = "storage-initializer" +# The default value for dataset and model storage PVC. +PVC_DEFAULT_SIZE = "10Gi" +# The default value for PVC access modes. 
+PVC_DEFAULT_ACCESS_MODES = ["ReadWriteOnce", "ReadOnlyMany"] + +STORAGE_INITIALIZER_IMAGE = "docker.io/kubeflow/storage-initializer" + +STORAGE_INITIALIZER_VOLUME_MOUNT = client.V1VolumeMount( + name=STORAGE_INITIALIZER, + mount_path=INIT_CONTAINER_MOUNT_PATH, +) + +STORAGE_INITIALIZER_VOLUME = client.V1Volume( + name=STORAGE_INITIALIZER, + persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource( + claim_name=STORAGE_INITIALIZER + ), +) + +TRAINER_TRANSFORMER_IMAGE = "" # Need to be built using the `trainer` file \ No newline at end of file diff --git a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py index 97c46772611..27133df3cc2 100644 --- a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py +++ b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py @@ -20,6 +20,8 @@ from kubeflow.katib import models from kubeflow.katib.constants import constants +from typing import Any, Callable, Dict, List, Optional, Union +import transformers def is_running_in_k8s(): @@ -118,3 +120,44 @@ class FakeResponse: def __init__(self, obj): self.data = json.dumps(obj) + + +def get_pvc_spec( + pvc_name: str, + namespace: str, + storage_config: Dict[str, Optional[Union[str, List[str]]]], +): + if pvc_name is None or namespace is None: + raise ValueError("One of the required storage config argument is None") + + if "size" not in storage_config: + storage_config["size"] = constants.PVC_DEFAULT_SIZE + + if "access_modes" not in storage_config: + storage_config["access_modes"] = constants.PVC_DEFAULT_ACCESS_MODES + + pvc_spec = models.V1PersistentVolumeClaim( + api_version="v1", + kind="PersistentVolumeClaim", + metadata={"name": pvc_name, "namepsace": namespace}, + spec=models.V1PersistentVolumeClaimSpec( + access_modes=storage_config["access_modes"], + resources=models.V1ResourceRequirements( + requests={"storage": storage_config["size"]} + ), + ), + ) + + if "storage_class" in storage_config: + pvc_spec.spec.storage_class_name = 
storage_config["storage_class"] + + return pvc_spec + + +class SetEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, set): + return list(obj) + if isinstance(obj, type): + return obj.__name__ + return json.JSONEncoder.default(self, obj) \ No newline at end of file diff --git a/sdk/python/v1beta1/kubeflow/trainer/Dockerfile b/sdk/python/v1beta1/kubeflow/trainer/Dockerfile new file mode 100644 index 00000000000..c55633ff713 --- /dev/null +++ b/sdk/python/v1beta1/kubeflow/trainer/Dockerfile @@ -0,0 +1,17 @@ +# Use an official Pytorch runtime as a parent image +FROM nvcr.io/nvidia/pytorch:23.10-py3 + +# Set the working directory in the container +WORKDIR /app + +# Copy the requirements.txt file into the container +COPY requirements.txt /app/requirements.txt + +# Install any needed packages specified in requirements.txt +RUN pip install --no-cache-dir -r requirements.txt + +# Copy the Python package and its source code into the container +COPY . /app + +# Run hf_llm_optimization.py with torchrun when the container launches +ENTRYPOINT ["torchrun", "hf_llm_optimization.py"] \ No newline at end of file diff --git a/sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py b/sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py new file mode 100644 index 00000000000..114071c7401 --- /dev/null +++ b/sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py @@ -0,0 +1,196 @@ +import argparse +import logging +from urllib.parse import urlparse +import json +import os + +from datasets import load_from_disk, Dataset +from datasets.distributed import split_dataset_by_node +from peft import LoraConfig, get_peft_model +import transformers +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + AutoModelForImageClassification, + TrainingArguments, + DataCollatorForLanguageModeling, + Trainer, +) + + +# Configure logger. 
+log_formatter = logging.Formatter( + "%(asctime)s %(levelname)-8s %(message)s", "%Y-%m-%dT%H:%M:%SZ" +) +logger = logging.getLogger(__file__) +console_handler = logging.StreamHandler() +console_handler.setFormatter(log_formatter) +logger.addHandler(console_handler) +logger.setLevel(logging.INFO) + + +def setup_model_and_tokenizer(model_uri, transformer_type, model_dir): + # Set up the model and tokenizer + parsed_uri = urlparse(model_uri) + model_name = parsed_uri.netloc + parsed_uri.path + + model = transformer_type.from_pretrained( + pretrained_model_name_or_path=model_name, + cache_dir=model_dir, + local_files_only=True, + trust_remote_code=True, + ) + + tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path=model_name, + cache_dir=model_dir, + local_files_only=True, + ) + + # Freeze model parameters + for param in model.parameters(): + param.requires_grad = False + + return model, tokenizer + + +def load_and_preprocess_data(dataset_dir, transformer_type, tokenizer): + # Load and preprocess the dataset + logger.info("Load and preprocess dataset") + + if transformer_type != AutoModelForImageClassification: + dataset = load_from_disk(dataset_dir) + + logger.info(f"Dataset specification: {dataset}") + logger.info("-" * 40) + + logger.info("Tokenize dataset") + # TODO (andreyvelich): Discuss how user should set the tokenizer function. + dataset = dataset.map( + lambda x: tokenizer(x["text"], padding="max_length", truncation=True), + batched=True, + ) + else: + dataset = load_from_disk(dataset_dir) + + # Check if dataset contains `train` key. Otherwise, load full dataset to train_data. + if "train" in dataset: + train_data = dataset["train"] + else: + train_data = dataset + + try: + eval_data = dataset["eval"] or dataset["test"] + except Exception: + eval_data = None + logger.info("Evaluation dataset is not found") + + # Distribute dataset across PyTorchJob workers. 
+ RANK = int(os.environ["RANK"]) + WORLD_SIZE = int(os.environ["WORLD_SIZE"]) + logger.info( + f"Distributed dataset across PyTorchJob workers. WORLD_SIZE: {WORLD_SIZE}, RANK: {RANK}" + ) + if isinstance(train_data, Dataset): + train_data = split_dataset_by_node( + train_data, + rank=RANK, + world_size=WORLD_SIZE, + ) + if isinstance(eval_data, Dataset): + eval_data = split_dataset_by_node( + eval_data, + rank=RANK, + world_size=WORLD_SIZE, + ) + + return train_data, eval_data + + +def setup_peft_model(model, lora_config): + # Set up the PEFT model + model.enable_input_require_grads() + model = get_peft_model(model, lora_config) + return model + + +def train_model(model, transformer_type, train_data, eval_data, tokenizer, train_args): + # Setup the Trainer. + trainer = Trainer( + model=model, + train_dataset=train_data, + eval_dataset=eval_data, + args=train_args, + ) + + # TODO (andreyvelich): Currently, data collator is supported only for causal LM Transformer. + if transformer_type == AutoModelForCausalLM: + logger.info("Add data collector for language modeling") + logger.info("-" * 40) + trainer.data_collator = DataCollatorForLanguageModeling( + tokenizer, + pad_to_multiple_of=8, + mlm=False, + ) + + # Train the model. + train_results = trainer.train() + print(f"train_loss={train_results.training_loss}") + + +def parse_arguments(): + parser = argparse.ArgumentParser( + description="Script for training a model with PEFT configuration." 
+ ) + + parser.add_argument("--model_uri", help="model uri") + parser.add_argument("--transformer_type", help="model transformer type") + parser.add_argument("--model_dir", help="directory containing model") + parser.add_argument("--dataset_dir", help="directory containing dataset") + parser.add_argument("--lora_config", help="lora_config") + parser.add_argument( + "--training_parameters", help="hugging face training parameters" + ) + + return parser.parse_args() + + +if __name__ == "__main__": + logger.info("Starting HuggingFace LLM Trainer") + args = parse_arguments() + + train_args = TrainingArguments(**json.loads(args.training_parameters)) + reference_args = transformers.TrainingArguments(output_dir=train_args.output_dir) + for key, val in train_args.to_dict().items(): + old_attr = getattr(reference_args, key, None) + if old_attr is not None: + val = type(old_attr)(val) + setattr(train_args, key, val) + + lora_config = LoraConfig(**json.loads(args.lora_config)) + reference_lora_config = LoraConfig() + for key, val in lora_config.__dict__.items(): + old_attr = getattr(reference_lora_config, key, None) + if old_attr is not None: + val = type(old_attr)(val) + setattr(lora_config, key, val) + + transformer_type = getattr(transformers, args.transformer_type) + + logger.info("Setup model and tokenizer") + model, tokenizer = setup_model_and_tokenizer( + args.model_uri, transformer_type, args.model_dir + ) + + logger.info("Preprocess dataset") + train_data, eval_data = load_and_preprocess_data( + args.dataset_dir, transformer_type, tokenizer + ) + + logger.info("Setup LoRA config for model") + model = setup_peft_model(model, lora_config) + + logger.info("Start model training") + train_model(model, transformer_type, train_data, eval_data, tokenizer, train_args) + + logger.info("Training is complete") \ No newline at end of file diff --git a/sdk/python/v1beta1/kubeflow/trainer/requirements.txt b/sdk/python/v1beta1/kubeflow/trainer/requirements.txt new file mode 100644 
index 00000000000..ba76f3cdcec --- /dev/null +++ b/sdk/python/v1beta1/kubeflow/trainer/requirements.txt @@ -0,0 +1,4 @@ +peft==0.3.0 +datasets==2.15.0 +transformers==4.38.0 +evaluate==0.4.0 \ No newline at end of file diff --git a/sdk/python/v1beta1/setup.py b/sdk/python/v1beta1/setup.py index 39a4f0e2372..685c45c102e 100644 --- a/sdk/python/v1beta1/setup.py +++ b/sdk/python/v1beta1/setup.py @@ -68,4 +68,7 @@ "Topic :: Software Development :: Libraries :: Python Modules", ], install_requires=REQUIRES, + extras_require={ + "huggingface": ["kubeflow-training[huggingface]"], + }, ) From 158c8f3e32e3d5b08baea4371d17653f0655b107 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 21 Jul 2024 12:14:20 -0700 Subject: [PATCH 02/84] resolve conflict Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 39 ++++--------------- 1 file changed, 7 insertions(+), 32 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 5b2cf67ab21..4047fe54fe7 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -371,8 +371,13 @@ def tune( raise ValueError( f"Incorrect value for env_per_trial: {env_per_trial}" ) - -<<<<<<< HEAD + + # Add metrics collector to the Katib Experiment. + # Up to now, We only support parameter `kind`, of which default value is `StdOut`, to specify the kind of metrics collector. + experiment.spec.metrics_collector = models.V1beta1MetricsCollectorSpec( + collector=models.V1beta1CollectorSpec(kind=metrics_collector_config["kind"]) + ) + # Create Container and Pod specifications. # If users choose to use a custom objective function. if objective is not None: @@ -609,43 +614,13 @@ def tune( volumes=[constants.STORAGE_INITIALIZER_VOLUME], ), ) - -======= - # Add metrics collector to the Katib Experiment. 
- # Up to now, We only support parameter `kind`, of which default value is `StdOut`, to specify the kind of metrics collector. - experiment.spec.metrics_collector = models.V1beta1MetricsCollectorSpec( - collector=models.V1beta1CollectorSpec(kind=metrics_collector_config["kind"]) - ) ->>>>>>> upstream/master # Create Trial specification. trial_spec = client.V1Job( api_version="batch/v1", kind="Job", spec=client.V1JobSpec( -<<<<<<< HEAD template=pod_spec, -======= - template=client.V1PodTemplateSpec( - metadata=models.V1ObjectMeta( - annotations={"sidecar.istio.io/inject": "false"} - ), - spec=client.V1PodSpec( - restart_policy="Never", - containers=[ - client.V1Container( - name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, - image=base_image, - command=["bash", "-c"], - args=[exec_script], - env=env if env else None, - env_from=env_from if env_from else None, - resources=resources_per_trial, - ) - ], - ), - ) ->>>>>>> upstream/master ), ) From f4a0d4e90c556e5a855a56ebb2fcbdf6268ecc68 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 21 Jul 2024 13:46:19 -0700 Subject: [PATCH 03/84] fix the problem of dependency Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 9 ++------- sdk/python/v1beta1/kubeflow/katib/constants/constants.py | 7 +++++-- sdk/python/v1beta1/kubeflow/katib/utils/utils.py | 3 +-- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 4047fe54fe7..6e0518bb0ab 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -572,11 +572,6 @@ def tune( volume_mounts=[constants.STORAGE_INITIALIZER_VOLUME_MOUNT], ) - from kubeflow.storage_initializer.constants import ( - VOLUME_PATH_DATASET, - VOLUME_PATH_MODEL, - ) - lora_config = json.dumps(lora_config.__dict__, cls=utils.SetEncoder) training_args = 
json.dumps(training_args.to_dict()) # create app container spec. @@ -589,9 +584,9 @@ def tune( "--transformer_type", model_provider_parameters.transformer_type.__name__, "--model_dir", - VOLUME_PATH_MODEL, + constants.VOLUME_PATH_MODEL, "--dataset_dir", - VOLUME_PATH_DATASET, + constants.VOLUME_PATH_DATASET, "--lora_config", f"'{lora_config}'", "--training_parameters", diff --git a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py index 5707a39ccb9..17f5619b922 100644 --- a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py +++ b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py @@ -14,7 +14,6 @@ import os from kubernetes import client -from kubeflow.storage_initializer.constants import INIT_CONTAINER_MOUNT_PATH # How long to wait in seconds for requests to the Kubernetes or gRPC API Server. DEFAULT_TIMEOUT = 120 @@ -70,7 +69,9 @@ # The default value for PVC access modes. PVC_DEFAULT_ACCESS_MODES = ["ReadWriteOnce", "ReadOnlyMany"] -STORAGE_INITIALIZER_IMAGE = "docker.io/kubeflow/storage-initializer" +INIT_CONTAINER_MOUNT_PATH = "/workspace" +VOLUME_PATH_DATASET = INIT_CONTAINER_MOUNT_PATH + "/dataset" +VOLUME_PATH_MODEL = INIT_CONTAINER_MOUNT_PATH + "/model" STORAGE_INITIALIZER_VOLUME_MOUNT = client.V1VolumeMount( name=STORAGE_INITIALIZER, @@ -84,4 +85,6 @@ ), ) +STORAGE_INITIALIZER_IMAGE = "docker.io/kubeflow/storage-initializer" + TRAINER_TRANSFORMER_IMAGE = "" # Need to be built using the `trainer` file \ No newline at end of file diff --git a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py index 8985d785236..0f4a2ad263a 100644 --- a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py +++ b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py @@ -21,7 +21,6 @@ from kubeflow.katib import models from kubeflow.katib.constants import constants from typing import Any, Callable, Dict, List, Optional, Union -import transformers def 
is_running_in_k8s(): @@ -170,4 +169,4 @@ def default(self, obj): return list(obj) if isinstance(obj, type): return obj.__name__ - return json.JSONEncoder.default(self, obj) \ No newline at end of file + return json.JSONEncoder.default(self, obj) From 7e7dd56beb44fbb61c6d421f6bfa3ac66b3701d6 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 21 Jul 2024 14:40:15 -0700 Subject: [PATCH 04/84] fix the format of import statement Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 17 ++++++------- .../v1beta1/kubeflow/katib/utils/utils.py | 3 +-- .../kubeflow/trainer/hf_llm_optimization.py | 24 +++++++++---------- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 6e0518bb0ab..7a661a5bf1b 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -13,13 +13,13 @@ # limitations under the License. import inspect +import json import logging import multiprocessing import textwrap import time from typing import Any, Callable, Dict, List, Optional, Union -import json -import logging + logger = logging.getLogger(__name__) import grpc @@ -464,14 +464,15 @@ def tune( # If users choose to use external models and datasets. else: try: + from kubeflow.storage_initializer.hugging_face import \ + HuggingFaceDatasetParams + from kubeflow.storage_initializer.hugging_face import \ + HuggingFaceModelParams + from kubeflow.storage_initializer.hugging_face import \ + HuggingFaceTrainerParams + from kubeflow.storage_initializer.s3 import S3DatasetParams import peft import transformers - from kubeflow.storage_initializer.s3 import S3DatasetParams - from kubeflow.storage_initializer.hugging_face import ( - HuggingFaceModelParams, - HuggingFaceDatasetParams, - HuggingFaceTrainerParams, - ) except ImportError: raise ImportError( "Tune API dependencies not installed. 
" diff --git a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py index 0f4a2ad263a..2a2e2b4b4b8 100644 --- a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py +++ b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py @@ -16,11 +16,10 @@ import json import os import textwrap -from typing import Any, Callable +from typing import Any, Callable, Dict, List, Optional, Union from kubeflow.katib import models from kubeflow.katib.constants import constants -from typing import Any, Callable, Dict, List, Optional, Union def is_running_in_k8s(): diff --git a/sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py b/sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py index 114071c7401..e12d3e3a940 100644 --- a/sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py +++ b/sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py @@ -1,21 +1,21 @@ import argparse -import logging -from urllib.parse import urlparse import json +import logging import os +from urllib.parse import urlparse -from datasets import load_from_disk, Dataset +from datasets import Dataset +from datasets import load_from_disk from datasets.distributed import split_dataset_by_node -from peft import LoraConfig, get_peft_model +from peft import get_peft_model +from peft import LoraConfig import transformers -from transformers import ( - AutoModelForCausalLM, - AutoTokenizer, - AutoModelForImageClassification, - TrainingArguments, - DataCollatorForLanguageModeling, - Trainer, -) +from transformers import AutoModelForCausalLM +from transformers import AutoModelForImageClassification +from transformers import AutoTokenizer +from transformers import DataCollatorForLanguageModeling +from transformers import Trainer +from transformers import TrainingArguments # Configure logger. 
From 62ad3850c1b57232ccfd5df3595a401d546fc1e3 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 21 Jul 2024 14:46:55 -0700 Subject: [PATCH 05/84] adjust the blank lines Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/constants/constants.py | 1 + sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py index 17f5619b922..955ee07a01a 100644 --- a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py +++ b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py @@ -13,6 +13,7 @@ # limitations under the License. import os + from kubernetes import client # How long to wait in seconds for requests to the Kubernetes or gRPC API Server. diff --git a/sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py b/sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py index e12d3e3a940..a0050c45071 100644 --- a/sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py +++ b/sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py @@ -17,7 +17,6 @@ from transformers import Trainer from transformers import TrainingArguments - # Configure logger. 
log_formatter = logging.Formatter( "%(asctime)s %(levelname)-8s %(message)s", "%Y-%m-%dT%H:%M:%SZ" From 3f36740364decc59c389bddef9bffa7d9babd285 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 22 Jul 2024 15:36:54 -0700 Subject: [PATCH 06/84] delete the trainer to reuse it in Training Operator Signed-off-by: helenxie-bit --- .../v1beta1/kubeflow/trainer/Dockerfile | 17 -- .../kubeflow/trainer/hf_llm_optimization.py | 195 ------------------ .../v1beta1/kubeflow/trainer/requirements.txt | 4 - 3 files changed, 216 deletions(-) delete mode 100644 sdk/python/v1beta1/kubeflow/trainer/Dockerfile delete mode 100644 sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py delete mode 100644 sdk/python/v1beta1/kubeflow/trainer/requirements.txt diff --git a/sdk/python/v1beta1/kubeflow/trainer/Dockerfile b/sdk/python/v1beta1/kubeflow/trainer/Dockerfile deleted file mode 100644 index c55633ff713..00000000000 --- a/sdk/python/v1beta1/kubeflow/trainer/Dockerfile +++ /dev/null @@ -1,17 +0,0 @@ -# Use an official Pytorch runtime as a parent image -FROM nvcr.io/nvidia/pytorch:23.10-py3 - -# Set the working directory in the container -WORKDIR /app - -# Copy the requirements.txt file into the container -COPY requirements.txt /app/requirements.txt - -# Install any needed packages specified in requirements.txt -RUN pip install --no-cache-dir -r requirements.txt - -# Copy the Python package and its source code into the container -COPY . 
/app - -# Run storage.py when the container launches -ENTRYPOINT ["torchrun", "hf_llm_optimization.py"] \ No newline at end of file diff --git a/sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py b/sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py deleted file mode 100644 index a0050c45071..00000000000 --- a/sdk/python/v1beta1/kubeflow/trainer/hf_llm_optimization.py +++ /dev/null @@ -1,195 +0,0 @@ -import argparse -import json -import logging -import os -from urllib.parse import urlparse - -from datasets import Dataset -from datasets import load_from_disk -from datasets.distributed import split_dataset_by_node -from peft import get_peft_model -from peft import LoraConfig -import transformers -from transformers import AutoModelForCausalLM -from transformers import AutoModelForImageClassification -from transformers import AutoTokenizer -from transformers import DataCollatorForLanguageModeling -from transformers import Trainer -from transformers import TrainingArguments - -# Configure logger. 
-log_formatter = logging.Formatter( - "%(asctime)s %(levelname)-8s %(message)s", "%Y-%m-%dT%H:%M:%SZ" -) -logger = logging.getLogger(__file__) -console_handler = logging.StreamHandler() -console_handler.setFormatter(log_formatter) -logger.addHandler(console_handler) -logger.setLevel(logging.INFO) - - -def setup_model_and_tokenizer(model_uri, transformer_type, model_dir): - # Set up the model and tokenizer - parsed_uri = urlparse(model_uri) - model_name = parsed_uri.netloc + parsed_uri.path - - model = transformer_type.from_pretrained( - pretrained_model_name_or_path=model_name, - cache_dir=model_dir, - local_files_only=True, - trust_remote_code=True, - ) - - tokenizer = AutoTokenizer.from_pretrained( - pretrained_model_name_or_path=model_name, - cache_dir=model_dir, - local_files_only=True, - ) - - # Freeze model parameters - for param in model.parameters(): - param.requires_grad = False - - return model, tokenizer - - -def load_and_preprocess_data(dataset_dir, transformer_type, tokenizer): - # Load and preprocess the dataset - logger.info("Load and preprocess dataset") - - if transformer_type != AutoModelForImageClassification: - dataset = load_from_disk(dataset_dir) - - logger.info(f"Dataset specification: {dataset}") - logger.info("-" * 40) - - logger.info("Tokenize dataset") - # TODO (andreyvelich): Discuss how user should set the tokenizer function. - dataset = dataset.map( - lambda x: tokenizer(x["text"], padding="max_length", truncation=True), - batched=True, - ) - else: - dataset = load_from_disk(dataset_dir) - - # Check if dataset contains `train` key. Otherwise, load full dataset to train_data. - if "train" in dataset: - train_data = dataset["train"] - else: - train_data = dataset - - try: - eval_data = dataset["eval"] or dataset["test"] - except Exception: - eval_data = None - logger.info("Evaluation dataset is not found") - - # Distribute dataset across PyTorchJob workers. 
- RANK = int(os.environ["RANK"]) - WORLD_SIZE = int(os.environ["WORLD_SIZE"]) - logger.info( - f"Distributed dataset across PyTorchJob workers. WORLD_SIZE: {WORLD_SIZE}, RANK: {RANK}" - ) - if isinstance(train_data, Dataset): - train_data = split_dataset_by_node( - train_data, - rank=RANK, - world_size=WORLD_SIZE, - ) - if isinstance(eval_data, Dataset): - eval_data = split_dataset_by_node( - eval_data, - rank=RANK, - world_size=WORLD_SIZE, - ) - - return train_data, eval_data - - -def setup_peft_model(model, lora_config): - # Set up the PEFT model - model.enable_input_require_grads() - model = get_peft_model(model, lora_config) - return model - - -def train_model(model, transformer_type, train_data, eval_data, tokenizer, train_args): - # Setup the Trainer. - trainer = Trainer( - model=model, - train_dataset=train_data, - eval_dataset=eval_data, - args=train_args, - ) - - # TODO (andreyvelich): Currently, data collator is supported only for casual LM Transformer. - if transformer_type == AutoModelForCausalLM: - logger.info("Add data collector for language modeling") - logger.info("-" * 40) - trainer.data_collator = DataCollatorForLanguageModeling( - tokenizer, - pad_to_multiple_of=8, - mlm=False, - ) - - # Train the model. - train_results = trainer.train() - print(f"train_loss={train_results.training_loss}") - - -def parse_arguments(): - parser = argparse.ArgumentParser( - description="Script for training a model with PEFT configuration." 
- ) - - parser.add_argument("--model_uri", help="model uri") - parser.add_argument("--transformer_type", help="model transformer type") - parser.add_argument("--model_dir", help="directory containing model") - parser.add_argument("--dataset_dir", help="directory containing dataset") - parser.add_argument("--lora_config", help="lora_config") - parser.add_argument( - "--training_parameters", help="hugging face training parameters" - ) - - return parser.parse_args() - - -if __name__ == "__main__": - logger.info("Starting HuggingFace LLM Trainer") - args = parse_arguments() - - train_args = TrainingArguments(**json.loads(args.training_parameters)) - reference_args = transformers.TrainingArguments(output_dir=train_args.output_dir) - for key, val in train_args.to_dict().items(): - old_attr = getattr(reference_args, key, None) - if old_attr is not None: - val = type(old_attr)(val) - setattr(train_args, key, val) - - lora_config = LoraConfig(**json.loads(args.lora_config)) - reference_lora_config = LoraConfig() - for key, val in lora_config.__dict__.items(): - old_attr = getattr(reference_lora_config, key, None) - if old_attr is not None: - val = type(old_attr)(val) - setattr(lora_config, key, val) - - transformer_type = getattr(transformers, args.transformer_type) - - logger.info("Setup model and tokenizer") - model, tokenizer = setup_model_and_tokenizer( - args.model_uri, transformer_type, args.model_dir - ) - - logger.info("Preprocess dataset") - train_data, eval_data = load_and_preprocess_data( - args.dataset_dir, transformer_type, tokenizer - ) - - logger.info("Setup LoRA config for model") - model = setup_peft_model(model, lora_config) - - logger.info("Start model training") - train_model(model, transformer_type, train_data, eval_data, tokenizer, train_args) - - logger.info("Training is complete") \ No newline at end of file diff --git a/sdk/python/v1beta1/kubeflow/trainer/requirements.txt b/sdk/python/v1beta1/kubeflow/trainer/requirements.txt deleted file mode 
100644 index ba76f3cdcec..00000000000 --- a/sdk/python/v1beta1/kubeflow/trainer/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -peft==0.3.0 -datasets==2.15.0 -transformers==4.38.0 -evaluate==0.4.0 \ No newline at end of file From 9d202538f24bc5a84ee3e10fb079e7f1301b2115 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 22 Jul 2024 15:43:01 -0700 Subject: [PATCH 07/84] update constants Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/constants/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py index 955ee07a01a..21d421b7bbc 100644 --- a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py +++ b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py @@ -88,4 +88,4 @@ STORAGE_INITIALIZER_IMAGE = "docker.io/kubeflow/storage-initializer" -TRAINER_TRANSFORMER_IMAGE = "" # Need to be built using the `trainer` file \ No newline at end of file +TRAINER_TRANSFORMER_IMAGE = "docker.io/kubeflow/trainer-huggingface" \ No newline at end of file From dfbe793d0b909066bec81cc021480a48cdd6ccd8 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 25 Jul 2024 11:38:50 -0700 Subject: [PATCH 08/84] update metrics format Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 7a661a5bf1b..18f6c0158e4 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -374,8 +374,13 @@ def tune( # Add metrics collector to the Katib Experiment. # Up to now, We only support parameter `kind`, of which default value is `StdOut`, to specify the kind of metrics collector. 
- experiment.spec.metrics_collector = models.V1beta1MetricsCollectorSpec( - collector=models.V1beta1CollectorSpec(kind=metrics_collector_config["kind"]) + experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( + collector=models.V1beta1CollectorSpec(kind=metrics_collector_config["kind"]), + source=models.V1beta1SourceSpec( + filter=models.V1beta1FilterSpec( + metrics_format=["\\'(\\w+)\\':\\s((-?\\d+)(\\.\\d+)?)"] + ) + ) ) # Create Container and Pod specifications. From 290a249426599c8cba7e09d5bbf17d765ef629bf Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 29 Jul 2024 09:38:08 +0800 Subject: [PATCH 09/84] update the type of and Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 146 +++++++++++------- .../kubeflow/katib/constants/constants.py | 2 +- .../v1beta1/kubeflow/katib/utils/utils.py | 1 + sdk/python/v1beta1/setup.py | 4 +- 4 files changed, 94 insertions(+), 59 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 18f6c0158e4..568c150fdee 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -18,7 +18,7 @@ import multiprocessing import textwrap import time -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union, TYPE_CHECKING logger = logging.getLogger(__name__) @@ -168,16 +168,16 @@ def tune( self, # TODO (andreyvelich): How to be consistent with other APIs (name) ? 
name: str, - model_provider_parameters: Optional[Any] = None, - dataset_provider_parameters: Optional[Any] = None, - storage_config: Optional[Dict[str, Optional[Union[str, List[str]]]]] = { + model_provider_parameters: Optional['HuggingFaceModelParams'] = None, + dataset_provider_parameters: Optional[Union['HuggingFaceDatasetParams', 'S3DatasetParams']] = None, + storage_config: Optional[Dict[str, Optional[Union[str, List[str]]]]] = { "size": constants.PVC_DEFAULT_SIZE, "storage_class": None, "access_modes": constants.PVC_DEFAULT_ACCESS_MODES, }, objective: Optional[Callable] = None, base_image: Optional[str] = constants.BASE_IMAGE_TENSORFLOW, - trainer_parameters = None, + trainer_parameters: Union['HuggingFaceTrainerParams', Dict[str, Any]]=None, namespace: Optional[str] = None, env_per_trial: Optional[ Union[Dict[str, str], List[Union[client.V1EnvVar, client.V1EnvFromSource]]] @@ -200,7 +200,7 @@ def tune( metrics_collector_config: Dict[str, Any] = {"kind": "StdOut"}, ): """Create HyperParameter Tuning Katib Experiment using one of the following options: - - External models and datasets: Specify both `model_provider_parameters` and `dataset_provider_parameters` to download models and datasets from external platforms (currently supports HuggingFace and Amazon S3) using the Storage Initializer. The `trainer_parameters` should be of type `HuggingFaceTrainerParams` to set the hyperparameters search space. This API will automatically define the "Trainer" class in HuggingFace with the provided parameters and utilize `Trainer.train()` from HuggingFace to obtain the metrics for optimizing hyperparameters. + - External models and datasets: Specify both `model_provider_parameters` and `dataset_provider_parameters` to download models and datasets from external platforms (currently supports HuggingFace and Amazon S3) using the Storage Initializer. The `trainer_parameters` should be of type `HuggingFaceTrainerParams` to set the hyperparameters search space. 
This API will automatically define the "Trainer" class in HuggingFace with the provided parameters and utilize `Trainer.train()` from HuggingFace to obtain the metrics for optimizing hyperparameters. - Custom objective function: Specify the `objective` parameter to define your own objective function. The `base_image` parameter will be used to execute the objective function. `trainer_parameters` should be a dictionary to define the search space for these parameters. Args: @@ -276,7 +276,7 @@ def tune( to the base image packages. These packages are installed before executing the objective function. pip_index_url: The PyPI url from which to install Python packages. - metrics_collector_config: Specify the config of metrics collector, + metrics_collector_config: Specify the config of metrics collector, for example, `metrics_collector_config = {"kind": "Push"}`. Currently, we only support `StdOut` and `Push` metrics collector. @@ -289,19 +289,17 @@ def tune( print( "Thank you for using `tune` API for LLMs hyperparameters optimization. This feature is in alpha stage Kubeflow community is looking for your feedback. Please share your experience via #kubeflow-katib Slack channel or Kubeflow Katib GitHub." ) - + if ( - ((model_provider_parameters is not None) and (dataset_provider_parameters is not None)) == (objective is not None) - ): + (model_provider_parameters is not None) + and (dataset_provider_parameters is not None) + ) == (objective is not None): raise ValueError( "Invalid configuration for creating a Katib Experiment for hyperparameter optimization. " "You should only specify one of the following options: 1) `model_provider_parameters` and `dataset_provider_parameters`; 2) `objective`." 
) - - if ( - not name - or not trainer_parameters - ): + + if not name or not trainer_parameters: raise ValueError("One of the required parameters is None") namespace = namespace or self.namespace @@ -342,7 +340,7 @@ def tune( experiment.spec.parallel_trial_count = parallel_trial_count if max_failed_trial_count is not None: experiment.spec.max_failed_trial_count = max_failed_trial_count - + # Add resources to the Katib Experiment. if isinstance(resources_per_trial, dict): if "gpu" in resources_per_trial: @@ -371,18 +369,20 @@ def tune( raise ValueError( f"Incorrect value for env_per_trial: {env_per_trial}" ) - + # Add metrics collector to the Katib Experiment. - # Up to now, We only support parameter `kind`, of which default value is `StdOut`, to specify the kind of metrics collector. + # Up to now, We only support parameter `kind`, of which default value is `StdOut`, to specify the kind of metrics collector. experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( - collector=models.V1beta1CollectorSpec(kind=metrics_collector_config["kind"]), + collector=models.V1beta1CollectorSpec( + kind=metrics_collector_config["kind"] + ), source=models.V1beta1SourceSpec( filter=models.V1beta1FilterSpec( metrics_format=["\\'(\\w+)\\':\\s((-?\\d+)(\\.\\d+)?)"] ) - ) + ), ) - + # Create Container and Pod specifications. # If users choose to use a custom objective function. if objective is not None: @@ -441,21 +441,23 @@ def tune( # Install Python packages if that is required. 
if packages_to_install is not None: exec_script = ( - utils.get_script_for_python_packages(packages_to_install, pip_index_url) + utils.get_script_for_python_packages( + packages_to_install, pip_index_url + ) + exec_script ) - + # create app container spec container_spec = client.V1Container( - name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, - image=base_image, - command=["bash", "-c"], - args=[exec_script], - env=env if env else None, - env_from=env_from if env_from else None, - resources=resources_per_trial, - ) - + name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, + image=base_image, + command=["bash", "-c"], + args=[exec_script], + env=env if env else None, + env_from=env_from if env_from else None, + resources=resources_per_trial, + ) + pod_spec = client.V1PodTemplateSpec( metadata=models.V1ObjectMeta( annotations={"sidecar.istio.io/inject": "false"} @@ -469,12 +471,15 @@ def tune( # If users choose to use external models and datasets. else: try: - from kubeflow.storage_initializer.hugging_face import \ - HuggingFaceDatasetParams - from kubeflow.storage_initializer.hugging_face import \ - HuggingFaceModelParams - from kubeflow.storage_initializer.hugging_face import \ - HuggingFaceTrainerParams + from kubeflow.storage_initializer.hugging_face import ( + HuggingFaceDatasetParams, + ) + from kubeflow.storage_initializer.hugging_face import ( + HuggingFaceModelParams, + ) + from kubeflow.storage_initializer.hugging_face import ( + HuggingFaceTrainerParams, + ) from kubeflow.storage_initializer.s3 import S3DatasetParams import peft import transformers @@ -483,7 +488,7 @@ def tune( "Tune API dependencies not installed. " + "Run: pip install -U 'kubeflow-training[huggingface]' " ) - + # Create PVC for the Storage Initializer. 
try: self.core_api.create_namespaced_persistent_volume_claim( @@ -495,7 +500,9 @@ def tune( ), ) except Exception as e: - pvc_list = self.core_api.list_namespaced_persistent_volume_claim(namespace) + pvc_list = self.core_api.list_namespaced_persistent_volume_claim( + namespace + ) # Check if the PVC with the specified name exists. for pvc in pvc_list.items: if pvc.metadata.name == constants.STORAGE_INITIALIZER: @@ -506,27 +513,36 @@ def tune( break else: raise RuntimeError(f"failed to create PVC. Error: {e}") - + if isinstance(model_provider_parameters, HuggingFaceModelParams): mp = "hf" else: - raise ValueError("Model provider parameters must be an instance of HuggingFaceModelParams.") - + raise ValueError( + "Model provider parameters must be an instance of HuggingFaceModelParams." + ) + if isinstance(dataset_provider_parameters, S3DatasetParams): dp = "s3" elif isinstance(dataset_provider_parameters, HuggingFaceDatasetParams): dp = "hf" else: - raise ValueError("Dataset provider parameters must be an instance of S3DatasetParams or HuggingFaceDatasetParams.") - + raise ValueError( + "Dataset provider parameters must be an instance of S3DatasetParams or HuggingFaceDatasetParams." + ) + # Iterate over input parameters. experiment_params = [] trial_params = [] training_args = trainer_parameters.training_parameters - for p_name, p_value in trainer_parameters.training_parameters.to_dict().items(): + for ( + p_name, + p_value, + ) in trainer_parameters.training_parameters.to_dict().items(): if not hasattr(training_args, p_name): - logger.warning(f"Training parameter {p_name} is not supported by the current transformer.") + logger.warning( + f"Training parameter {p_name} is not supported by the current transformer." 
+ ) continue if isinstance(p_value, models.V1beta1ParameterSpec): old_attr = getattr(training_args, p_name, None) @@ -535,7 +551,9 @@ def tune( setattr(training_args, p_name, value) p_value.name = p_name experiment_params.append(p_value) - trial_params.append(models.V1beta1TrialParameterSpec(name=p_name, reference=p_name)) + trial_params.append( + models.V1beta1TrialParameterSpec(name=p_name, reference=p_name) + ) elif p_value is not None: old_attr = getattr(training_args, p_name, None) if old_attr is not None: @@ -545,7 +563,9 @@ def tune( lora_config = trainer_parameters.lora_config for p_name, p_value in trainer_parameters.lora_config.__dict__.items(): if not hasattr(lora_config, p_name): - logger.warning(f"Training parameter {p_name} is not supported by the current peft.") + logger.warning( + f"Training parameter {p_name} is not supported by the current peft." + ) continue if isinstance(p_value, models.V1beta1ParameterSpec): old_attr = getattr(lora_config, p_name, None) @@ -554,7 +574,9 @@ def tune( setattr(lora_config, p_name, value) p_value.name = p_name experiment_params.append(p_value) - trial_params.append(models.V1beta1TrialParameterSpec(name=p_name, reference=p_name)) + trial_params.append( + models.V1beta1TrialParameterSpec(name=p_name, reference=p_name) + ) elif p_value is not None: old_attr = getattr(lora_config, p_name, None) if old_attr is not None: @@ -569,7 +591,9 @@ def tune( "--model_provider", mp, "--model_provider_parameters", - json.dumps(model_provider_parameters.__dict__, cls=utils.SetEncoder), + json.dumps( + model_provider_parameters.__dict__, cls=utils.SetEncoder + ), "--dataset_provider", dp, "--dataset_provider_parameters", @@ -972,7 +996,9 @@ def wait_for_experiment_condition( ) ): utils.print_experiment_status(experiment) - logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") + logger.debug( + f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" + ) return experiment # Raise exception if Experiment 
is Failed. @@ -992,7 +1018,9 @@ def wait_for_experiment_condition( ) ): utils.print_experiment_status(experiment) - logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") + logger.debug( + f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" + ) return experiment # Check if Experiment reaches Running condition. @@ -1003,7 +1031,9 @@ def wait_for_experiment_condition( ) ): utils.print_experiment_status(experiment) - logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") + logger.debug( + f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" + ) return experiment # Check if Experiment reaches Restarting condition. @@ -1014,7 +1044,9 @@ def wait_for_experiment_condition( ) ): utils.print_experiment_status(experiment) - logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") + logger.debug( + f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" + ) return experiment # Check if Experiment reaches Succeeded condition. @@ -1025,7 +1057,9 @@ def wait_for_experiment_condition( ) ): utils.print_experiment_status(experiment) - logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") + logger.debug( + f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" + ) return experiment # Otherwise, print the current Experiment results and sleep for the pooling interval. 
diff --git a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py index 21d421b7bbc..8de550e2fe7 100644 --- a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py +++ b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py @@ -88,4 +88,4 @@ STORAGE_INITIALIZER_IMAGE = "docker.io/kubeflow/storage-initializer" -TRAINER_TRANSFORMER_IMAGE = "docker.io/kubeflow/trainer-huggingface" \ No newline at end of file +TRAINER_TRANSFORMER_IMAGE = "docker.io/kubeflow/trainer-huggingface" diff --git a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py index 2a2e2b4b4b8..8c90a001d96 100644 --- a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py +++ b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py @@ -72,6 +72,7 @@ def print_experiment_status(experiment: models.V1beta1Experiment): print(f"Current Optimal Trial:\n {experiment.status.current_optimal_trial}") print(f"Experiment conditions:\n {experiment.status.conditions}") + def validate_metrics_value(value: Any): """Validate if the metrics value can be converted to type `float`.""" try: diff --git a/sdk/python/v1beta1/setup.py b/sdk/python/v1beta1/setup.py index 757ccd4a05d..b715be4f7c3 100644 --- a/sdk/python/v1beta1/setup.py +++ b/sdk/python/v1beta1/setup.py @@ -70,6 +70,6 @@ ], install_requires=REQUIRES, extras_require={ - "huggingface": ["kubeflow-training[huggingface]"], - }, + "huggingface": ["kubeflow-training[huggingface]"], + }, ) From aba2606e010bac8a05bbd3f00544368de64e0bb9 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 29 Jul 2024 09:43:49 +0800 Subject: [PATCH 10/84] update the message of 'ImportError' Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 568c150fdee..1d45200b8f2 
100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -486,7 +486,7 @@ def tune( except ImportError: raise ImportError( "Tune API dependencies not installed. " - + "Run: pip install -U 'kubeflow-training[huggingface]' " + + "Run: pip install -U 'kubeflow-katib[huggingface]' " ) # Create PVC for the Storage Initializer. From eaf0193a9edeaf8e39fc72df8e0bc156b45b72ad Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 29 Jul 2024 09:48:08 +0800 Subject: [PATCH 11/84] add TODO of PVC creation Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 1d45200b8f2..d10890238da 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -490,6 +490,7 @@ def tune( ) # Create PVC for the Storage Initializer. + # TODO (helenxie-bit): PVC Creation should be part of Katib Controller. 
try: self.core_api.create_namespaced_persistent_volume_claim( namespace=namespace, From 62355a2bc3acb142d00bc7401dc6e2982f87a529 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 29 Jul 2024 10:25:02 +0800 Subject: [PATCH 12/84] update the name of pvc Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index d10890238da..7667f6e30fd 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -495,7 +495,7 @@ def tune( self.core_api.create_namespaced_persistent_volume_claim( namespace=namespace, body=utils.get_pvc_spec( - pvc_name=constants.STORAGE_INITIALIZER, + pvc_name=name, namespace=namespace, storage_config=storage_config, ), @@ -506,9 +506,9 @@ def tune( ) # Check if the PVC with the specified name exists. for pvc in pvc_list.items: - if pvc.metadata.name == constants.STORAGE_INITIALIZER: + if pvc.metadata.name == name: print( - f"PVC '{constants.STORAGE_INITIALIZER}' already exists in namespace " + f"PVC '{name}' already exists in namespace " f"{namespace}." 
) break From 7b2b40eaa5fb89e0cc8b3b57787acbad2493ce89 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 29 Jul 2024 16:31:41 +0800 Subject: [PATCH 13/84] reuse constants from Training Operator Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 29 +++++++++++++------ .../kubeflow/katib/constants/constants.py | 22 -------------- 2 files changed, 20 insertions(+), 31 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 7667f6e30fd..484ae04167a 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -471,6 +471,17 @@ def tune( # If users choose to use external models and datasets. else: try: + from kubeflow.training.constants.constants import( + STORAGE_INITIALIZER, + STORAGE_INITIALIZER_VOLUME_MOUNT, + STORAGE_INITIALIZER_VOLUME, + STORAGE_INITIALIZER_IMAGE, + TRAINER_TRANSFORMER_IMAGE, + ) + from kubeflow.storage_initializer.constants import ( + VOLUME_PATH_DATASET, + VOLUME_PATH_MODEL, + ) from kubeflow.storage_initializer.hugging_face import ( HuggingFaceDatasetParams, ) @@ -586,8 +597,8 @@ def tune( # create init container spec. init_container_spec = client.V1Container( - name=constants.STORAGE_INITIALIZER, - image=constants.STORAGE_INITIALIZER_IMAGE, + name=STORAGE_INITIALIZER, + image=STORAGE_INITIALIZER_IMAGE, args=[ "--model_provider", mp, @@ -600,7 +611,7 @@ def tune( "--dataset_provider_parameters", json.dumps(dataset_provider_parameters.__dict__), ], - volume_mounts=[constants.STORAGE_INITIALIZER_VOLUME_MOUNT], + volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], ) lora_config = json.dumps(lora_config.__dict__, cls=utils.SetEncoder) @@ -608,22 +619,22 @@ def tune( # create app container spec. 
container_spec = client.V1Container( name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, - image=constants.TRAINER_TRANSFORMER_IMAGE, + image=TRAINER_TRANSFORMER_IMAGE, args=[ "--model_uri", model_provider_parameters.model_uri, "--transformer_type", model_provider_parameters.transformer_type.__name__, "--model_dir", - constants.VOLUME_PATH_MODEL, + VOLUME_PATH_MODEL, "--dataset_dir", - constants.VOLUME_PATH_DATASET, + VOLUME_PATH_DATASET, "--lora_config", f"'{lora_config}'", "--training_parameters", f"'{training_args}'", ], - volume_mounts=[constants.STORAGE_INITIALIZER_VOLUME_MOUNT], + volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], env=env if env else None, env_from=env_from if env_from else None, resources=resources_per_trial, @@ -637,7 +648,7 @@ def tune( restart_policy="Never", containers=[container_spec], init_containers=[init_container_spec], - volumes=[constants.STORAGE_INITIALIZER_VOLUME], + volumes=[STORAGE_INITIALIZER_VOLUME], ), ) @@ -957,7 +968,7 @@ def wait_for_experiment_condition( name: str, namespace: Optional[str] = None, expected_condition: str = constants.EXPERIMENT_CONDITION_SUCCEEDED, - timeout: int = 600, + timeout: int = 6000, polling_interval: int = 15, apiserver_timeout: int = constants.DEFAULT_TIMEOUT, ): diff --git a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py index 8de550e2fe7..fa4e5882727 100644 --- a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py +++ b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py @@ -63,29 +63,7 @@ DEFAULT_DB_MANAGER_ADDRESS = "katib-db-manager.kubeflow:6789" -# Constants for Tune API. -STORAGE_INITIALIZER = "storage-initializer" # The default value for dataset and model storage PVC. PVC_DEFAULT_SIZE = "10Gi" # The default value for PVC access modes. 
PVC_DEFAULT_ACCESS_MODES = ["ReadWriteOnce", "ReadOnlyMany"] - -INIT_CONTAINER_MOUNT_PATH = "/workspace" -VOLUME_PATH_DATASET = INIT_CONTAINER_MOUNT_PATH + "/dataset" -VOLUME_PATH_MODEL = INIT_CONTAINER_MOUNT_PATH + "/model" - -STORAGE_INITIALIZER_VOLUME_MOUNT = client.V1VolumeMount( - name=STORAGE_INITIALIZER, - mount_path=INIT_CONTAINER_MOUNT_PATH, -) - -STORAGE_INITIALIZER_VOLUME = client.V1Volume( - name=STORAGE_INITIALIZER, - persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource( - claim_name=STORAGE_INITIALIZER - ), -) - -STORAGE_INITIALIZER_IMAGE = "docker.io/kubeflow/storage-initializer" - -TRAINER_TRANSFORMER_IMAGE = "docker.io/kubeflow/trainer-huggingface" From acd1dcf07bc871eb2e2004f1f63cd90203e9dedb Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 30 Jul 2024 10:37:34 +0800 Subject: [PATCH 14/84] keep 'parameters' and update validation Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 99 ++++++++++++------- 1 file changed, 64 insertions(+), 35 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 484ae04167a..64f5cc53aea 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -18,7 +18,7 @@ import multiprocessing import textwrap import time -from typing import Any, Callable, Dict, List, Optional, Union, TYPE_CHECKING +from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING, Union logger = logging.getLogger(__name__) @@ -168,8 +168,11 @@ def tune( self, # TODO (andreyvelich): How to be consistent with other APIs (name) ? 
name: str, - model_provider_parameters: Optional['HuggingFaceModelParams'] = None, - dataset_provider_parameters: Optional[Union['HuggingFaceDatasetParams', 'S3DatasetParams']] = None, + model_provider_parameters: Optional["HuggingFaceModelParams"] = None, + dataset_provider_parameters: Optional[ + Union["HuggingFaceDatasetParams", "S3DatasetParams"] + ] = None, + trainer_parameters: Optional["HuggingFaceTrainerParams"] = None, storage_config: Optional[Dict[str, Optional[Union[str, List[str]]]]] = { "size": constants.PVC_DEFAULT_SIZE, "storage_class": None, @@ -177,7 +180,7 @@ def tune( }, objective: Optional[Callable] = None, base_image: Optional[str] = constants.BASE_IMAGE_TENSORFLOW, - trainer_parameters: Union['HuggingFaceTrainerParams', Dict[str, Any]]=None, + parameters: Optional[Dict[str, Any]] = None, namespace: Optional[str] = None, env_per_trial: Optional[ Union[Dict[str, str], List[Union[client.V1EnvVar, client.V1EnvFromSource]]] @@ -200,8 +203,12 @@ def tune( metrics_collector_config: Dict[str, Any] = {"kind": "StdOut"}, ): """Create HyperParameter Tuning Katib Experiment using one of the following options: - - External models and datasets: Specify both `model_provider_parameters` and `dataset_provider_parameters` to download models and datasets from external platforms (currently supports HuggingFace and Amazon S3) using the Storage Initializer. The `trainer_parameters` should be of type `HuggingFaceTrainerParams` to set the hyperparameters search space. This API will automatically define the "Trainer" class in HuggingFace with the provided parameters and utilize `Trainer.train()` from HuggingFace to obtain the metrics for optimizing hyperparameters. - - Custom objective function: Specify the `objective` parameter to define your own objective function. The `base_image` parameter will be used to execute the objective function. `trainer_parameters` should be a dictionary to define the search space for these parameters. + 1. 
External models and datasets + Parameters: `model_provider_parameters` + `dataset_provider_parameters` + `trainer_parameters`. + Usage: Specify both `model_provider_parameters` and `dataset_provider_parameters` to download models and datasets from external platforms (currently supports HuggingFace and Amazon S3) using the Storage Initializer. The `trainer_parameters` should be of type `HuggingFaceTrainerParams` to set the hyperparameters search space. This API will automatically define the "Trainer" class in HuggingFace with the provided parameters and utilize `Trainer.train()` from HuggingFace to obtain the metrics for optimizing hyperparameters. + 2. Custom objective function + Parameters: `objective` + `base_image` + `parameters`. + Usage: Specify the `objective` parameter to define your own objective function. The `base_image` parameter will be used to execute the objective function. The `parameters` should be a dictionary to define the search space for these parameters. Args: name: Name for the Experiment. @@ -209,6 +216,15 @@ def tune( For example, HuggingFace model name and Transformer type for that model, like: AutoModelForSequenceClassification. This argument must be the type of `kubeflow.storage_initializer.hugging_face.HuggingFaceModelParams`. dataset_provider_parameters: Parameters for the dataset provider in the Storage Initializer. For example, name of the HuggingFace dataset or AWS S3 configuration. This argument must be the type of `kubeflow.storage_initializer.hugging_face.HuggingFaceDatasetParams` or `kubeflow.storage_initializer.s3.S3DatasetParams` + trainer_parameters: Parameters for configuring the training process, including settings for the hyperparameters search space. It should be of type `HuggingFaceTrainerParams`. 
You should use the Katib SDK to define the search space for these parameters.For example: + ``` + trainer_parameters = HuggingFaceTrainerParams( + training_parameters = transformers.TrainingArguments( + learning_rate = katib.search.double(min=0.1, max=0.2), + ), + ), + ``` + Also, you can use these parameters to define input for training the models. storage_config: Configuration for Storage Initializer PVC to download pre-trained model and dataset. You can configure PVC size and storage class name in this argument. objective: Objective function that Katib uses to train the model. @@ -217,21 +233,11 @@ def tune( The function should not use any code declared outside of the function definition. Import statements must be added inside the function. base_image: Image to use when executing the objective function. - trainer_parameters: Parameters for configuring the training process, including settings for the hyperparameters search space. - You should use the Katib SDK to define the search space for these parameters. - If you choose to use external models and datasets, it should be of type `HuggingFaceTrainerParams`. For example: - ``` - trainer_parameters = HuggingFaceTrainerParams( - training_parameters = transformers.TrainingArguments( - learning_rate = katib.search.double(min=0.1, max=0.2), - ), - ), - ``` - If you choose a custom objective function, it should be a dictionary. For example: + parameters: Dict of hyperparameters to optimize if you choose a custom objective function. You should use the Katib SDK to define the search space for these parameters. For example: ``` - trainer_parameters = {"lr": katib.search.double(min=0.1, max=0.2)} + parameters = {"lr": katib.search.double(min=0.1, max=0.2)}` ``` - Also, you can use these parameters to define input for training the external models or your custom objective function. + Also, you can use these parameters to define input for your objective function. namespace: Namespace for the Experiment. 
env_per_trial: Environment variable(s) to be attached to each trial container. You can specify a dictionary as a mapping object representing the environment @@ -287,20 +293,29 @@ def tune( """ print( - "Thank you for using `tune` API for LLMs hyperparameters optimization. This feature is in alpha stage Kubeflow community is looking for your feedback. Please share your experience via #kubeflow-katib Slack channel or Kubeflow Katib GitHub." + "Thank you for using the `tune` API for LLM hyperparameter optimization. " + "You can create a HyperParameter Optimization Katib Experiment using one of the following options:\n" + "1. Use external models and datasets: specify `model_provider_parameters`, `dataset_provider_parameters` and `trainer_parameters`.\n" + "2. Use custom objective function: specify `objective`, `base_image` and `parameters`.\n" + "This feature is in the alpha stage. The Kubeflow community is looking for your feedback. Please share your experience via the #kubeflow-katib Slack channel or the Kubeflow Katib GitHub." ) if ( - (model_provider_parameters is not None) - and (dataset_provider_parameters is not None) - ) == (objective is not None): + model_provider_parameters is not None + or dataset_provider_parameters is not None + or trainer_parameters is not None + ) and ( + objective is not None or base_image is not None or parameters is not None + ): raise ValueError( "Invalid configuration for creating a Katib Experiment for hyperparameter optimization. " - "You should only specify one of the following options: 1) `model_provider_parameters` and `dataset_provider_parameters`; 2) `objective`." + "You should only specify one of the following options:\n" + "1. Use external models and datasets: specify `model_provider_parameters`, `dataset_provider_parameters` and `trainer_parameters`;\n" + "2. Use custom objective function: specify `objective`, `base_image` and `parameters`." 
) - if not name or not trainer_parameters: - raise ValueError("One of the required parameters is None") + if not name: + raise ValueError("Please specify name for the Experiment.") namespace = namespace or self.namespace @@ -378,7 +393,12 @@ def tune( ), source=models.V1beta1SourceSpec( filter=models.V1beta1FilterSpec( - metrics_format=["\\'(\\w+)\\':\\s((-?\\d+)(\\.\\d+)?)"] + metrics_format=[ + # For example: train_loss=0.846 + r"([\w|-]+)\s*=\s*([+-]?\d*(\.\d+)?([Ee][+-]?\d+)?)", + # For example: 'train_loss':0.846 + r"'([\w|-]+)'\s*:\s*([+-]?\d*(\.\d+)?([Ee][+-]?\d+)?)", + ] ) ), ) @@ -386,6 +406,9 @@ def tune( # Create Container and Pod specifications. # If users choose to use a custom objective function. if objective is not None: + if not base_image or not parameters: + raise ValueError("One of the required parameters is None.") + # Validate objective function. utils.validate_objective_function(objective) @@ -400,7 +423,7 @@ def tune( input_params = {} experiment_params = [] trial_params = [] - for p_name, p_value in trainer_parameters.items(): + for p_name, p_value in parameters.items(): # If input parameter value is Katib Experiment parameter sample. if isinstance(p_value, models.V1beta1ParameterSpec): # Wrap value for the function input. @@ -447,7 +470,7 @@ def tune( + exec_script ) - # create app container spec + # Create app container spec container_spec = client.V1Container( name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, image=base_image, @@ -470,8 +493,15 @@ def tune( # If users choose to use external models and datasets. 
else: + if ( + not model_provider_parameters + or not dataset_provider_parameters + or not trainer_parameters + ): + raise ValueError("One of the required parameters is None") + try: - from kubeflow.training.constants.constants import( + from kubeflow.training.constants.constants import ( STORAGE_INITIALIZER, STORAGE_INITIALIZER_VOLUME_MOUNT, STORAGE_INITIALIZER_VOLUME, @@ -519,8 +549,7 @@ def tune( for pvc in pvc_list.items: if pvc.metadata.name == name: print( - f"PVC '{name}' already exists in namespace " - f"{namespace}." + f"PVC '{name}' already exists in namespace " f"{namespace}." ) break else: @@ -595,7 +624,7 @@ def tune( value = type(old_attr)(p_value) setattr(lora_config, p_name, value) - # create init container spec. + # Create init container spec. init_container_spec = client.V1Container( name=STORAGE_INITIALIZER, image=STORAGE_INITIALIZER_IMAGE, @@ -616,7 +645,7 @@ def tune( lora_config = json.dumps(lora_config.__dict__, cls=utils.SetEncoder) training_args = json.dumps(training_args.to_dict()) - # create app container spec. + # Create app container spec. 
container_spec = client.V1Container( name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, image=TRAINER_TRANSFORMER_IMAGE, @@ -968,7 +997,7 @@ def wait_for_experiment_condition( name: str, namespace: Optional[str] = None, expected_condition: str = constants.EXPERIMENT_CONDITION_SUCCEEDED, - timeout: int = 6000, + timeout: int = 600, polling_interval: int = 15, apiserver_timeout: int = constants.DEFAULT_TIMEOUT, ): From 10b057df5cff36b913e9f68da10c246122398809 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 1 Aug 2024 07:01:56 +0800 Subject: [PATCH 15/84] update for test Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 260 +++++++++++++----- .../v1beta1/kubeflow/katib/types/__init__.py | 7 + .../kubeflow/katib/types/trainer_resources.py | 139 ++++++++++ sdk/python/v1beta1/test_llm.py | 63 +++++ 4 files changed, 397 insertions(+), 72 deletions(-) create mode 100644 sdk/python/v1beta1/kubeflow/katib/types/__init__.py create mode 100644 sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py create mode 100644 sdk/python/v1beta1/test_llm.py diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 64f5cc53aea..5a9b1ea5bff 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -24,6 +24,7 @@ import grpc from kubeflow.katib import models +from kubeflow.katib import types from kubeflow.katib.api_client import ApiClient from kubeflow.katib.constants import constants import kubeflow.katib.katib_api_pb2 as katib_api_pb2 @@ -196,7 +197,7 @@ def tune( max_trial_count: int = None, parallel_trial_count: int = None, max_failed_trial_count: int = None, - resources_per_trial: Union[dict, client.V1ResourceRequirements, None] = None, + resources_per_trial: Union[dict, client.V1ResourceRequirements, types.TrainerResources, None] = None, retain_trials: bool = False, packages_to_install: List[str] = None, 
pip_index_url: str = "https://pypi.org/simple", @@ -357,14 +358,14 @@ def tune( experiment.spec.max_failed_trial_count = max_failed_trial_count # Add resources to the Katib Experiment. - if isinstance(resources_per_trial, dict): - if "gpu" in resources_per_trial: - resources_per_trial["nvidia.com/gpu"] = resources_per_trial.pop("gpu") + #if isinstance(resources_per_trial, dict): + # if "gpu" in resources_per_trial: + # resources_per_trial["nvidia.com/gpu"] = resources_per_trial.pop("gpu") - resources_per_trial = client.V1ResourceRequirements( - requests=resources_per_trial, - limits=resources_per_trial, - ) + # resources_per_trial = client.V1ResourceRequirements( + # requests=resources_per_trial, + # limits=resources_per_trial, + # ) # Add environment variables to the Katib Experiment. env = [] @@ -413,11 +414,11 @@ def tune( utils.validate_objective_function(objective) # Extract objective function implementation. - objective_code = inspect.getsource(objective) + #objective_code = inspect.getsource(objective) # Objective function might be defined in some indented scope # (e.g. in another function). We need to dedent the function code. - objective_code = textwrap.dedent(objective_code) + #objective_code = textwrap.dedent(objective_code) # Iterate over input parameters. input_params = {} @@ -445,51 +446,94 @@ def tune( # def objective(parameters): # print(f'Parameters are {parameters}') # objective({'lr': '${trialParameters.lr}', 'epochs': '${trialParameters.epochs}', 'is_dist': False}) - objective_code = f"{objective_code}\n{objective.__name__}({input_params})\n" + #objective_code = f"{objective_code}\n{objective.__name__}({input_params})\n" # Prepare execute script template. 
- exec_script = textwrap.dedent( - """ - program_path=$(mktemp -d) - read -r -d '' SCRIPT << EOM\n - {objective_code} - EOM - printf "%s" "$SCRIPT" > $program_path/ephemeral_objective.py - python3 -u $program_path/ephemeral_objective.py""" - ) + #exec_script = textwrap.dedent( + # """ + # program_path=$(mktemp -d) + # read -r -d '' SCRIPT << EOM\n + # {objective_code} + # EOM + # printf "%s" "$SCRIPT" > $program_path/ephemeral_objective.py + # python3 -u $program_path/ephemeral_objective.py""" + #) # Add objective code to the execute script. - exec_script = exec_script.format(objective_code=objective_code) + #exec_script = exec_script.format(objective_code=objective_code) # Install Python packages if that is required. - if packages_to_install is not None: - exec_script = ( - utils.get_script_for_python_packages( - packages_to_install, pip_index_url - ) - + exec_script - ) - + #if packages_to_install is not None: + # exec_script = ( + # utils.get_script_for_python_packages( + # packages_to_install, pip_index_url + # ) + # + exec_script + # ) + + from kubeflow.training.utils import get_container_spec, get_pod_template_spec, get_pytorchjob_template # Create app container spec - container_spec = client.V1Container( + container_spec = get_container_spec( name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, image=base_image, - command=["bash", "-c"], - args=[exec_script], - env=env if env else None, - env_from=env_from if env_from else None, - resources=resources_per_trial, + train_func=objective, + train_func_parameters=input_params, + packages_to_install=packages_to_install, + pip_index_url=pip_index_url, + resources=resources_per_trial.resources_per_worker if isinstance(resources_per_trial, types.TrainerResources) else resources_per_trial, + env=env, + env_from=env_from, ) + #container_spec = client.V1Container( + # name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, + # image=base_image, + # command=["bash", "-c"], + # args=[exec_script], + # env=env if env else None, + # 
env_from=env_from if env_from else None, + # resources=resources_per_trial, + #) + + if isinstance(resources_per_trial, dict) or isinstance(resources_per_trial, client.V1ResourceRequirements): + pod_spec = get_pod_template_spec( + containers = [container_spec], + restart_policy="Never", + ) + #pod_spec = client.V1PodTemplateSpec( + # metadata=models.V1ObjectMeta( + # annotations={"sidecar.istio.io/inject": "false"} + # ), + # spec=client.V1PodSpec( + # restart_policy="Never", + # containers=[container_spec], + # ), + #) + # Create Trial specification. + trial_spec = client.V1Job( + api_version="batch/v1", + kind="Job", + spec=client.V1JobSpec( + template=pod_spec, + ), + ) + else: + worker_pod_spec = get_pod_template_spec( + containers = [container_spec], + restart_policy="Never", + ) - pod_spec = client.V1PodTemplateSpec( - metadata=models.V1ObjectMeta( - annotations={"sidecar.istio.io/inject": "false"} - ), - spec=client.V1PodSpec( + master_pod_spec = get_pod_template_spec( + containers = [container_spec], restart_policy="Never", - containers=[container_spec], - ), - ) + ) + trial_spec = get_pytorchjob_template( + name=name, + namespace=namespace, + master_pod_template_spec=master_pod_spec, + worker_pod_template_spec=worker_pod_spec, + num_workers=resources_per_trial.num_workers, + num_procs_per_worker=resources_per_trial.num_procs_per_worker, + ) # If users choose to use external models and datasets. else: @@ -504,7 +548,7 @@ def tune( from kubeflow.training.constants.constants import ( STORAGE_INITIALIZER, STORAGE_INITIALIZER_VOLUME_MOUNT, - STORAGE_INITIALIZER_VOLUME, + #STORAGE_INITIALIZER_VOLUME, STORAGE_INITIALIZER_IMAGE, TRAINER_TRANSFORMER_IMAGE, ) @@ -625,16 +669,14 @@ def tune( setattr(lora_config, p_name, value) # Create init container spec. 
- init_container_spec = client.V1Container( + init_container_spec = get_container_spec( name=STORAGE_INITIALIZER, - image=STORAGE_INITIALIZER_IMAGE, + base_image=STORAGE_INITIALIZER_IMAGE, args=[ "--model_provider", mp, "--model_provider_parameters", - json.dumps( - model_provider_parameters.__dict__, cls=utils.SetEncoder - ), + json.dumps(model_provider_parameters.__dict__, cls=utils.SetEncoder), "--dataset_provider", dp, "--dataset_provider_parameters", @@ -642,13 +684,30 @@ def tune( ], volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], ) + #init_container_spec = client.V1Container( + # name=STORAGE_INITIALIZER, + # image=STORAGE_INITIALIZER_IMAGE, + # args=[ + # "--model_provider", + # mp, + # "--model_provider_parameters", + # json.dumps( + # model_provider_parameters.__dict__, cls=utils.SetEncoder + # ), + # "--dataset_provider", + # dp, + # "--dataset_provider_parameters", + # json.dumps(dataset_provider_parameters.__dict__), + # ], + # volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], + #) lora_config = json.dumps(lora_config.__dict__, cls=utils.SetEncoder) training_args = json.dumps(training_args.to_dict()) - # Create app container spec. - container_spec = client.V1Container( + + container_spec = get_container_spec( name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, - image=TRAINER_TRANSFORMER_IMAGE, + base_image=TRAINER_TRANSFORMER_IMAGE, args=[ "--model_uri", model_provider_parameters.model_uri, @@ -664,31 +723,88 @@ def tune( f"'{training_args}'", ], volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], - env=env if env else None, - env_from=env_from if env_from else None, - resources=resources_per_trial, + resources=resources_per_trial.resources_per_worker if isinstance(resources_per_trial, types.TrainerResources) else resources_per_trial, + env=env, + env_from=env_from, ) - - pod_spec = client.V1PodTemplateSpec( - metadata=models.V1ObjectMeta( - annotations={"sidecar.istio.io/inject": "false"} + # Create app container spec. 
+ #container_spec = client.V1Container( + # name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, + # image=TRAINER_TRANSFORMER_IMAGE, + # args=[ + # "--model_uri", + # model_provider_parameters.model_uri, + # "--transformer_type", + # model_provider_parameters.transformer_type.__name__, + # "--model_dir", + # VOLUME_PATH_MODEL, + # "--dataset_dir", + # VOLUME_PATH_DATASET, + # "--lora_config", + # f"'{lora_config}'", + # "--training_parameters", + # f"'{training_args}'", + # ], + # volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], + # env=env if env else None, + # env_from=env_from if env_from else None, + # resources=resources_per_trial, + #) + + storage_initializer_volume = models.V1Volume( + name=STORAGE_INITIALIZER, + persistent_volume_claim=models.V1PersistentVolumeClaimVolumeSource( + claim_name=name ), - spec=client.V1PodSpec( - restart_policy="Never", + ) + + if isinstance(resources_per_trial, dict) or isinstance(resources_per_trial, client.V1ResourceRequirements): + pod_spec = get_pod_template_spec( containers=[container_spec], init_containers=[init_container_spec], - volumes=[STORAGE_INITIALIZER_VOLUME], - ), - ) + volumes=[storage_initializer_volume], + restart_policy="Never", + ) + #pod_spec = client.V1PodTemplateSpec( + # metadata=models.V1ObjectMeta( + # annotations={"sidecar.istio.io/inject": "false"} + # ), + # spec=client.V1PodSpec( + # restart_policy="Never", + # containers=[container_spec], + # init_containers=[init_container_spec], + # volumes=[STORAGE_INITIALIZER_VOLUME], + # ), + #) + # Create Trial specification. + trial_spec = client.V1Job( + api_version="batch/v1", + kind="Job", + spec=client.V1JobSpec( + template=pod_spec, + ), + ) + else: + # create worker pod spec + worker_pod_spec = get_pod_template_spec( + containers=[container_spec], + volumes=[storage_initializer_volume], + ) - # Create Trial specification. 
- trial_spec = client.V1Job( - api_version="batch/v1", - kind="Job", - spec=client.V1JobSpec( - template=pod_spec, - ), - ) + # create master pod spec + master_pod_spec = get_pod_template_spec( + containers=[container_spec], + init_containers=[init_container_spec], + volumes=[storage_initializer_volume], + ) + trial_spec = get_pytorchjob_template( + name=name, + namespace=namespace, + master_pod_template_spec=master_pod_spec, + worker_pod_template_spec=worker_pod_spec, + num_workers=resources_per_trial.num_workers, + num_procs_per_worker=resources_per_trial.num_procs_per_worker, + ) # Create Trial template. trial_template = models.V1beta1TrialTemplate( diff --git a/sdk/python/v1beta1/kubeflow/katib/types/__init__.py b/sdk/python/v1beta1/kubeflow/katib/types/__init__.py new file mode 100644 index 00000000000..a38761478a0 --- /dev/null +++ b/sdk/python/v1beta1/kubeflow/katib/types/__init__.py @@ -0,0 +1,7 @@ +from __future__ import absolute_import + +# Import types into type package +from kubeflow.katib.types.trainer_resources import TrainerResources + +# Import Kubernetes models. 
+from kubernetes.client import * \ No newline at end of file diff --git a/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py b/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py new file mode 100644 index 00000000000..6ae7fa5741b --- /dev/null +++ b/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py @@ -0,0 +1,139 @@ +import pprint +import re + +import six + +from kubeflow.katib.configuration import Configuration + + +class TrainerResources(object): + def __init__(self, num_workers=None, num_procs_per_worker=None, resources_per_worker=None, local_vars_configuration=None): + if local_vars_configuration is None: + local_vars_configuration = Configuration() + self.local_vars_configuration = local_vars_configuration + + self._num_workers = None + self._num_procs_per_worker = None + self._resources_per_worker = None + + if num_workers is not None: + self.num_workers = num_workers + if num_procs_per_worker is not None: + self.num_procs_per_worker = num_procs_per_worker + if resources_per_worker is not None: + self.resources_per_worker = resources_per_worker + + @property + def num_workers(self): + """Gets the number of workers of distributed training. + + Number of workers is setting number of workers. + + :return: The number of workers of distributed training. + :rtype: int + """ + return self._num_workers + + @num_workers.setter + def num_workers(self, num_workers): + """Sets the number of workers of distributed training. + + Number of workers is setting number of workers. + + :param num_workers: The number of workers of distributed training. + :type: int + """ + + self._num_workers = num_workers + + @property + def num_procs_per_worker(self): + """Gets the number of processes per worker of distributed training. + + Number of processes per worker is the setting number of processes per worker. + + :return: The number of processed per worker of distributed training. 
+ :rtype: int + """ + return self._num_procs_per_worker + + @num_procs_per_worker.setter + def num_procs_per_worker(self, num_procs_per_worker): + """Sets the number of processes per worker of distributed training. + + Number of processes per worker is the setting number of processes per worker. + + :param num_procs_per_worker: The number of processes per worker of distributed training. + :type: int + """ + + self._num_procs_per_worker = num_procs_per_worker + + @property + def resources_per_worker(self): + """Gets the resources per worker of distributed training. + + Resources per worker is the setting resources per worker. + + :return: The resources per worker of distributed training. + :rtype: dict or V1ResourceRequirements + """ + return self._resources_per_worker + + @resources_per_worker.setter + def resources_per_worker(self, resources_per_worker): + """Sets the resources per worker of distributed training. + + Resources per worker is the setting resources per worker. + + :param resources_per_worker: The resources per worker of distributed training. 
+ :type: dict or V1ResourceRequirements + """ + + self._resources_per_worker = resources_per_worker + + def to_dict(self): + """Returns the resources properties as a dict""" + result = {} + + for attr, _ in six.iteritems(self.__dict__): + value = getattr(self, attr) + if isinstance(value, list): + result[attr] = list(map( + lambda x: x.to_dict() if hasattr(x, "to_dict") else x, + value + )) + elif hasattr(value, "to_dict"): + result[attr] = value.to_dict() + elif isinstance(value, dict): + result[attr] = dict(map( + lambda item: (item[0], item[1].to_dict()) + if hasattr(item[1], "to_dict") else item, + value.items() + )) + else: + result[attr] = value + + return result + + def to_str(self): + """Returns the string representation of the model""" + return pprint.pformat(self.to_dict()) + + def __repr__(self): + """For `print` and `pprint`""" + return self.to_str() + + def __eq__(self, other): + """Returns true if both objects are equal""" + if not isinstance(other, TrainerResources): + return False + + return self.to_dict() == other.to_dict() + + def __ne__(self, other): + """Returns true if both objects are not equal""" + if not isinstance(other, TrainerResources): + return True + + return self.to_dict() != other.to_dict() diff --git a/sdk/python/v1beta1/test_llm.py b/sdk/python/v1beta1/test_llm.py new file mode 100644 index 00000000000..4bc81efb459 --- /dev/null +++ b/sdk/python/v1beta1/test_llm.py @@ -0,0 +1,63 @@ +import kubeflow.katib as katib +from kubeflow.katib import KatibClient + +import transformers +from peft import LoraConfig + +from kubeflow.storage_initializer.hugging_face import ( + HuggingFaceModelParams, + HuggingFaceDatasetParams, + HuggingFaceTrainerParams, +) + +cl = KatibClient(namespace="kubeflow") + + +# [3] Create Katib Experiment with 12 Trials and 2 CPUs per Trial. +name = "llm-experiment" +cl.tune( + name = name, + # BERT model URI and type of Transformer to train it. 
+ model_provider_parameters = HuggingFaceModelParams( + model_uri = "hf://google-bert/bert-base-cased", + transformer_type = transformers.AutoModelForSequenceClassification, + ), + # Use 3000 samples from Yelp dataset. + dataset_provider_parameters = HuggingFaceDatasetParams( + repo_id = "yelp_review_full", + split = "train[:8]", + ), + # Specify HuggingFace Trainer parameters. + trainer_parameters = HuggingFaceTrainerParams( + training_parameters = transformers.TrainingArguments( + output_dir = "test_tune_api", + save_strategy = "no", + learning_rate = katib.search.double(min=1e-05, max=5e-05), + #no_cuda=True, #if you use cpu instead of gpu + #use_cpu=True, #if you use cpu instead of gpu + num_train_epochs=1, + ), + # Set LoRA config to reduce number of trainable model parameters. + lora_config = LoraConfig( + r = katib.search.int(min=8, max=32), + lora_alpha = 8, + lora_dropout = 0.1, + bias = "none", + ), + ), + objective_metric_name = "train_loss", + objective_type = "minimize", + algorithm_name = "random", + max_trial_count = 1, + parallel_trial_count = 1, + resources_per_trial={ + "cpu": "4", + "memory": "10G", + }, +) + +# [4] Wait until Katib Experiment is complete +cl.wait_for_experiment_condition(name=name) + +# [5] Get the best hyperparameters. 
+#print(cl.get_optimal_hyperparameters(name)) \ No newline at end of file From 5a87eb01be311baea7d62f56e176b992d33250d2 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 7 Aug 2024 10:50:53 +0800 Subject: [PATCH 16/84] reuse 'get_container_spec' and 'get_pod_template_spec' from Training Operator Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 244 +++++------------- 1 file changed, 61 insertions(+), 183 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 5a9b1ea5bff..910d6b03d40 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -24,7 +24,6 @@ import grpc from kubeflow.katib import models -from kubeflow.katib import types from kubeflow.katib.api_client import ApiClient from kubeflow.katib.constants import constants import kubeflow.katib.katib_api_pb2 as katib_api_pb2 @@ -197,7 +196,7 @@ def tune( max_trial_count: int = None, parallel_trial_count: int = None, max_failed_trial_count: int = None, - resources_per_trial: Union[dict, client.V1ResourceRequirements, types.TrainerResources, None] = None, + resources_per_trial: Union[dict, client.V1ResourceRequirements, None] = None, retain_trials: bool = False, packages_to_install: List[str] = None, pip_index_url: str = "https://pypi.org/simple", @@ -294,11 +293,7 @@ def tune( """ print( - "Thank you for using the `tune` API for LLM hyperparameter optimization. " - "You can create a HyperParameter Optimization Katib Experiment using one of the following options:\n" - "1. Use external models and datasets: specify `model_provider_parameters`, `dataset_provider_parameters` and `trainer_parameters`.\n" - "2. Use custom objective function: specify `objective`, `base_image` and `parameters`.\n" - "This feature is in the alpha stage. The Kubeflow community is looking for your feedback. 
Please share your experience via the #kubeflow-katib Slack channel or the Kubeflow Katib GitHub." + "Thank you for using `tune` API for LLM hyperparameter optimization. This feature is in the alpha stage. Kubeflow community is looking for your feedback. Please share your experience via #kubeflow-katib Slack channel or the Kubeflow Katib GitHub." ) if ( @@ -306,7 +301,7 @@ def tune( or dataset_provider_parameters is not None or trainer_parameters is not None ) and ( - objective is not None or base_image is not None or parameters is not None + objective is not None or parameters is not None ): raise ValueError( "Invalid configuration for creating a Katib Experiment for hyperparameter optimization. " @@ -358,14 +353,14 @@ def tune( experiment.spec.max_failed_trial_count = max_failed_trial_count # Add resources to the Katib Experiment. - #if isinstance(resources_per_trial, dict): - # if "gpu" in resources_per_trial: - # resources_per_trial["nvidia.com/gpu"] = resources_per_trial.pop("gpu") + if isinstance(resources_per_trial, dict): + if "gpu" in resources_per_trial: + resources_per_trial["nvidia.com/gpu"] = resources_per_trial.pop("gpu") - # resources_per_trial = client.V1ResourceRequirements( - # requests=resources_per_trial, - # limits=resources_per_trial, - # ) + resources_per_trial = client.V1ResourceRequirements( + requests=resources_per_trial, + limits=resources_per_trial, + ) # Add environment variables to the Katib Experiment. env = [] @@ -414,11 +409,11 @@ def tune( utils.validate_objective_function(objective) # Extract objective function implementation. - #objective_code = inspect.getsource(objective) + objective_code = inspect.getsource(objective) # Objective function might be defined in some indented scope # (e.g. in another function). We need to dedent the function code. - #objective_code = textwrap.dedent(objective_code) + objective_code = textwrap.dedent(objective_code) # Iterate over input parameters. 
input_params = {} @@ -446,94 +441,48 @@ def tune( # def objective(parameters): # print(f'Parameters are {parameters}') # objective({'lr': '${trialParameters.lr}', 'epochs': '${trialParameters.epochs}', 'is_dist': False}) - #objective_code = f"{objective_code}\n{objective.__name__}({input_params})\n" + objective_code = f"{objective_code}\n{objective.__name__}({input_params})\n" # Prepare execute script template. - #exec_script = textwrap.dedent( - # """ - # program_path=$(mktemp -d) - # read -r -d '' SCRIPT << EOM\n - # {objective_code} - # EOM - # printf "%s" "$SCRIPT" > $program_path/ephemeral_objective.py - # python3 -u $program_path/ephemeral_objective.py""" - #) + exec_script = textwrap.dedent( + """ + program_path=$(mktemp -d) + read -r -d '' SCRIPT << EOM\n + {objective_code} + EOM + printf "%s" "$SCRIPT" > $program_path/ephemeral_objective.py + python3 -u $program_path/ephemeral_objective.py""" + ) # Add objective code to the execute script. - #exec_script = exec_script.format(objective_code=objective_code) + exec_script = exec_script.format(objective_code=objective_code) # Install Python packages if that is required. 
- #if packages_to_install is not None: - # exec_script = ( - # utils.get_script_for_python_packages( - # packages_to_install, pip_index_url - # ) - # + exec_script - # ) - - from kubeflow.training.utils import get_container_spec, get_pod_template_spec, get_pytorchjob_template - # Create app container spec - container_spec = get_container_spec( + if packages_to_install is not None: + exec_script = ( + utils.get_script_for_python_packages(packages_to_install, pip_index_url) + + exec_script + ) + + container_spec = client.V1Container( name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, image=base_image, - train_func=objective, - train_func_parameters=input_params, - packages_to_install=packages_to_install, - pip_index_url=pip_index_url, - resources=resources_per_trial.resources_per_worker if isinstance(resources_per_trial, types.TrainerResources) else resources_per_trial, - env=env, - env_from=env_from, + command=["bash", "-c"], + args=[exec_script], + env=env if env else None, + env_from=env_from if env_from else None, + resources=resources_per_trial, ) - #container_spec = client.V1Container( - # name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, - # image=base_image, - # command=["bash", "-c"], - # args=[exec_script], - # env=env if env else None, - # env_from=env_from if env_from else None, - # resources=resources_per_trial, - #) - - if isinstance(resources_per_trial, dict) or isinstance(resources_per_trial, client.V1ResourceRequirements): - pod_spec = get_pod_template_spec( - containers = [container_spec], - restart_policy="Never", - ) - #pod_spec = client.V1PodTemplateSpec( - # metadata=models.V1ObjectMeta( - # annotations={"sidecar.istio.io/inject": "false"} - # ), - # spec=client.V1PodSpec( - # restart_policy="Never", - # containers=[container_spec], - # ), - #) - # Create Trial specification. 
- trial_spec = client.V1Job( - api_version="batch/v1", - kind="Job", - spec=client.V1JobSpec( - template=pod_spec, - ), - ) - else: - worker_pod_spec = get_pod_template_spec( - containers = [container_spec], - restart_policy="Never", - ) - master_pod_spec = get_pod_template_spec( - containers = [container_spec], + pod_spec = client.V1PodTemplateSpec( + metadata=models.V1ObjectMeta( + annotations={"sidecar.istio.io/inject": "false"} + ), + spec=client.V1PodSpec( restart_policy="Never", - ) - trial_spec = get_pytorchjob_template( - name=name, - namespace=namespace, - master_pod_template_spec=master_pod_spec, - worker_pod_template_spec=worker_pod_spec, - num_workers=resources_per_trial.num_workers, - num_procs_per_worker=resources_per_trial.num_procs_per_worker, - ) + containers=[container_spec], + ), + ) # If users choose to use external models and datasets. else: @@ -548,7 +497,6 @@ def tune( from kubeflow.training.constants.constants import ( STORAGE_INITIALIZER, STORAGE_INITIALIZER_VOLUME_MOUNT, - #STORAGE_INITIALIZER_VOLUME, STORAGE_INITIALIZER_IMAGE, TRAINER_TRANSFORMER_IMAGE, ) @@ -669,6 +617,8 @@ def tune( setattr(lora_config, p_name, value) # Create init container spec. 
+ from kubeflow.training.utils.utils import get_container_spec, get_pod_template_spec + init_container_spec = get_container_spec( name=STORAGE_INITIALIZER, base_image=STORAGE_INITIALIZER_IMAGE, @@ -684,23 +634,6 @@ def tune( ], volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], ) - #init_container_spec = client.V1Container( - # name=STORAGE_INITIALIZER, - # image=STORAGE_INITIALIZER_IMAGE, - # args=[ - # "--model_provider", - # mp, - # "--model_provider_parameters", - # json.dumps( - # model_provider_parameters.__dict__, cls=utils.SetEncoder - # ), - # "--dataset_provider", - # dp, - # "--dataset_provider_parameters", - # json.dumps(dataset_provider_parameters.__dict__), - # ], - # volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], - #) lora_config = json.dumps(lora_config.__dict__, cls=utils.SetEncoder) training_args = json.dumps(training_args.to_dict()) @@ -723,33 +656,10 @@ def tune( f"'{training_args}'", ], volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], - resources=resources_per_trial.resources_per_worker if isinstance(resources_per_trial, types.TrainerResources) else resources_per_trial, + resources=resources_per_trial, env=env, env_from=env_from, ) - # Create app container spec. 
- #container_spec = client.V1Container( - # name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, - # image=TRAINER_TRANSFORMER_IMAGE, - # args=[ - # "--model_uri", - # model_provider_parameters.model_uri, - # "--transformer_type", - # model_provider_parameters.transformer_type.__name__, - # "--model_dir", - # VOLUME_PATH_MODEL, - # "--dataset_dir", - # VOLUME_PATH_DATASET, - # "--lora_config", - # f"'{lora_config}'", - # "--training_parameters", - # f"'{training_args}'", - # ], - # volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], - # env=env if env else None, - # env_from=env_from if env_from else None, - # resources=resources_per_trial, - #) storage_initializer_volume = models.V1Volume( name=STORAGE_INITIALIZER, @@ -758,53 +668,21 @@ def tune( ), ) - if isinstance(resources_per_trial, dict) or isinstance(resources_per_trial, client.V1ResourceRequirements): - pod_spec = get_pod_template_spec( - containers=[container_spec], - init_containers=[init_container_spec], - volumes=[storage_initializer_volume], - restart_policy="Never", - ) - #pod_spec = client.V1PodTemplateSpec( - # metadata=models.V1ObjectMeta( - # annotations={"sidecar.istio.io/inject": "false"} - # ), - # spec=client.V1PodSpec( - # restart_policy="Never", - # containers=[container_spec], - # init_containers=[init_container_spec], - # volumes=[STORAGE_INITIALIZER_VOLUME], - # ), - #) - # Create Trial specification. 
- trial_spec = client.V1Job( - api_version="batch/v1", - kind="Job", - spec=client.V1JobSpec( - template=pod_spec, - ), - ) - else: - # create worker pod spec - worker_pod_spec = get_pod_template_spec( - containers=[container_spec], - volumes=[storage_initializer_volume], - ) + pod_spec = get_pod_template_spec( + containers=[container_spec], + init_containers=[init_container_spec], + volumes=[storage_initializer_volume], + restart_policy="Never", + ) - # create master pod spec - master_pod_spec = get_pod_template_spec( - containers=[container_spec], - init_containers=[init_container_spec], - volumes=[storage_initializer_volume], - ) - trial_spec = get_pytorchjob_template( - name=name, - namespace=namespace, - master_pod_template_spec=master_pod_spec, - worker_pod_template_spec=worker_pod_spec, - num_workers=resources_per_trial.num_workers, - num_procs_per_worker=resources_per_trial.num_procs_per_worker, - ) + # Create Trial specification. + trial_spec = client.V1Job( + api_version="batch/v1", + kind="Job", + spec=client.V1JobSpec( + template=pod_spec, + ), + ) # Create Trial template. 
trial_template = models.V1beta1TrialTemplate( @@ -1113,7 +991,7 @@ def wait_for_experiment_condition( name: str, namespace: Optional[str] = None, expected_condition: str = constants.EXPERIMENT_CONDITION_SUCCEEDED, - timeout: int = 600, + timeout: int = 6000, polling_interval: int = 15, apiserver_timeout: int = constants.DEFAULT_TIMEOUT, ): From 71605b469d1a5150ea3f260872844c6e34442cda Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 7 Aug 2024 11:37:14 +0800 Subject: [PATCH 17/84] format with black Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 37 ++++++++++++------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 57624061ef3..e42778f2ad1 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -233,7 +233,7 @@ def tune( The function should not use any code declared outside of the function definition. Import statements must be added inside the function. base_image: Image to use when executing the objective function. - parameters: Dict of hyperparameters to optimize if you choose a custom objective function. You should use the Katib SDK to define the search space for these parameters. For example: + parameters: Dict of hyperparameters to optimize if you choose a custom objective function. You should use the Katib SDK to define the search space for these parameters. For example: ``` parameters = {"lr": katib.search.double(min=0.1, max=0.2)}` ``` @@ -293,16 +293,16 @@ def tune( """ print( - "Thank you for using `tune` API for LLM hyperparameter optimization. This feature is in the alpha stage. Kubeflow community is looking for your feedback. Please share your experience via #kubeflow-katib Slack channel or the Kubeflow Katib GitHub." + "Thank you for using `tune` API for LLM hyperparameter optimization. 
This feature is in the alpha stage. " + "Kubeflow community is looking for your feedback. Please share your experience via " + "#kubeflow-katib Slack channel or the Kubeflow Katib GitHub." ) if ( model_provider_parameters is not None or dataset_provider_parameters is not None or trainer_parameters is not None - ) and ( - objective is not None or parameters is not None - ): + ) and (objective is not None or parameters is not None): raise ValueError( "Invalid configuration for creating a Katib Experiment for hyperparameter optimization. " "You should only specify one of the following options:\n" @@ -382,15 +382,17 @@ def tune( ) # Add metrics collector to the Katib Experiment. - # Up to now, We only support parameter `kind`, of which default value is `StdOut`, to specify the kind of metrics collector. + # Up to now, We only support parameter `kind`, of which default value is `StdOut`, to specify the kind of metrics collector. experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( - collector=models.V1beta1CollectorSpec(kind=metrics_collector_config["kind"]), + collector=models.V1beta1CollectorSpec( + kind=metrics_collector_config["kind"] + ), source=models.V1beta1SourceSpec( filter=models.V1beta1FilterSpec( metrics_format=[ # For example: train_loss=0.846 - r"([\w|-]+)\s*=\s*([+-]?\d*(\.\d+)?([Ee][+-]?\d+)?)", - # For example: 'train_loss':0.846 + r"([\w|-]+)\s*=\s*([+-]?\d*(\.\d+)?([Ee][+-]?\d+)?)", + # For example: 'train_loss':0.846 r"'([\w|-]+)'\s*:\s*([+-]?\d*(\.\d+)?([Ee][+-]?\d+)?)", ] ) @@ -458,10 +460,12 @@ def tune( # Install Python packages if that is required. 
if packages_to_install is not None: exec_script = ( - utils.get_script_for_python_packages(packages_to_install, pip_index_url) + utils.get_script_for_python_packages( + packages_to_install, pip_index_url + ) + exec_script ) - + container_spec = client.V1Container( name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, image=base_image, @@ -480,7 +484,7 @@ def tune( restart_policy="Never", containers=[container_spec], ), - ) + ) # If users choose to use external models and datasets. else: @@ -615,7 +619,10 @@ def tune( setattr(lora_config, p_name, value) # Create init container spec. - from kubeflow.training.utils.utils import get_container_spec, get_pod_template_spec + from kubeflow.training.utils.utils import ( + get_container_spec, + get_pod_template_spec, + ) init_container_spec = get_container_spec( name=STORAGE_INITIALIZER, @@ -624,7 +631,9 @@ def tune( "--model_provider", mp, "--model_provider_parameters", - json.dumps(model_provider_parameters.__dict__, cls=utils.SetEncoder), + json.dumps( + model_provider_parameters.__dict__, cls=utils.SetEncoder + ), "--dataset_provider", dp, "--dataset_provider_parameters", From 35acedb95c2c6d5b1e5808c132157e151e025416 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 7 Aug 2024 11:51:16 +0800 Subject: [PATCH 18/84] fix Lint error Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 43 ++++++------- .../v1beta1/kubeflow/katib/types/__init__.py | 2 +- .../kubeflow/katib/types/trainer_resources.py | 3 +- sdk/python/v1beta1/test_llm.py | 63 ------------------- 4 files changed, 22 insertions(+), 89 deletions(-) delete mode 100644 sdk/python/v1beta1/test_llm.py diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index e42778f2ad1..e106f452706 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -496,26 +496,25 @@ def tune( raise ValueError("One of the required 
parameters is None") try: - from kubeflow.training.constants.constants import ( - STORAGE_INITIALIZER, - STORAGE_INITIALIZER_VOLUME_MOUNT, - STORAGE_INITIALIZER_IMAGE, - TRAINER_TRANSFORMER_IMAGE, - ) - from kubeflow.storage_initializer.constants import ( - VOLUME_PATH_DATASET, - VOLUME_PATH_MODEL, - ) - from kubeflow.storage_initializer.hugging_face import ( - HuggingFaceDatasetParams, - ) - from kubeflow.storage_initializer.hugging_face import ( - HuggingFaceModelParams, - ) - from kubeflow.storage_initializer.hugging_face import ( - HuggingFaceTrainerParams, - ) + from kubeflow.storage_initializer.constants import \ + VOLUME_PATH_DATASET + from kubeflow.storage_initializer.constants import \ + VOLUME_PATH_MODEL + from kubeflow.storage_initializer.hugging_face import \ + HuggingFaceDatasetParams + from kubeflow.storage_initializer.hugging_face import \ + HuggingFaceModelParams + from kubeflow.storage_initializer.hugging_face import \ + HuggingFaceTrainerParams from kubeflow.storage_initializer.s3 import S3DatasetParams + from kubeflow.training.constants.constants import \ + STORAGE_INITIALIZER + from kubeflow.training.constants.constants import \ + STORAGE_INITIALIZER_IMAGE + from kubeflow.training.constants.constants import \ + STORAGE_INITIALIZER_VOLUME_MOUNT + from kubeflow.training.constants.constants import \ + TRAINER_TRANSFORMER_IMAGE import peft import transformers except ImportError: @@ -619,10 +618,8 @@ def tune( setattr(lora_config, p_name, value) # Create init container spec. 
- from kubeflow.training.utils.utils import ( - get_container_spec, - get_pod_template_spec, - ) + from kubeflow.training.utils.utils import get_container_spec + from kubeflow.training.utils.utils import get_pod_template_spec init_container_spec = get_container_spec( name=STORAGE_INITIALIZER, diff --git a/sdk/python/v1beta1/kubeflow/katib/types/__init__.py b/sdk/python/v1beta1/kubeflow/katib/types/__init__.py index a38761478a0..a99fbea74b3 100644 --- a/sdk/python/v1beta1/kubeflow/katib/types/__init__.py +++ b/sdk/python/v1beta1/kubeflow/katib/types/__init__.py @@ -4,4 +4,4 @@ from kubeflow.katib.types.trainer_resources import TrainerResources # Import Kubernetes models. -from kubernetes.client import * \ No newline at end of file +from kubernetes.client import * diff --git a/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py b/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py index 6ae7fa5741b..54968af2081 100644 --- a/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py +++ b/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py @@ -1,9 +1,8 @@ import pprint import re -import six - from kubeflow.katib.configuration import Configuration +import six class TrainerResources(object): diff --git a/sdk/python/v1beta1/test_llm.py b/sdk/python/v1beta1/test_llm.py deleted file mode 100644 index 4bc81efb459..00000000000 --- a/sdk/python/v1beta1/test_llm.py +++ /dev/null @@ -1,63 +0,0 @@ -import kubeflow.katib as katib -from kubeflow.katib import KatibClient - -import transformers -from peft import LoraConfig - -from kubeflow.storage_initializer.hugging_face import ( - HuggingFaceModelParams, - HuggingFaceDatasetParams, - HuggingFaceTrainerParams, -) - -cl = KatibClient(namespace="kubeflow") - - -# [3] Create Katib Experiment with 12 Trials and 2 CPUs per Trial. -name = "llm-experiment" -cl.tune( - name = name, - # BERT model URI and type of Transformer to train it. 
- model_provider_parameters = HuggingFaceModelParams( - model_uri = "hf://google-bert/bert-base-cased", - transformer_type = transformers.AutoModelForSequenceClassification, - ), - # Use 3000 samples from Yelp dataset. - dataset_provider_parameters = HuggingFaceDatasetParams( - repo_id = "yelp_review_full", - split = "train[:8]", - ), - # Specify HuggingFace Trainer parameters. - trainer_parameters = HuggingFaceTrainerParams( - training_parameters = transformers.TrainingArguments( - output_dir = "test_tune_api", - save_strategy = "no", - learning_rate = katib.search.double(min=1e-05, max=5e-05), - #no_cuda=True, #if you use cpu instead of gpu - #use_cpu=True, #if you use cpu instead of gpu - num_train_epochs=1, - ), - # Set LoRA config to reduce number of trainable model parameters. - lora_config = LoraConfig( - r = katib.search.int(min=8, max=32), - lora_alpha = 8, - lora_dropout = 0.1, - bias = "none", - ), - ), - objective_metric_name = "train_loss", - objective_type = "minimize", - algorithm_name = "random", - max_trial_count = 1, - parallel_trial_count = 1, - resources_per_trial={ - "cpu": "4", - "memory": "10G", - }, -) - -# [4] Wait until Katib Experiment is complete -cl.wait_for_experiment_condition(name=name) - -# [5] Get the best hyperparameters. 
-#print(cl.get_optimal_hyperparameters(name)) \ No newline at end of file From af534b36d12a4292a79a6a1b6cee7ca79c0fd171 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 7 Aug 2024 11:53:21 +0800 Subject: [PATCH 19/84] fix Lint errors Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/types/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/types/__init__.py b/sdk/python/v1beta1/kubeflow/katib/types/__init__.py index a99fbea74b3..46661f0cebb 100644 --- a/sdk/python/v1beta1/kubeflow/katib/types/__init__.py +++ b/sdk/python/v1beta1/kubeflow/katib/types/__init__.py @@ -2,6 +2,5 @@ # Import types into type package from kubeflow.katib.types.trainer_resources import TrainerResources - # Import Kubernetes models. from kubernetes.client import * From c7f6e10125413332ebd9835f9901dd7345bb445b Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 7 Aug 2024 11:55:32 +0800 Subject: [PATCH 20/84] delete types Signed-off-by: helenxie-bit --- .../v1beta1/kubeflow/katib/types/__init__.py | 6 - .../kubeflow/katib/types/trainer_resources.py | 138 ------------------ 2 files changed, 144 deletions(-) delete mode 100644 sdk/python/v1beta1/kubeflow/katib/types/__init__.py delete mode 100644 sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py diff --git a/sdk/python/v1beta1/kubeflow/katib/types/__init__.py b/sdk/python/v1beta1/kubeflow/katib/types/__init__.py deleted file mode 100644 index 46661f0cebb..00000000000 --- a/sdk/python/v1beta1/kubeflow/katib/types/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from __future__ import absolute_import - -# Import types into type package -from kubeflow.katib.types.trainer_resources import TrainerResources -# Import Kubernetes models. 
-from kubernetes.client import * diff --git a/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py b/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py deleted file mode 100644 index 54968af2081..00000000000 --- a/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py +++ /dev/null @@ -1,138 +0,0 @@ -import pprint -import re - -from kubeflow.katib.configuration import Configuration -import six - - -class TrainerResources(object): - def __init__(self, num_workers=None, num_procs_per_worker=None, resources_per_worker=None, local_vars_configuration=None): - if local_vars_configuration is None: - local_vars_configuration = Configuration() - self.local_vars_configuration = local_vars_configuration - - self._num_workers = None - self._num_procs_per_worker = None - self._resources_per_worker = None - - if num_workers is not None: - self.num_workers = num_workers - if num_procs_per_worker is not None: - self.num_procs_per_worker = num_procs_per_worker - if resources_per_worker is not None: - self.resources_per_worker = resources_per_worker - - @property - def num_workers(self): - """Gets the number of workers of distributed training. - - Number of workers is setting number of workers. - - :return: The number of workers of distributed training. - :rtype: int - """ - return self._num_workers - - @num_workers.setter - def num_workers(self, num_workers): - """Sets the number of workers of distributed training. - - Number of workers is setting number of workers. - - :param num_workers: The number of workers of distributed training. - :type: int - """ - - self._num_workers = num_workers - - @property - def num_procs_per_worker(self): - """Gets the number of processes per worker of distributed training. - - Number of processes per worker is the setting number of processes per worker. - - :return: The number of processed per worker of distributed training. 
- :rtype: int - """ - return self._num_procs_per_worker - - @num_procs_per_worker.setter - def num_procs_per_worker(self, num_procs_per_worker): - """Sets the number of processes per worker of distributed training. - - Number of processes per worker is the setting number of processes per worker. - - :param num_procs_per_worker: The number of processes per worker of distributed training. - :type: int - """ - - self._num_procs_per_worker = num_procs_per_worker - - @property - def resources_per_worker(self): - """Gets the resources per worker of distributed training. - - Resources per worker is the setting resources per worker. - - :return: The resources per worker of distributed training. - :rtype: dict or V1ResourceRequirements - """ - return self._resources_per_worker - - @resources_per_worker.setter - def resources_per_worker(self, resources_per_worker): - """Sets the resources per worker of distributed training. - - Resources per worker is the setting resources per worker. - - :param resources_per_worker: The resources per worker of distributed training. 
- :type: dict or V1ResourceRequirements - """ - - self._resources_per_worker = resources_per_worker - - def to_dict(self): - """Returns the resources properties as a dict""" - result = {} - - for attr, _ in six.iteritems(self.__dict__): - value = getattr(self, attr) - if isinstance(value, list): - result[attr] = list(map( - lambda x: x.to_dict() if hasattr(x, "to_dict") else x, - value - )) - elif hasattr(value, "to_dict"): - result[attr] = value.to_dict() - elif isinstance(value, dict): - result[attr] = dict(map( - lambda item: (item[0], item[1].to_dict()) - if hasattr(item[1], "to_dict") else item, - value.items() - )) - else: - result[attr] = value - - return result - - def to_str(self): - """Returns the string representation of the model""" - return pprint.pformat(self.to_dict()) - - def __repr__(self): - """For `print` and `pprint`""" - return self.to_str() - - def __eq__(self, other): - """Returns true if both objects are equal""" - if not isinstance(other, TrainerResources): - return False - - return self.to_dict() == other.to_dict() - - def __ne__(self, other): - """Returns true if both objects are not equal""" - if not isinstance(other, TrainerResources): - return True - - return self.to_dict() != other.to_dict() From 9fdbdb72a99f8c436601bb900052a6bdc76a63c0 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 7 Aug 2024 12:01:07 +0800 Subject: [PATCH 21/84] fix format Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 22 +++++-------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index e106f452706..4bb48478ba8 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -995,7 +995,7 @@ def wait_for_experiment_condition( name: str, namespace: Optional[str] = None, expected_condition: str = constants.EXPERIMENT_CONDITION_SUCCEEDED, - 
timeout: int = 6000, + timeout: int = 600, polling_interval: int = 15, apiserver_timeout: int = constants.DEFAULT_TIMEOUT, ): @@ -1035,9 +1035,7 @@ def wait_for_experiment_condition( ) ): utils.print_experiment_status(experiment) - logger.debug( - f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" - ) + logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") return experiment # Raise exception if Experiment is Failed. @@ -1057,9 +1055,7 @@ def wait_for_experiment_condition( ) ): utils.print_experiment_status(experiment) - logger.debug( - f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" - ) + logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") return experiment # Check if Experiment reaches Running condition. @@ -1070,9 +1066,7 @@ def wait_for_experiment_condition( ) ): utils.print_experiment_status(experiment) - logger.debug( - f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" - ) + logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") return experiment # Check if Experiment reaches Restarting condition. @@ -1083,9 +1077,7 @@ def wait_for_experiment_condition( ) ): utils.print_experiment_status(experiment) - logger.debug( - f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" - ) + logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") return experiment # Check if Experiment reaches Succeeded condition. @@ -1096,9 +1088,7 @@ def wait_for_experiment_condition( ) ): utils.print_experiment_status(experiment) - logger.debug( - f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" - ) + logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") return experiment # Otherwise, print the current Experiment results and sleep for the pooling interval. 
From ddd515319bd747ea7fad4e8251c132471ab2169e Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 7 Aug 2024 16:55:06 +0800 Subject: [PATCH 22/84] update format Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 4bb48478ba8..3a1dfc8a9d4 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -200,7 +200,7 @@ def tune( retain_trials: bool = False, packages_to_install: List[str] = None, pip_index_url: str = "https://pypi.org/simple", - metrics_collector_config: Dict[str, Any] = {"kind": "StdOut"}, + metrics_collector_config: Dict[str, Any] = {"kind": "StdOut"}, ): """Create HyperParameter Tuning Katib Experiment using one of the following options: 1. External models and datasets From b31e820a825ca1fa06eae7488c04b451df430a05 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 7 Aug 2024 16:56:47 +0800 Subject: [PATCH 23/84] update format Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 3a1dfc8a9d4..4bb48478ba8 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -200,7 +200,7 @@ def tune( retain_trials: bool = False, packages_to_install: List[str] = None, pip_index_url: str = "https://pypi.org/simple", - metrics_collector_config: Dict[str, Any] = {"kind": "StdOut"}, + metrics_collector_config: Dict[str, Any] = {"kind": "StdOut"}, ): """Create HyperParameter Tuning Katib Experiment using one of the following options: 1. 
External models and datasets From dad3831be2ec36683f3c980f2d747cfc1481d380 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 7 Aug 2024 20:03:26 +0800 Subject: [PATCH 24/84] fix e2e test error Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 4bb48478ba8..453485623cb 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -384,9 +384,7 @@ def tune( # Add metrics collector to the Katib Experiment. # Up to now, We only support parameter `kind`, of which default value is `StdOut`, to specify the kind of metrics collector. experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( - collector=models.V1beta1CollectorSpec( - kind=metrics_collector_config["kind"] - ), + collector=models.V1beta1CollectorSpec(kind=metrics_collector_config["kind"]), source=models.V1beta1SourceSpec( filter=models.V1beta1FilterSpec( metrics_format=[ From 1afe56def822aee6145efe9fc271900fd8906a87 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 8 Aug 2024 08:27:35 +0800 Subject: [PATCH 25/84] add TODO Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 453485623cb..6177efa093c 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -659,8 +659,7 @@ def tune( ], volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], resources=resources_per_trial, - env=env, - env_from=env_from, + # TODO (helenxie-bit): Add `env` and `env_from` in the future ) storage_initializer_volume = models.V1Volume( From 
ad7bce8c61b2463aab6810b4a0aa5c8abdb11743 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 8 Aug 2024 08:57:00 +0800 Subject: [PATCH 26/84] format with max line length Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 6177efa093c..14920253b3f 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -382,7 +382,8 @@ def tune( ) # Add metrics collector to the Katib Experiment. - # Up to now, We only support parameter `kind`, of which default value is `StdOut`, to specify the kind of metrics collector. + # Up to now, We only support parameter `kind`, of which default value is + # `StdOut`, to specify the kind of metrics collector. experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( collector=models.V1beta1CollectorSpec(kind=metrics_collector_config["kind"]), source=models.V1beta1SourceSpec( From 7e58c9470e1dd0ed963683cde357e9f00ac855c6 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 8 Aug 2024 09:59:53 +0800 Subject: [PATCH 27/84] format docstring Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 138 +++++++++++------- 1 file changed, 84 insertions(+), 54 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 14920253b3f..cffc1cdac20 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -202,21 +202,45 @@ def tune( pip_index_url: str = "https://pypi.org/simple", metrics_collector_config: Dict[str, Any] = {"kind": "StdOut"}, ): - """Create HyperParameter Tuning Katib Experiment using one of the following options: + """ + Create HyperParameter Tuning Katib 
Experiment using one of the following + options: + 1. External models and datasets - Parameters: `model_provider_parameters` + `dataset_provider_parameters` + `trainer_parameters`. - Usage: Specify both `model_provider_parameters` and `dataset_provider_parameters` to download models and datasets from external platforms (currently supports HuggingFace and Amazon S3) using the Storage Initializer. The `trainer_parameters` should be of type `HuggingFaceTrainerParams` to set the hyperparameters search space. This API will automatically define the "Trainer" class in HuggingFace with the provided parameters and utilize `Trainer.train()` from HuggingFace to obtain the metrics for optimizing hyperparameters. + Parameters: `model_provider_parameters` + `dataset_provider_parameters` + + `trainer_parameters`. + Usage: Specify both `model_provider_parameters` and + `dataset_provider_parameters` to download models and datasets from external + platforms (currently supports HuggingFace and Amazon S3) using the Storage + Initializer. The `trainer_parameters` should be of type + `HuggingFaceTrainerParams` to set the hyperparameters search space. This API + will automatically define the "Trainer" class in HuggingFace with the provided + parameters and utilize `Trainer.train()` from HuggingFace to obtain the metrics + for optimizing hyperparameters. + 2. Custom objective function Parameters: `objective` + `base_image` + `parameters`. - Usage: Specify the `objective` parameter to define your own objective function. The `base_image` parameter will be used to execute the objective function. The `parameters` should be a dictionary to define the search space for these parameters. + Usage: Specify the `objective` parameter to define your own objective function. + The `base_image` parameter will be used to execute the objective function. The + `parameters` should be a dictionary to define the search space for these + parameters. Args: name: Name for the Experiment. 
- model_provider_parameters: Parameters for the model provider in the Storage Initializer. - For example, HuggingFace model name and Transformer type for that model, like: AutoModelForSequenceClassification. This argument must be the type of `kubeflow.storage_initializer.hugging_face.HuggingFaceModelParams`. - dataset_provider_parameters: Parameters for the dataset provider in the Storage Initializer. - For example, name of the HuggingFace dataset or AWS S3 configuration. This argument must be the type of `kubeflow.storage_initializer.hugging_face.HuggingFaceDatasetParams` or `kubeflow.storage_initializer.s3.S3DatasetParams` - trainer_parameters: Parameters for configuring the training process, including settings for the hyperparameters search space. It should be of type `HuggingFaceTrainerParams`. You should use the Katib SDK to define the search space for these parameters.For example: + model_provider_parameters: Parameters for the model provider in the Storage + Initializer. + For example, HuggingFace model name and Transformer type for that model, + like: AutoModelForSequenceClassification. This argument must be the type + of `kubeflow.storage_initializer.hugging_face.HuggingFaceModelParams`. + dataset_provider_parameters: Parameters for the dataset provider in the + Storage Initializer. + For example, name of the HuggingFace dataset or AWS S3 configuration. + This argument must be the type of `kubeflow.storage_initializer.hugging_face.HuggingFaceDatasetParams` + or `kubeflow.storage_initializer.s3.S3DatasetParams` + trainer_parameters: Parameters for configuring the training process, + including settings for the hyperparameters search space. It should be of + type `HuggingFaceTrainerParams`. You should use the Katib SDK to define + the search space for these parameters. 
For example: ``` trainer_parameters = HuggingFaceTrainerParams( training_parameters = transformers.TrainingArguments( @@ -224,19 +248,22 @@ def tune( ), ), ``` - Also, you can use these parameters to define input for training the models. - storage_config: Configuration for Storage Initializer PVC to download pre-trained model and dataset. - You can configure PVC size and storage class name in this argument. - objective: Objective function that Katib uses to train the model. - This function must be Callable and it must have only one dict argument. - Katib uses this argument to send HyperParameters to the function. - The function should not use any code declared outside of the function - definition. Import statements must be added inside the function. + Also, you can use these parameters to define input for training the + models. + storage_config: Configuration for Storage Initializer PVC to download + pre-trained model and dataset. You can configure PVC size and storage + class name in this argument. + objective: Objective function that Katib uses to train the model. This + function must be Callable and it must have only one dict argument. Katib + uses this argument to send HyperParameters to the function. The function + should not use any code declared outside of the function definition. + Import statements must be added inside the function. base_image: Image to use when executing the objective function. - parameters: Dict of hyperparameters to optimize if you choose a custom objective function. You should use the Katib SDK to define the search space for these parameters. For example: - ``` - parameters = {"lr": katib.search.double(min=0.1, max=0.2)}` - ``` + parameters: Dict of HyperParameters to tune your Experiment if you choose a custom + objective function. You should use Katib SDK to define the search space for these + parameters. 
For example: + `parameters = {"lr": katib.search.double(min=0.1, max=0.2)}` + Also, you can use these parameters to define input for your objective function. namespace: Namespace for the Experiment. env_per_trial: Environment variable(s) to be attached to each trial container. @@ -259,24 +286,24 @@ def tune( values check this doc: https://www.kubeflow.org/docs/components/katib/experiment/#configuration-spec. parallel_trial_count: Number of Trials that Experiment runs in parallel. max_failed_trial_count: Maximum number of Trials allowed to fail. - resources_per_trial: A parameter that lets you specify how much - resources each trial container should have. You can either specify a - kubernetes.client.V1ResourceRequirements object (documented here: - https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1ResourceRequirements.md) - or a dictionary that includes one or more of the following keys: - `cpu`, `memory`, or `gpu` (other keys will be ignored). Appropriate - values for these keys are documented here: - https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/. - For example: + resources_per_trial: A parameter that lets you specify how much resources + each trial container should have. You can either specify a + kubernetes.client.V1ResourceRequirements object (documented here: + https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1ResourceRequirements.md) + or a dictionary that includes one or more of the following keys: `cpu`, + `memory`, or `gpu` (other keys will be ignored). Appropriate values + for these keys are documented here: + https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/. + For example: { "cpu": "1", "gpu": "1", "memory": "2Gi", } - Please note, `gpu` specifies a resource request with a key of - `nvidia.com/gpu`, i.e. an NVIDIA GPU. If you need a different type - of GPU, pass in a V1ResourceRequirement instance instead, since it's - more flexible. 
This parameter is optional and defaults to None. + Please note, `gpu` specifies a resource request with a key of + `nvidia.com/gpu`, i.e. an NVIDIA GPU. If you need a different type of + GPU, pass in a V1ResourceRequirement instance instead, since it's more + flexible. This parameter is optional and defaults to None. retain_trials: Whether Trials' resources (e.g. pods) are deleted after Succeeded state. packages_to_install: List of Python packages to install in addition to the base image packages. These packages are installed before @@ -293,9 +320,10 @@ def tune( """ print( - "Thank you for using `tune` API for LLM hyperparameter optimization. This feature is in the alpha stage. " - "Kubeflow community is looking for your feedback. Please share your experience via " - "#kubeflow-katib Slack channel or the Kubeflow Katib GitHub." + "Thank you for using `tune` API for LLM hyperparameter optimization. This feature " + "is in the alpha stage. Kubeflow community is looking for your feedback. Please " + "share your experience via #kubeflow-katib Slack channel or the Kubeflow Katib " + "GitHub." ) if ( @@ -304,10 +332,12 @@ def tune( or trainer_parameters is not None ) and (objective is not None or parameters is not None): raise ValueError( - "Invalid configuration for creating a Katib Experiment for hyperparameter optimization. " - "You should only specify one of the following options:\n" - "1. Use external models and datasets: specify `model_provider_parameters`, `dataset_provider_parameters` and `trainer_parameters`;\n" - "2. Use custom objective function: specify `objective`, `base_image` and `parameters`." + "Invalid configuration for creating a Katib Experiment for hyperparameter " + "optimization. You should only specify one of the following options:\n" + "1. Use external models and datasets: specify `model_provider_parameters`, " + "`dataset_provider_parameters` and `trainer_parameters`;\n" + "2. 
Use custom objective function: specify `objective`, `base_image` and " + "`parameters`." ) if not name: @@ -801,8 +831,8 @@ def get_experiment_conditions( experiment: models.V1beta1Experiment = None, timeout: int = constants.DEFAULT_TIMEOUT, ): - """Get the Experiment conditions. Experiment is in the condition when - `status` is True for the appropriate condition `type`. + """Get the Experiment conditions. Experiment is in the condition when `status` + is True for the appropriate condition `type`. Args: name: Name for the Experiment. @@ -997,8 +1027,8 @@ def wait_for_experiment_condition( polling_interval: int = 15, apiserver_timeout: int = constants.DEFAULT_TIMEOUT, ): - """Wait until Experiment reaches specific condition. By default it waits - for the Succeeded condition. + """Wait until Experiment reaches specific condition. By default it waits for the + Succeeded condition. Args: name: Name for the Experiment. @@ -1109,9 +1139,9 @@ def edit_experiment_budget( max_failed_trial_count: int = None, timeout: int = constants.DEFAULT_TIMEOUT, ): - """Update Experiment budget for the running Trials. You can modify Trial - budget to resume Succeeded Experiments with `LongRunning` and `FromVolume` - resume policies. + """Update Experiment budget for the running Trials. You can modify Trial budget + to resume Succeeded Experiments with `LongRunning` and `FromVolume` resume + policies. Learn about resuming Experiments here: https://www.kubeflow.org/docs/components/katib/resume-experiment/ @@ -1350,8 +1380,8 @@ def list_trials( namespace: Optional[str] = None, timeout: int = constants.DEFAULT_TIMEOUT, ): - """List of all Trials in namespace. If Experiment name is set, - it returns all Trials belong to the Experiment. + """List of all Trials in namespace. If Experiment name is set, it returns all + Trials belong to the Experiment. Args: experiment_name: Optional name for the Experiment. 
@@ -1410,8 +1440,8 @@ def get_success_trial_details( namespace: Optional[str] = None, timeout: int = constants.DEFAULT_TIMEOUT, ): - """Get the Succeeded Trial details. If Experiment name is set, - it returns Succeeded Trials details belong to the Experiment. + """Get the Succeeded Trial details. If Experiment name is set, it returns + Succeeded Trials details belong to the Experiment. Args: experiment_name: Optional name for the Experiment. @@ -1519,8 +1549,8 @@ def get_trial_metrics( db_manager_address: str = constants.DEFAULT_DB_MANAGER_ADDRESS, timeout: str = constants.DEFAULT_TIMEOUT, ): - """Get the Trial Metric Results from the Katib DB. - Katib DB Manager service should be accessible while calling this API. + """Get the Trial Metric Results from the Katib DB. Katib DB Manager service + should be accessible while calling this API. If you run this API in-cluster (e.g. from the Kubeflow Notebook) you can use the default Katib DB Manager address: `katib-db-manager.kubeflow:6789`. From 61dc8ca1d9e8bec88c3ebc210c0e9b6b587f563a Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 8 Aug 2024 15:25:55 +0800 Subject: [PATCH 28/84] update format Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index cffc1cdac20..27307e305c8 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -214,8 +214,8 @@ def tune( platforms (currently supports HuggingFace and Amazon S3) using the Storage Initializer. The `trainer_parameters` should be of type `HuggingFaceTrainerParams` to set the hyperparameters search space. 
This API - will automatically define the "Trainer" class in HuggingFace with the provided - parameters and utilize `Trainer.train()` from HuggingFace to obtain the metrics + will automatically define the "Trainer" in HuggingFace with the provided + parameters and utilize `Trainer.train()` from HuggingFace to obtain the metrics for optimizing hyperparameters. 2. Custom objective function From ba0d7d173dd943236c8865c9047fc215c7b3e2f9 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 9 Aug 2024 07:23:34 +0800 Subject: [PATCH 29/84] add helper functions Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 106 ++++--------- .../v1beta1/kubeflow/katib/utils/utils.py | 142 +++++++++++++++++- 2 files changed, 168 insertions(+), 80 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 27307e305c8..1eebd3fe47f 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -214,8 +214,8 @@ def tune( platforms (currently supports HuggingFace and Amazon S3) using the Storage Initializer. The `trainer_parameters` should be of type `HuggingFaceTrainerParams` to set the hyperparameters search space. This API - will automatically define the "Trainer" in HuggingFace with the provided - parameters and utilize `Trainer.train()` from HuggingFace to obtain the metrics + will automatically define the "Trainer" in HuggingFace with the provided + parameters and utilize `Trainer.train()` from HuggingFace to obtain the metrics for optimizing hyperparameters. 2. Custom objective function @@ -259,9 +259,9 @@ class name in this argument. should not use any code declared outside of the function definition. Import statements must be added inside the function. base_image: Image to use when executing the objective function. 
- parameters: Dict of HyperParameters to tune your Experiment if you choose a custom - objective function. You should use Katib SDK to define the search space for these - parameters. For example: + parameters: Dict of HyperParameters to tune your Experiment if you choose a custom + objective function. You should use Katib SDK to define the search space for these + parameters. For example: `parameters = {"lr": katib.search.double(min=0.1, max=0.2)}` Also, you can use these parameters to define input for your objective function. @@ -286,12 +286,12 @@ class name in this argument. values check this doc: https://www.kubeflow.org/docs/components/katib/experiment/#configuration-spec. parallel_trial_count: Number of Trials that Experiment runs in parallel. max_failed_trial_count: Maximum number of Trials allowed to fail. - resources_per_trial: A parameter that lets you specify how much resources + resources_per_trial: A parameter that lets you specify how much resources each trial container should have. You can either specify a kubernetes.client.V1ResourceRequirements object (documented here: https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1ResourceRequirements.md) - or a dictionary that includes one or more of the following keys: `cpu`, - `memory`, or `gpu` (other keys will be ignored). Appropriate values + or a dictionary that includes one or more of the following keys: `cpu`, + `memory`, or `gpu` (other keys will be ignored). Appropriate values for these keys are documented here: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/. For example: @@ -301,8 +301,8 @@ class name in this argument. "memory": "2Gi", } Please note, `gpu` specifies a resource request with a key of - `nvidia.com/gpu`, i.e. an NVIDIA GPU. If you need a different type of - GPU, pass in a V1ResourceRequirement instance instead, since it's more + `nvidia.com/gpu`, i.e. an NVIDIA GPU. 
If you need a different type of + GPU, pass in a V1ResourceRequirement instance instead, since it's more flexible. This parameter is optional and defaults to None. retain_trials: Whether Trials' resources (e.g. pods) are deleted after Succeeded state. packages_to_install: List of Python packages to install in addition @@ -382,16 +382,6 @@ class name in this argument. if max_failed_trial_count is not None: experiment.spec.max_failed_trial_count = max_failed_trial_count - # Add resources to the Katib Experiment. - if isinstance(resources_per_trial, dict): - if "gpu" in resources_per_trial: - resources_per_trial["nvidia.com/gpu"] = resources_per_trial.pop("gpu") - - resources_per_trial = client.V1ResourceRequirements( - requests=resources_per_trial, - limits=resources_per_trial, - ) - # Add environment variables to the Katib Experiment. env = [] env_from = [] @@ -415,7 +405,9 @@ class name in this argument. # Up to now, We only support parameter `kind`, of which default value is # `StdOut`, to specify the kind of metrics collector. experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( - collector=models.V1beta1CollectorSpec(kind=metrics_collector_config["kind"]), + collector=models.V1beta1CollectorSpec( + kind=metrics_collector_config["kind"] + ), source=models.V1beta1SourceSpec( filter=models.V1beta1FilterSpec( metrics_format=[ @@ -437,13 +429,6 @@ class name in this argument. # Validate objective function. utils.validate_objective_function(objective) - # Extract objective function implementation. - objective_code = inspect.getsource(objective) - - # Objective function might be defined in some indented scope - # (e.g. in another function). We need to dedent the function code. - objective_code = textwrap.dedent(objective_code) - # Iterate over input parameters. input_params = {} experiment_params = [] @@ -466,53 +451,21 @@ class name in this argument. # Otherwise, add value to the function input. 
input_params[p_name] = p_value - # Wrap objective function to execute it from the file. For example - # def objective(parameters): - # print(f'Parameters are {parameters}') - # objective({'lr': '${trialParameters.lr}', 'epochs': '${trialParameters.epochs}', 'is_dist': False}) - objective_code = f"{objective_code}\n{objective.__name__}({input_params})\n" - - # Prepare execute script template. - exec_script = textwrap.dedent( - """ - program_path=$(mktemp -d) - read -r -d '' SCRIPT << EOM\n - {objective_code} - EOM - printf "%s" "$SCRIPT" > $program_path/ephemeral_objective.py - python3 -u $program_path/ephemeral_objective.py""" - ) - - # Add objective code to the execute script. - exec_script = exec_script.format(objective_code=objective_code) - - # Install Python packages if that is required. - if packages_to_install is not None: - exec_script = ( - utils.get_script_for_python_packages( - packages_to_install, pip_index_url - ) - + exec_script - ) - - container_spec = client.V1Container( + container_spec = utils.get_container_spec( name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, - image=base_image, - command=["bash", "-c"], - args=[exec_script], - env=env if env else None, - env_from=env_from if env_from else None, + base_image=base_image, + train_func=objective, + train_func_parameters=input_params, + packages_to_install=packages_to_install, + pip_index_url=pip_index_url, resources=resources_per_trial, + env=env, + env_from=env_from, ) - pod_spec = client.V1PodTemplateSpec( - metadata=models.V1ObjectMeta( - annotations={"sidecar.istio.io/inject": "false"} - ), - spec=client.V1PodSpec( - restart_policy="Never", - containers=[container_spec], - ), + pod_spec = utils.get_pod_template_spec( + containers=[container_spec], + restart_policy="Never", ) # If users choose to use external models and datasets. @@ -646,11 +599,7 @@ class name in this argument. value = type(old_attr)(p_value) setattr(lora_config, p_name, value) - # Create init container spec. 
- from kubeflow.training.utils.utils import get_container_spec - from kubeflow.training.utils.utils import get_pod_template_spec - - init_container_spec = get_container_spec( + init_container_spec = utils.get_container_spec( name=STORAGE_INITIALIZER, base_image=STORAGE_INITIALIZER_IMAGE, args=[ @@ -671,7 +620,7 @@ class name in this argument. lora_config = json.dumps(lora_config.__dict__, cls=utils.SetEncoder) training_args = json.dumps(training_args.to_dict()) - container_spec = get_container_spec( + container_spec = utils.get_container_spec( name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, base_image=TRAINER_TRANSFORMER_IMAGE, args=[ @@ -690,7 +639,6 @@ class name in this argument. ], volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], resources=resources_per_trial, - # TODO (helenxie-bit): Add `env` and `env_from` in the future ) storage_initializer_volume = models.V1Volume( @@ -700,7 +648,7 @@ class name in this argument. ), ) - pod_spec = get_pod_template_spec( + pod_spec = utils.get_pod_template_spec( containers=[container_spec], init_containers=[init_container_spec], volumes=[storage_initializer_volume], diff --git a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py index 8c90a001d96..d3e8bc6c0e6 100644 --- a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py +++ b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py @@ -16,7 +16,7 @@ import json import os import textwrap -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union from kubeflow.katib import models from kubeflow.katib.constants import constants @@ -131,6 +131,146 @@ def __init__(self, obj): self.data = json.dumps(obj) +def get_command_using_train_func( + train_func: Optional[Callable], + train_func_parameters: Optional[Dict[str, Any]] = None, + packages_to_install: Optional[List[str]] = None, + pip_index_url: str = "https://pypi.org/simple", +) -> Tuple[List[str], List[str]]: + """ 
+ Get container args and command from the given training function and parameters. + """ + # Check if function is callable. + if not callable(train_func): + raise ValueError( + f"Training function must be callable, got function type: {type(train_func)}" + ) + + # Extract function implementation. + func_code = inspect.getsource(train_func) + + # Function might be defined in some indented scope (e.g. in another function). + # We need to dedent the function code. + func_code = textwrap.dedent(func_code) + + # Wrap function code to execute it from the file. For example: + # def train(parameters): + # print('Start Training...') + # train({'lr': 0.01}) + if train_func_parameters is None: + func_code = f"{func_code}\n{train_func.__name__}()\n" + else: + func_code = f"{func_code}\n{train_func.__name__}({train_func_parameters})\n" + + # Prepare execute script template. + exec_script = textwrap.dedent( + """ + program_path=$(mktemp -d) + read -r -d '' SCRIPT << EOM\n + {func_code} + EOM + printf "%s" \"$SCRIPT\" > \"$program_path/ephemeral_script.py\" + python3 -u \"$program_path/ephemeral_script.py\"""" + ) + + # Add function code to the execute script. + exec_script = exec_script.format(func_code=func_code) + + # Install Python packages if that is required. + if packages_to_install is not None: + exec_script = ( + get_script_for_python_packages(packages_to_install, pip_index_url) + + exec_script + ) + + # Return container command and args to execute training function. 
+ return ["bash", "-c"], [exec_script] + + +def get_container_spec( + name: str, + base_image: str, + train_func: Optional[Callable] = None, + train_func_parameters: Optional[Dict[str, Any]] = None, + packages_to_install: Optional[List[str]] = None, + pip_index_url: str = "https://pypi.org/simple", + args: Optional[List[str]] = None, + resources: Union[dict, models.V1ResourceRequirements, None] = None, + volume_mounts: Optional[List[models.V1VolumeMount]] = None, + env: Optional[List[models.V1EnvVar]] = None, + env_from: Optional[List[models.V1EnvFromSource]] = None, +) -> models.V1Container: + """ + Get container spec for the given parameters. + """ + + if name is None or base_image is None: + raise ValueError("Container name or base image cannot be none") + + # Create initial container spec. + container_spec = models.V1Container( + name=name, image=base_image, args=args, volume_mounts=volume_mounts + ) + + # If training function is set, override container command and args to execute the function. + if train_func is not None: + container_spec.command, container_spec.args = get_command_using_train_func( + train_func=train_func, + train_func_parameters=train_func_parameters, + packages_to_install=packages_to_install, + pip_index_url=pip_index_url, + ) + + # Convert dict to the Kubernetes container resources if that is required. + if isinstance(resources, dict): + # Convert all keys in resources to lowercase. + resources = {k.lower(): v for k, v in resources.items()} + if "gpu" in resources: + resources["nvidia.com/gpu"] = resources.pop("gpu") + + resources = models.V1ResourceRequirements( + requests=resources, + limits=resources, + ) + + # Add resources to the container spec. + container_spec.resources = resources + + # Add environment variables to the container spec. 
+ if env: + container_spec.env = env + if env_from: + container_spec.env_from = env_from + + + return container_spec + + +def get_pod_template_spec( + containers: List[models.V1Container], + init_containers: Optional[List[models.V1Container]] = None, + volumes: Optional[List[models.V1Volume]] = None, + restart_policy: Optional[str] = None, +) -> models.V1PodTemplateSpec: + """ + Get Pod template spec for the given parameters. + """ + + # Create Pod template spec. If the value is None, Pod doesn't have that parameter + pod_template_spec = models.V1PodTemplateSpec( + metadata=models.V1ObjectMeta( + annotations={"sidecar.istio.io/inject": "false"} + ), + spec=models.V1PodSpec( + init_containers=init_containers, + containers=containers, + volumes=volumes, + restart_policy=restart_policy, + ), + ) + + return pod_template_spec + def get_pvc_spec( pvc_name: str, namespace: str, From 2a1b0088b0f866f4371f1cea9383795f6ddeb6b6 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 9 Aug 2024 07:37:58 +0800 Subject: [PATCH 30/84] update format Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 1eebd3fe47f..36de65cf7d9 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -779,8 +779,8 @@ def get_experiment_conditions( experiment: models.V1beta1Experiment = None, timeout: int = constants.DEFAULT_TIMEOUT, ): - """Get the Experiment conditions. Experiment is in the condition when `status` - is True for the appropriate condition `type`. + """Get the Experiment conditions. Experiment is in the condition when + `status` is True for the appropriate condition `type`. Args: name: Name for the Experiment. 
@@ -975,8 +975,8 @@ def wait_for_experiment_condition( polling_interval: int = 15, apiserver_timeout: int = constants.DEFAULT_TIMEOUT, ): - """Wait until Experiment reaches specific condition. By default it waits for the - Succeeded condition. + """Wait until Experiment reaches specific condition. By default it waits + for the Succeeded condition. Args: name: Name for the Experiment. @@ -1087,9 +1087,9 @@ def edit_experiment_budget( max_failed_trial_count: int = None, timeout: int = constants.DEFAULT_TIMEOUT, ): - """Update Experiment budget for the running Trials. You can modify Trial budget - to resume Succeeded Experiments with `LongRunning` and `FromVolume` resume - policies. + """Update Experiment budget for the running Trials. You can modify Trial + budget to resume Succeeded Experiments with `LongRunning` and `FromVolume` + resume policies. Learn about resuming Experiments here: https://www.kubeflow.org/docs/components/katib/resume-experiment/ @@ -1328,8 +1328,8 @@ def list_trials( namespace: Optional[str] = None, timeout: int = constants.DEFAULT_TIMEOUT, ): - """List of all Trials in namespace. If Experiment name is set, it returns all - Trials belong to the Experiment. + """List of all Trials in namespace. If Experiment name is set, + it returns all Trials belong to the Experiment. Args: experiment_name: Optional name for the Experiment. @@ -1388,8 +1388,8 @@ def get_success_trial_details( namespace: Optional[str] = None, timeout: int = constants.DEFAULT_TIMEOUT, ): - """Get the Succeeded Trial details. If Experiment name is set, it returns - Succeeded Trials details belong to the Experiment. + """Get the Succeeded Trial details. If Experiment name is set, + it returns Succeeded Trials details belong to the Experiment. Args: experiment_name: Optional name for the Experiment. 
@@ -1497,8 +1497,8 @@ def get_trial_metrics( db_manager_address: str = constants.DEFAULT_DB_MANAGER_ADDRESS, timeout: str = constants.DEFAULT_TIMEOUT, ): - """Get the Trial Metric Results from the Katib DB. Katib DB Manager service - should be accessible while calling this API. + """Get the Trial Metric Results from the Katib DB. + Katib DB Manager service should be accessible while calling this API. If you run this API in-cluster (e.g. from the Kubeflow Notebook) you can use the default Katib DB Manager address: `katib-db-manager.kubeflow:6789`. From b3685214d76071314ce6594950b9d8a819c9b0e9 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 9 Aug 2024 07:56:33 +0800 Subject: [PATCH 31/84] update format Signed-off-by: helenxie-bit --- .../v1beta1/kubeflow/katib/api/katib_client.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 36de65cf7d9..a8282717fe3 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -779,7 +779,7 @@ def get_experiment_conditions( experiment: models.V1beta1Experiment = None, timeout: int = constants.DEFAULT_TIMEOUT, ): - """Get the Experiment conditions. Experiment is in the condition when + """Get the Experiment conditions. Experiment is in the condition when `status` is True for the appropriate condition `type`. Args: @@ -975,7 +975,7 @@ def wait_for_experiment_condition( polling_interval: int = 15, apiserver_timeout: int = constants.DEFAULT_TIMEOUT, ): - """Wait until Experiment reaches specific condition. By default it waits + """Wait until Experiment reaches specific condition. By default it waits for the Succeeded condition. Args: @@ -1087,8 +1087,8 @@ def edit_experiment_budget( max_failed_trial_count: int = None, timeout: int = constants.DEFAULT_TIMEOUT, ): - """Update Experiment budget for the running Trials. 
You can modify Trial - budget to resume Succeeded Experiments with `LongRunning` and `FromVolume` + """Update Experiment budget for the running Trials. You can modify Trial + budget to resume Succeeded Experiments with `LongRunning` and `FromVolume` resume policies. Learn about resuming Experiments here: https://www.kubeflow.org/docs/components/katib/resume-experiment/ @@ -1328,7 +1328,7 @@ def list_trials( namespace: Optional[str] = None, timeout: int = constants.DEFAULT_TIMEOUT, ): - """List of all Trials in namespace. If Experiment name is set, + """List of all Trials in namespace. If Experiment name is set, it returns all Trials belong to the Experiment. Args: @@ -1388,7 +1388,7 @@ def get_success_trial_details( namespace: Optional[str] = None, timeout: int = constants.DEFAULT_TIMEOUT, ): - """Get the Succeeded Trial details. If Experiment name is set, + """Get the Succeeded Trial details. If Experiment name is set, it returns Succeeded Trials details belong to the Experiment. Args: @@ -1497,7 +1497,7 @@ def get_trial_metrics( db_manager_address: str = constants.DEFAULT_DB_MANAGER_ADDRESS, timeout: str = constants.DEFAULT_TIMEOUT, ): - """Get the Trial Metric Results from the Katib DB. + """Get the Trial Metric Results from the Katib DB. Katib DB Manager service should be accessible while calling this API. If you run this API in-cluster (e.g. 
from the Kubeflow Notebook) you can From 3ccbdf90af79d905cb22a79701759e0206d9251b Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 12 Aug 2024 15:00:36 +0800 Subject: [PATCH 32/84] run test again Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index a8282717fe3..d00df9c26d5 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -211,7 +211,7 @@ def tune( `trainer_parameters`. Usage: Specify both `model_provider_parameters` and `dataset_provider_parameters` to download models and datasets from external - platforms (currently supports HuggingFace and Amazon S3) using the Storage + platforms (currently support HuggingFace and Amazon S3) using the Storage Initializer. The `trainer_parameters` should be of type `HuggingFaceTrainerParams` to set the hyperparameters search space. This API will automatically define the "Trainer" in HuggingFace with the provided From 64e34e092e983f989c5ec006c0489b39f096a221 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 12 Aug 2024 18:32:03 +0800 Subject: [PATCH 33/84] run test again Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index d00df9c26d5..a8282717fe3 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -211,7 +211,7 @@ def tune( `trainer_parameters`. 
Usage: Specify both `model_provider_parameters` and `dataset_provider_parameters` to download models and datasets from external - platforms (currently support HuggingFace and Amazon S3) using the Storage + platforms (currently supports HuggingFace and Amazon S3) using the Storage Initializer. The `trainer_parameters` should be of type `HuggingFaceTrainerParams` to set the hyperparameters search space. This API will automatically define the "Trainer" in HuggingFace with the provided From dde724c6f111e37a574acbbc9b3611732ecbe9e9 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 12 Aug 2024 18:35:43 +0800 Subject: [PATCH 34/84] run test again Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index a8282717fe3..d00df9c26d5 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -211,7 +211,7 @@ def tune( `trainer_parameters`. Usage: Specify both `model_provider_parameters` and `dataset_provider_parameters` to download models and datasets from external - platforms (currently supports HuggingFace and Amazon S3) using the Storage + platforms (currently support HuggingFace and Amazon S3) using the Storage Initializer. The `trainer_parameters` should be of type `HuggingFaceTrainerParams` to set the hyperparameters search space. 
This API will automatically define the "Trainer" in HuggingFace with the provided From 1cccd4a54330a16b1837cdc87105b403041b1b18 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 14 Aug 2024 20:01:09 +0800 Subject: [PATCH 35/84] fix dict substitution in training_parameters Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index d00df9c26d5..dce95878400 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy import inspect import json import logging @@ -573,7 +574,11 @@ class name in this argument. elif p_value is not None: old_attr = getattr(training_args, p_name, None) if old_attr is not None: - value = type(old_attr)(p_value) + if isinstance(p_value, dict): + # Update the existing dictionary without nesting + value = copy.deepcopy(p_value) + else: + value = type(old_attr)(p_value) setattr(training_args, p_name, value) lora_config = trainer_parameters.lora_config From 510661d50c1d78f3e90775e71a511f6d97f4319e Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 18 Aug 2024 07:56:04 +0800 Subject: [PATCH 36/84] fix typo Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/utils/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py index d3e8bc6c0e6..91aabec6750 100644 --- a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py +++ b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py @@ -271,6 +271,7 @@ def get_pod_template_spec( return pod_template_spec + def get_pvc_spec( pvc_name: str, namespace: str, @@ -288,7 
+289,7 @@ def get_pvc_spec( pvc_spec = models.V1PersistentVolumeClaim( api_version="v1", kind="PersistentVolumeClaim", - metadata={"name": pvc_name, "namepsace": namespace}, + metadata={"name": pvc_name, "namespace": namespace}, spec=models.V1PersistentVolumeClaimSpec( access_modes=storage_config["access_modes"], resources=models.V1ResourceRequirements( From f6b15a2b3d44461c79d155c1f09f76d6ed2c65c5 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 18 Aug 2024 10:53:26 +0800 Subject: [PATCH 37/84] resolve conflicts and add check for case of no parameters Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index b48950a3d60..c9c4ad370d5 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -333,10 +333,20 @@ class name in this argument. ) if ( - model_provider_parameters is not None - or dataset_provider_parameters is not None - or trainer_parameters is not None - ) and (objective is not None or parameters is not None): + ( + model_provider_parameters is not None + or dataset_provider_parameters is not None + or trainer_parameters is not None + ) + and (objective is not None or parameters is not None) + ) or ( + ( + model_provider_parameters is None + and dataset_provider_parameters is None + and trainer_parameters is None + ) + and (objective is None and parameters is None) + ): raise ValueError( "Invalid configuration for creating a Katib Experiment for hyperparameter " "optimization. 
You should only specify one of the following options:\n" @@ -1467,9 +1477,9 @@ def get_success_trial_details( ): output = {} output["name"] = trial.metadata.name - output["parameter_assignments"] = ( - trial.spec.parameter_assignments - ) + output[ + "parameter_assignments" + ] = trial.spec.parameter_assignments output["metrics"] = trial.status.observation.metrics result.append(output) except multiprocessing.TimeoutError: From 6a3e046169019f0742f15a58a53f8c184a6a42d4 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 18 Aug 2024 11:01:04 +0800 Subject: [PATCH 38/84] fix format Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 35 +++++++++---------- .../v1beta1/kubeflow/katib/utils/utils.py | 4 +-- 2 files changed, 17 insertions(+), 22 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index c9c4ad370d5..86688136d6e 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -349,7 +349,7 @@ class name in this argument. ): raise ValueError( "Invalid configuration for creating a Katib Experiment for hyperparameter " - "optimization. You should only specify one of the following options:\n" + "optimization. You should specify one of the following options:\n" "1. Use external models and datasets: specify `model_provider_parameters`, " "`dataset_provider_parameters` and `trainer_parameters`;\n" "2. Use custom objective function: specify `objective`, `base_image` and " @@ -494,25 +494,22 @@ class name in this argument. 
raise ValueError("One of the required parameters is None") try: - from kubeflow.storage_initializer.constants import \ - VOLUME_PATH_DATASET - from kubeflow.storage_initializer.constants import \ - VOLUME_PATH_MODEL - from kubeflow.storage_initializer.hugging_face import \ - HuggingFaceDatasetParams - from kubeflow.storage_initializer.hugging_face import \ - HuggingFaceModelParams - from kubeflow.storage_initializer.hugging_face import \ - HuggingFaceTrainerParams + from kubeflow.storage_initializer.constants import VOLUME_PATH_DATASET + from kubeflow.storage_initializer.constants import VOLUME_PATH_MODEL + from kubeflow.storage_initializer.hugging_face import ( + HuggingFaceDatasetParams, + ) + from kubeflow.storage_initializer.hugging_face import ( + HuggingFaceModelParams, + ) + from kubeflow.storage_initializer.hugging_face import ( + HuggingFaceTrainerParams, + ) from kubeflow.storage_initializer.s3 import S3DatasetParams - from kubeflow.training.constants.constants import \ - STORAGE_INITIALIZER - from kubeflow.training.constants.constants import \ - STORAGE_INITIALIZER_IMAGE - from kubeflow.training.constants.constants import \ - STORAGE_INITIALIZER_VOLUME_MOUNT - from kubeflow.training.constants.constants import \ - TRAINER_TRANSFORMER_IMAGE + from kubeflow.training.constants.constants import STORAGE_INITIALIZER + from kubeflow.training.constants.constants import STORAGE_INITIALIZER_IMAGE + from kubeflow.training.constants.constants import STORAGE_INITIALIZER_VOLUME_MOUNT + from kubeflow.training.constants.constants import TRAINER_TRANSFORMER_IMAGE import peft import transformers except ImportError: diff --git a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py index 91aabec6750..77b10944f67 100644 --- a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py +++ b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py @@ -258,9 +258,7 @@ def get_pod_template_spec( # Create Pod template spec. 
If the value is None, Pod doesn't have that parameter pod_template_spec = models.V1PodTemplateSpec( - metadata=models.V1ObjectMeta( - annotations={"sidecar.istio.io/inject": "false"} - ), + metadata=models.V1ObjectMeta(annotations={"sidecar.istio.io/inject": "false"}), spec=models.V1PodSpec( init_containers=init_containers, containers=containers, From 25541b92a6d20018d97f5c137d0741228c3402e3 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 18 Aug 2024 11:07:16 +0800 Subject: [PATCH 39/84] fix format Signed-off-by: helenxie-bit --- .../v1beta1/kubeflow/katib/api/katib_client.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 86688136d6e..3fa4de8e3af 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -507,9 +507,15 @@ class name in this argument. ) from kubeflow.storage_initializer.s3 import S3DatasetParams from kubeflow.training.constants.constants import STORAGE_INITIALIZER - from kubeflow.training.constants.constants import STORAGE_INITIALIZER_IMAGE - from kubeflow.training.constants.constants import STORAGE_INITIALIZER_VOLUME_MOUNT - from kubeflow.training.constants.constants import TRAINER_TRANSFORMER_IMAGE + from kubeflow.training.constants.constants import ( + STORAGE_INITIALIZER_IMAGE, + ) + from kubeflow.training.constants.constants import ( + STORAGE_INITIALIZER_VOLUME_MOUNT, + ) + from kubeflow.training.constants.constants import ( + TRAINER_TRANSFORMER_IMAGE, + ) import peft import transformers except ImportError: @@ -1474,9 +1480,9 @@ def get_success_trial_details( ): output = {} output["name"] = trial.metadata.name - output[ - "parameter_assignments" - ] = trial.spec.parameter_assignments + output["parameter_assignments"] = ( + trial.spec.parameter_assignments + ) output["metrics"] = trial.status.observation.metrics 
result.append(output) except multiprocessing.TimeoutError: From 99e74d19a96c0205e8ac496861cc38b57f1fd22e Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 18 Aug 2024 11:10:47 +0800 Subject: [PATCH 40/84] fix format Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/utils/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py index 77b10944f67..61c6b864f45 100644 --- a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py +++ b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py @@ -242,7 +242,6 @@ def get_container_spec( if env_from: container_spec.env_from = env_from - return container_spec From 96cf99c8733c549667c747345ea40fb20b1c5242 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 18 Aug 2024 15:02:46 +0800 Subject: [PATCH 41/84] fix flake8 error Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 170 +++++------------- .../kubeflow/katib/constants/constants.py | 1 - .../v1beta1/kubeflow/katib/utils/utils.py | 8 +- 3 files changed, 47 insertions(+), 132 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 3fa4de8e3af..8591a01257d 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -13,24 +13,22 @@ # limitations under the License. 
import copy -import inspect import json import logging import multiprocessing -import textwrap import time -from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING, Union +from typing import Any, Callable, Dict, List, Optional, Union logger = logging.getLogger(__name__) -import grpc +import kubeflow.katib.katib_api_pb2 as katib_api_pb2 from kubeflow.katib import models from kubeflow.katib.api_client import ApiClient from kubeflow.katib.constants import constants -import kubeflow.katib.katib_api_pb2 as katib_api_pb2 from kubeflow.katib.utils import utils -from kubernetes import client -from kubernetes import config +from kubernetes import client, config + +import grpc logger = logging.getLogger(__name__) @@ -136,18 +134,14 @@ def create_experiment( "name" ] # if "generate_name" is used, "name" gets a prefix from server except multiprocessing.TimeoutError: - raise TimeoutError( - f"Timeout to create Katib Experiment: {namespace}/{experiment_name}" - ) + raise TimeoutError(f"Timeout to create Katib Experiment: {namespace}/{experiment_name}") except Exception as e: if hasattr(e, "status") and e.status == 409: raise Exception( f"A Katib Experiment with the name " f"{namespace}/{experiment_name} already exists." ) - raise RuntimeError( - f"Failed to create Katib Experiment: {namespace}/{experiment_name}" - ) + raise RuntimeError(f"Failed to create Katib Experiment: {namespace}/{experiment_name}") logger.debug(f"Experiment {namespace}/{experiment_name} has been created") @@ -189,9 +183,7 @@ def tune( Union[Dict[str, str], List[Union[client.V1EnvVar, client.V1EnvFromSource]]] ] = None, algorithm_name: str = "random", - algorithm_settings: Union[ - dict, List[models.V1beta1AlgorithmSetting], None - ] = None, + algorithm_settings: Union[dict, List[models.V1beta1AlgorithmSetting], None] = None, objective_metric_name: str = None, additional_metric_names: List[str] = [], objective_type: str = "maximize", @@ -402,10 +394,7 @@ class name in this argument. 
env = [] env_from = [] if isinstance(env_per_trial, dict): - env = [ - client.V1EnvVar(name=str(k), value=str(v)) - for k, v in env_per_trial.items() - ] + env = [client.V1EnvVar(name=str(k), value=str(v)) for k, v in env_per_trial.items()] elif env_per_trial: for x in env_per_trial: if isinstance(x, client.V1EnvVar): @@ -413,17 +402,13 @@ class name in this argument. elif isinstance(x, client.V1EnvFromSource): env_from.append(x) else: - raise ValueError( - f"Incorrect value for env_per_trial: {env_per_trial}" - ) + raise ValueError(f"Incorrect value for env_per_trial: {env_per_trial}") # Add metrics collector to the Katib Experiment. # Up to now, We only support parameter `kind`, of which default value is # `StdOut`, to specify the kind of metrics collector. experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( - collector=models.V1beta1CollectorSpec( - kind=metrics_collector_config["kind"] - ), + collector=models.V1beta1CollectorSpec(kind=metrics_collector_config["kind"]), source=models.V1beta1SourceSpec( filter=models.V1beta1FilterSpec( metrics_format=[ @@ -494,30 +479,21 @@ class name in this argument. 
raise ValueError("One of the required parameters is None") try: - from kubeflow.storage_initializer.constants import VOLUME_PATH_DATASET - from kubeflow.storage_initializer.constants import VOLUME_PATH_MODEL - from kubeflow.storage_initializer.hugging_face import ( - HuggingFaceDatasetParams, + from kubeflow.storage_initializer.constants import ( + VOLUME_PATH_DATASET, + VOLUME_PATH_MODEL, ) from kubeflow.storage_initializer.hugging_face import ( + HuggingFaceDatasetParams, HuggingFaceModelParams, ) - from kubeflow.storage_initializer.hugging_face import ( - HuggingFaceTrainerParams, - ) from kubeflow.storage_initializer.s3 import S3DatasetParams - from kubeflow.training.constants.constants import STORAGE_INITIALIZER from kubeflow.training.constants.constants import ( + STORAGE_INITIALIZER, STORAGE_INITIALIZER_IMAGE, - ) - from kubeflow.training.constants.constants import ( STORAGE_INITIALIZER_VOLUME_MOUNT, - ) - from kubeflow.training.constants.constants import ( TRAINER_TRANSFORMER_IMAGE, ) - import peft - import transformers except ImportError: raise ImportError( "Tune API dependencies not installed. " @@ -536,15 +512,11 @@ class name in this argument. ), ) except Exception as e: - pvc_list = self.core_api.list_namespaced_persistent_volume_claim( - namespace - ) + pvc_list = self.core_api.list_namespaced_persistent_volume_claim(namespace) # Check if the PVC with the specified name exists. for pvc in pvc_list.items: if pvc.metadata.name == name: - print( - f"PVC '{name}' already exists in namespace " f"{namespace}." - ) + print(f"PVC '{name}' already exists in namespace " f"{namespace}.") break else: raise RuntimeError(f"failed to create PVC. Error: {e}") @@ -624,14 +596,12 @@ class name in this argument. 
init_container_spec = utils.get_container_spec( name=STORAGE_INITIALIZER, - base_image=STORAGE_INITIALIZER_IMAGE, + base_image="docker.io/helenxiehz428/test", #STORAGE_INITIALIZER_IMAGE, args=[ "--model_provider", mp, "--model_provider_parameters", - json.dumps( - model_provider_parameters.__dict__, cls=utils.SetEncoder - ), + json.dumps(model_provider_parameters.__dict__, cls=utils.SetEncoder), "--dataset_provider", dp, "--dataset_provider_parameters", @@ -645,7 +615,7 @@ class name in this argument. container_spec = utils.get_container_spec( name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, - base_image=TRAINER_TRANSFORMER_IMAGE, + base_image="docker.io/helenxiehz428/test_llm4", #TRAINER_TRANSFORMER_IMAGE, args=[ "--model_uri", model_provider_parameters.model_uri, @@ -666,9 +636,7 @@ class name in this argument. storage_initializer_volume = models.V1Volume( name=STORAGE_INITIALIZER, - persistent_volume_claim=models.V1PersistentVolumeClaimVolumeSource( - claim_name=name - ), + persistent_volume_claim=models.V1PersistentVolumeClaimVolumeSource(claim_name=name), ) pod_spec = utils.get_pod_template_spec( @@ -780,19 +748,13 @@ def list_experiments( ) response = thread.get(timeout) result = [ - self.api_client.deserialize( - utils.FakeResponse(item), models.V1beta1Experiment - ) + self.api_client.deserialize(utils.FakeResponse(item), models.V1beta1Experiment) for item in response.get("items") ] except multiprocessing.TimeoutError: - raise TimeoutError( - f"Timeout to list Katib Experiments in namespace: {namespace}" - ) + raise TimeoutError(f"Timeout to list Katib Experiments in namespace: {namespace}") except Exception: - raise RuntimeError( - f"Failed to list Katib Experiments in namespace: {namespace}" - ) + raise RuntimeError(f"Failed to list Katib Experiments in namespace: {namespace}") return result def get_experiment_conditions( @@ -1029,20 +991,14 @@ def wait_for_experiment_condition( # Wait for Failed condition. 
if ( expected_condition == constants.EXPERIMENT_CONDITION_FAILED - and self.is_experiment_failed( - name, namespace, experiment, apiserver_timeout - ) + and self.is_experiment_failed(name, namespace, experiment, apiserver_timeout) ): utils.print_experiment_status(experiment) - logger.debug( - f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" - ) + logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") return experiment # Raise exception if Experiment is Failed. - elif self.is_experiment_failed( - name, namespace, experiment, apiserver_timeout - ): + elif self.is_experiment_failed(name, namespace, experiment, apiserver_timeout): raise RuntimeError( f"Experiment: {namespace}/{name} is Failed. " f"Experiment conditions: {experiment.status.conditions}" @@ -1051,48 +1007,34 @@ def wait_for_experiment_condition( # Check if Experiment reaches Created condition. elif ( expected_condition == constants.EXPERIMENT_CONDITION_CREATED - and self.is_experiment_created( - name, namespace, experiment, apiserver_timeout - ) + and self.is_experiment_created(name, namespace, experiment, apiserver_timeout) ): utils.print_experiment_status(experiment) - logger.debug( - f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" - ) + logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") return experiment # Check if Experiment reaches Running condition. elif ( expected_condition == constants.EXPERIMENT_CONDITION_RUNNING - and self.is_experiment_running( - name, namespace, experiment, apiserver_timeout - ) + and self.is_experiment_running(name, namespace, experiment, apiserver_timeout) ): utils.print_experiment_status(experiment) - logger.debug( - f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" - ) + logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") return experiment # Check if Experiment reaches Restarting condition. 
elif ( expected_condition == constants.EXPERIMENT_CONDITION_RESTARTING - and self.is_experiment_restarting( - name, namespace, experiment, apiserver_timeout - ) + and self.is_experiment_restarting(name, namespace, experiment, apiserver_timeout) ): utils.print_experiment_status(experiment) - logger.debug( - f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" - ) + logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") return experiment # Check if Experiment reaches Succeeded condition. elif ( expected_condition == constants.EXPERIMENT_CONDITION_SUCCEEDED - and self.is_experiment_succeeded( - name, namespace, experiment, apiserver_timeout - ) + and self.is_experiment_succeeded(name, namespace, experiment, apiserver_timeout) ): utils.print_experiment_status(experiment) @@ -1219,9 +1161,7 @@ def delete_experiment( body=delete_options, ) except multiprocessing.TimeoutError: - raise TimeoutError( - f"Timeout to delete Katib Experiment: {namespace}/{name}" - ) + raise TimeoutError(f"Timeout to delete Katib Experiment: {namespace}/{name}") except Exception: raise RuntimeError(f"Failed to delete Katib Experiment: {namespace}/{name}") @@ -1303,19 +1243,13 @@ def list_suggestions( ) response = thread.get(timeout) result = [ - self.api_client.deserialize( - utils.FakeResponse(item), models.V1beta1Suggestion - ) + self.api_client.deserialize(utils.FakeResponse(item), models.V1beta1Suggestion) for item in response.get("items") ] except multiprocessing.TimeoutError: - raise TimeoutError( - f"Timeout to list Katib Suggestions in namespace: {namespace}" - ) + raise TimeoutError(f"Timeout to list Katib Suggestions in namespace: {namespace}") except Exception: - raise RuntimeError( - f"Failed to list Katib Suggestions in namespace: {namespace}" - ) + raise RuntimeError(f"Failed to list Katib Suggestions in namespace: {namespace}") return result def get_trial( @@ -1407,15 +1341,11 @@ def list_trials( ) response = thread.get(timeout) result = [ - 
self.api_client.deserialize( - utils.FakeResponse(item), models.V1beta1Trial - ) + self.api_client.deserialize(utils.FakeResponse(item), models.V1beta1Trial) for item in response.get("items") ] except multiprocessing.TimeoutError: - raise TimeoutError( - f"Timeout to list Katib Trials in namespace: {namespace}" - ) + raise TimeoutError(f"Timeout to list Katib Trials in namespace: {namespace}") except Exception: raise RuntimeError(f"Failed to list Katib Trials in namespace: {namespace}") return result @@ -1467,28 +1397,18 @@ def get_success_trial_details( ) response = thread.get(timeout) for item in response.get("items"): - trial = self.api_client.deserialize( - utils.FakeResponse(item), models.V1beta1Trial - ) - if ( - trial.status - and trial.status.conditions - and len(trial.status.conditions) > 0 - ): + trial = self.api_client.deserialize(utils.FakeResponse(item), models.V1beta1Trial) + if trial.status and trial.status.conditions and len(trial.status.conditions) > 0: if utils.has_condition( trial.status.conditions, constants.TRIAL_CONDITION_SUCCEEDED ): output = {} output["name"] = trial.metadata.name - output["parameter_assignments"] = ( - trial.spec.parameter_assignments - ) + output["parameter_assignments"] = trial.spec.parameter_assignments output["metrics"] = trial.status.observation.metrics result.append(output) except multiprocessing.TimeoutError: - raise TimeoutError( - f"Timeout to list Katib Trials in namespace: {namespace}" - ) + raise TimeoutError(f"Timeout to list Katib Trials in namespace: {namespace}") except Exception: raise RuntimeError(f"Failed to list Katib Trials in namespace: {namespace}") return result diff --git a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py index fa4e5882727..8e2620bc168 100644 --- a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py +++ b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py @@ -14,7 +14,6 @@ import os -from kubernetes import 
client # How long to wait in seconds for requests to the Kubernetes or gRPC API Server. DEFAULT_TIMEOUT = 120 diff --git a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py index 61c6b864f45..07601405de4 100644 --- a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py +++ b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py @@ -85,7 +85,6 @@ def validate_metrics_value(value: Any): def validate_objective_function(objective: Callable): - # Check if objective function is callable. if not callable(objective): raise ValueError( @@ -179,8 +178,7 @@ def get_command_using_train_func( # Install Python packages if that is required. if packages_to_install is not None: exec_script = ( - get_script_for_python_packages(packages_to_install, pip_index_url) - + exec_script + get_script_for_python_packages(packages_to_install, pip_index_url) + exec_script ) # Return container command and args to execute training function. @@ -289,9 +287,7 @@ def get_pvc_spec( metadata={"name": pvc_name, "namespace": namespace}, spec=models.V1PersistentVolumeClaimSpec( access_modes=storage_config["access_modes"], - resources=models.V1ResourceRequirements( - requests={"storage": storage_config["size"]} - ), + resources=models.V1ResourceRequirements(requests={"storage": storage_config["size"]}), ), ) From c56880602796687c97eccc2b884e46c7b5b5e0dc Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 18 Aug 2024 15:09:50 +0800 Subject: [PATCH 42/84] fix format Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/utils/utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py index 07601405de4..e743dae8816 100644 --- a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py +++ b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py @@ -178,7 +178,8 @@ def get_command_using_train_func( # Install Python packages if that is required. 
if packages_to_install is not None: exec_script = ( - get_script_for_python_packages(packages_to_install, pip_index_url) + exec_script + get_script_for_python_packages(packages_to_install, pip_index_url) + + exec_script ) # Return container command and args to execute training function. @@ -287,7 +288,9 @@ def get_pvc_spec( metadata={"name": pvc_name, "namespace": namespace}, spec=models.V1PersistentVolumeClaimSpec( access_modes=storage_config["access_modes"], - resources=models.V1ResourceRequirements(requests={"storage": storage_config["size"]}), + resources=models.V1ResourceRequirements( + requests={"storage": storage_config["size"]} + ), ), ) From 6f65253ff951e3683b3d88fbc0e4a4e3969490ae Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 18 Aug 2024 15:35:13 +0800 Subject: [PATCH 43/84] fix format Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 175 +++++++++++++----- 1 file changed, 127 insertions(+), 48 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 8591a01257d..7cc10d7521a 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -17,21 +17,26 @@ import logging import multiprocessing import time -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING, Union -logger = logging.getLogger(__name__) - -import kubeflow.katib.katib_api_pb2 as katib_api_pb2 from kubeflow.katib import models from kubeflow.katib.api_client import ApiClient from kubeflow.katib.constants import constants +import kubeflow.katib.katib_api_pb2 as katib_api_pb2 from kubeflow.katib.utils import utils -from kubernetes import client, config +from kubernetes import client +from kubernetes import config import grpc logger = logging.getLogger(__name__) +if TYPE_CHECKING: + from kubeflow.storage_initializer.hugging_face import 
HuggingFaceDatasetParams + from kubeflow.storage_initializer.hugging_face import HuggingFaceModelParams + from kubeflow.storage_initializer.hugging_face import HuggingFaceTrainerParams + from kubeflow.storage_initializer.s3 import S3DatasetParams + class KatibClient(object): def __init__( @@ -134,14 +139,18 @@ def create_experiment( "name" ] # if "generate_name" is used, "name" gets a prefix from server except multiprocessing.TimeoutError: - raise TimeoutError(f"Timeout to create Katib Experiment: {namespace}/{experiment_name}") + raise TimeoutError( + f"Timeout to create Katib Experiment: {namespace}/{experiment_name}" + ) except Exception as e: if hasattr(e, "status") and e.status == 409: raise Exception( f"A Katib Experiment with the name " f"{namespace}/{experiment_name} already exists." ) - raise RuntimeError(f"Failed to create Katib Experiment: {namespace}/{experiment_name}") + raise RuntimeError( + f"Failed to create Katib Experiment: {namespace}/{experiment_name}" + ) logger.debug(f"Experiment {namespace}/{experiment_name} has been created") @@ -183,7 +192,9 @@ def tune( Union[Dict[str, str], List[Union[client.V1EnvVar, client.V1EnvFromSource]]] ] = None, algorithm_name: str = "random", - algorithm_settings: Union[dict, List[models.V1beta1AlgorithmSetting], None] = None, + algorithm_settings: Union[ + dict, List[models.V1beta1AlgorithmSetting], None + ] = None, objective_metric_name: str = None, additional_metric_names: List[str] = [], objective_type: str = "maximize", @@ -230,8 +241,8 @@ def tune( dataset_provider_parameters: Parameters for the dataset provider in the Storage Initializer. For example, name of the HuggingFace dataset or AWS S3 configuration. - This argument must be the type of `kubeflow.storage_initializer.hugging_face.HuggingFaceDatasetParams` - or `kubeflow.storage_initializer.s3.S3DatasetParams` + This argument must be the type of `kubeflow.storage_initializer.hugging_face. 
+ HuggingFaceDatasetParams` or `kubeflow.storage_initializer.s3.S3DatasetParams`. trainer_parameters: Parameters for configuring the training process, including settings for the hyperparameters search space. It should be of type `HuggingFaceTrainerParams`. You should use the Katib SDK to define @@ -394,7 +405,10 @@ class name in this argument. env = [] env_from = [] if isinstance(env_per_trial, dict): - env = [client.V1EnvVar(name=str(k), value=str(v)) for k, v in env_per_trial.items()] + env = [ + client.V1EnvVar(name=str(k), value=str(v)) + for k, v in env_per_trial.items() + ] elif env_per_trial: for x in env_per_trial: if isinstance(x, client.V1EnvVar): @@ -402,13 +416,17 @@ class name in this argument. elif isinstance(x, client.V1EnvFromSource): env_from.append(x) else: - raise ValueError(f"Incorrect value for env_per_trial: {env_per_trial}") + raise ValueError( + f"Incorrect value for env_per_trial: {env_per_trial}" + ) # Add metrics collector to the Katib Experiment. # Up to now, We only support parameter `kind`, of which default value is # `StdOut`, to specify the kind of metrics collector. experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( - collector=models.V1beta1CollectorSpec(kind=metrics_collector_config["kind"]), + collector=models.V1beta1CollectorSpec( + kind=metrics_collector_config["kind"] + ), source=models.V1beta1SourceSpec( filter=models.V1beta1FilterSpec( metrics_format=[ @@ -479,19 +497,23 @@ class name in this argument. 
raise ValueError("One of the required parameters is None") try: - from kubeflow.storage_initializer.constants import ( - VOLUME_PATH_DATASET, - VOLUME_PATH_MODEL, - ) + from kubeflow.storage_initializer.constants import VOLUME_PATH_DATASET + from kubeflow.storage_initializer.constants import VOLUME_PATH_MODEL from kubeflow.storage_initializer.hugging_face import ( HuggingFaceDatasetParams, + ) + from kubeflow.storage_initializer.hugging_face import ( HuggingFaceModelParams, ) from kubeflow.storage_initializer.s3 import S3DatasetParams + from kubeflow.training.constants.constants import STORAGE_INITIALIZER from kubeflow.training.constants.constants import ( - STORAGE_INITIALIZER, STORAGE_INITIALIZER_IMAGE, + ) + from kubeflow.training.constants.constants import ( STORAGE_INITIALIZER_VOLUME_MOUNT, + ) + from kubeflow.training.constants.constants import ( TRAINER_TRANSFORMER_IMAGE, ) except ImportError: @@ -512,11 +534,15 @@ class name in this argument. ), ) except Exception as e: - pvc_list = self.core_api.list_namespaced_persistent_volume_claim(namespace) + pvc_list = self.core_api.list_namespaced_persistent_volume_claim( + namespace + ) # Check if the PVC with the specified name exists. for pvc in pvc_list.items: if pvc.metadata.name == name: - print(f"PVC '{name}' already exists in namespace " f"{namespace}.") + print( + f"PVC '{name}' already exists in namespace " f"{namespace}." + ) break else: raise RuntimeError(f"failed to create PVC. Error: {e}") @@ -534,7 +560,8 @@ class name in this argument. dp = "hf" else: raise ValueError( - "Dataset provider parameters must be an instance of S3DatasetParams or HuggingFaceDatasetParams." + "Dataset provider parameters must be an instance of S3DatasetParams " + "or HuggingFaceDatasetParams." ) # Iterate over input parameters. @@ -596,12 +623,14 @@ class name in this argument. 
init_container_spec = utils.get_container_spec( name=STORAGE_INITIALIZER, - base_image="docker.io/helenxiehz428/test", #STORAGE_INITIALIZER_IMAGE, + base_image=STORAGE_INITIALIZER_IMAGE, args=[ "--model_provider", mp, "--model_provider_parameters", - json.dumps(model_provider_parameters.__dict__, cls=utils.SetEncoder), + json.dumps( + model_provider_parameters.__dict__, cls=utils.SetEncoder + ), "--dataset_provider", dp, "--dataset_provider_parameters", @@ -615,7 +644,7 @@ class name in this argument. container_spec = utils.get_container_spec( name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, - base_image="docker.io/helenxiehz428/test_llm4", #TRAINER_TRANSFORMER_IMAGE, + base_image=TRAINER_TRANSFORMER_IMAGE, args=[ "--model_uri", model_provider_parameters.model_uri, @@ -636,7 +665,9 @@ class name in this argument. storage_initializer_volume = models.V1Volume( name=STORAGE_INITIALIZER, - persistent_volume_claim=models.V1PersistentVolumeClaimVolumeSource(claim_name=name), + persistent_volume_claim=models.V1PersistentVolumeClaimVolumeSource( + claim_name=name + ), ) pod_spec = utils.get_pod_template_spec( @@ -748,13 +779,19 @@ def list_experiments( ) response = thread.get(timeout) result = [ - self.api_client.deserialize(utils.FakeResponse(item), models.V1beta1Experiment) + self.api_client.deserialize( + utils.FakeResponse(item), models.V1beta1Experiment + ) for item in response.get("items") ] except multiprocessing.TimeoutError: - raise TimeoutError(f"Timeout to list Katib Experiments in namespace: {namespace}") + raise TimeoutError( + f"Timeout to list Katib Experiments in namespace: {namespace}" + ) except Exception: - raise RuntimeError(f"Failed to list Katib Experiments in namespace: {namespace}") + raise RuntimeError( + f"Failed to list Katib Experiments in namespace: {namespace}" + ) return result def get_experiment_conditions( @@ -991,14 +1028,20 @@ def wait_for_experiment_condition( # Wait for Failed condition. 
if ( expected_condition == constants.EXPERIMENT_CONDITION_FAILED - and self.is_experiment_failed(name, namespace, experiment, apiserver_timeout) + and self.is_experiment_failed( + name, namespace, experiment, apiserver_timeout + ) ): utils.print_experiment_status(experiment) - logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") + logger.debug( + f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" + ) return experiment # Raise exception if Experiment is Failed. - elif self.is_experiment_failed(name, namespace, experiment, apiserver_timeout): + elif self.is_experiment_failed( + name, namespace, experiment, apiserver_timeout + ): raise RuntimeError( f"Experiment: {namespace}/{name} is Failed. " f"Experiment conditions: {experiment.status.conditions}" @@ -1007,34 +1050,48 @@ def wait_for_experiment_condition( # Check if Experiment reaches Created condition. elif ( expected_condition == constants.EXPERIMENT_CONDITION_CREATED - and self.is_experiment_created(name, namespace, experiment, apiserver_timeout) + and self.is_experiment_created( + name, namespace, experiment, apiserver_timeout + ) ): utils.print_experiment_status(experiment) - logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") + logger.debug( + f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" + ) return experiment # Check if Experiment reaches Running condition. elif ( expected_condition == constants.EXPERIMENT_CONDITION_RUNNING - and self.is_experiment_running(name, namespace, experiment, apiserver_timeout) + and self.is_experiment_running( + name, namespace, experiment, apiserver_timeout + ) ): utils.print_experiment_status(experiment) - logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") + logger.debug( + f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" + ) return experiment # Check if Experiment reaches Restarting condition. 
elif ( expected_condition == constants.EXPERIMENT_CONDITION_RESTARTING - and self.is_experiment_restarting(name, namespace, experiment, apiserver_timeout) + and self.is_experiment_restarting( + name, namespace, experiment, apiserver_timeout + ) ): utils.print_experiment_status(experiment) - logger.debug(f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n") + logger.debug( + f"Experiment: {namespace}/{name} is {expected_condition}\n\n\n" + ) return experiment # Check if Experiment reaches Succeeded condition. elif ( expected_condition == constants.EXPERIMENT_CONDITION_SUCCEEDED - and self.is_experiment_succeeded(name, namespace, experiment, apiserver_timeout) + and self.is_experiment_succeeded( + name, namespace, experiment, apiserver_timeout + ) ): utils.print_experiment_status(experiment) @@ -1161,7 +1218,9 @@ def delete_experiment( body=delete_options, ) except multiprocessing.TimeoutError: - raise TimeoutError(f"Timeout to delete Katib Experiment: {namespace}/{name}") + raise TimeoutError( + f"Timeout to delete Katib Experiment: {namespace}/{name}" + ) except Exception: raise RuntimeError(f"Failed to delete Katib Experiment: {namespace}/{name}") @@ -1243,13 +1302,19 @@ def list_suggestions( ) response = thread.get(timeout) result = [ - self.api_client.deserialize(utils.FakeResponse(item), models.V1beta1Suggestion) + self.api_client.deserialize( + utils.FakeResponse(item), models.V1beta1Suggestion + ) for item in response.get("items") ] except multiprocessing.TimeoutError: - raise TimeoutError(f"Timeout to list Katib Suggestions in namespace: {namespace}") + raise TimeoutError( + f"Timeout to list Katib Suggestions in namespace: {namespace}" + ) except Exception: - raise RuntimeError(f"Failed to list Katib Suggestions in namespace: {namespace}") + raise RuntimeError( + f"Failed to list Katib Suggestions in namespace: {namespace}" + ) return result def get_trial( @@ -1341,11 +1406,15 @@ def list_trials( ) response = thread.get(timeout) result = [ - 
self.api_client.deserialize(utils.FakeResponse(item), models.V1beta1Trial) + self.api_client.deserialize( + utils.FakeResponse(item), models.V1beta1Trial + ) for item in response.get("items") ] except multiprocessing.TimeoutError: - raise TimeoutError(f"Timeout to list Katib Trials in namespace: {namespace}") + raise TimeoutError( + f"Timeout to list Katib Trials in namespace: {namespace}" + ) except Exception: raise RuntimeError(f"Failed to list Katib Trials in namespace: {namespace}") return result @@ -1397,18 +1466,28 @@ def get_success_trial_details( ) response = thread.get(timeout) for item in response.get("items"): - trial = self.api_client.deserialize(utils.FakeResponse(item), models.V1beta1Trial) - if trial.status and trial.status.conditions and len(trial.status.conditions) > 0: + trial = self.api_client.deserialize( + utils.FakeResponse(item), models.V1beta1Trial + ) + if ( + trial.status + and trial.status.conditions + and len(trial.status.conditions) > 0 + ): if utils.has_condition( trial.status.conditions, constants.TRIAL_CONDITION_SUCCEEDED ): output = {} output["name"] = trial.metadata.name - output["parameter_assignments"] = trial.spec.parameter_assignments + output["parameter_assignments"] = ( + trial.spec.parameter_assignments + ) output["metrics"] = trial.status.observation.metrics result.append(output) except multiprocessing.TimeoutError: - raise TimeoutError(f"Timeout to list Katib Trials in namespace: {namespace}") + raise TimeoutError( + f"Timeout to list Katib Trials in namespace: {namespace}" + ) except Exception: raise RuntimeError(f"Failed to list Katib Trials in namespace: {namespace}") return result From ad17ac9578a03c85c2a630774dd2e39f3488a931 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 18 Aug 2024 15:42:47 +0800 Subject: [PATCH 44/84] fix format Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 3 +-- sdk/python/v1beta1/kubeflow/katib/constants/constants.py | 1 - 2 files changed, 1 
insertion(+), 3 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 7cc10d7521a..a80fb15b9db 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -19,6 +19,7 @@ import time from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING, Union +import grpc from kubeflow.katib import models from kubeflow.katib.api_client import ApiClient from kubeflow.katib.constants import constants @@ -27,8 +28,6 @@ from kubernetes import client from kubernetes import config -import grpc - logger = logging.getLogger(__name__) if TYPE_CHECKING: diff --git a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py index 8e2620bc168..1e0478f48f8 100644 --- a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py +++ b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py @@ -14,7 +14,6 @@ import os - # How long to wait in seconds for requests to the Kubernetes or gRPC API Server. 
DEFAULT_TIMEOUT = 120 From 9a1e2dfab2fa6aa8ad1645d3b270919b097b948f Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 18 Aug 2024 15:45:06 +0800 Subject: [PATCH 45/84] fix format Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index a80fb15b9db..7cc10d7521a 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -19,7 +19,6 @@ import time from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING, Union -import grpc from kubeflow.katib import models from kubeflow.katib.api_client import ApiClient from kubeflow.katib.constants import constants @@ -28,6 +27,8 @@ from kubernetes import client from kubernetes import config +import grpc + logger = logging.getLogger(__name__) if TYPE_CHECKING: From dd12cc2498f949e79718d5181bd85babaea9ac27 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 19 Aug 2024 19:44:10 +0800 Subject: [PATCH 46/84] fix format Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 7cc10d7521a..a80fb15b9db 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -19,6 +19,7 @@ import time from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING, Union +import grpc from kubeflow.katib import models from kubeflow.katib.api_client import ApiClient from kubeflow.katib.constants import constants @@ -27,8 +28,6 @@ from kubernetes import client from kubernetes import config -import grpc - logger = logging.getLogger(__name__) if TYPE_CHECKING: From 
160065a7a964f1393ed5ed16143abf979e5ee30c Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 21 Aug 2024 15:44:12 +0800 Subject: [PATCH 47/84] update isort file to black and fix typo Signed-off-by: helenxie-bit --- .pre-commit-config.yaml | 2 +- .../kubeflow/katib/api/katib_client.py | 34 +++++++++---------- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0681a816207..f0034dcae05 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,7 +10,7 @@ repos: hooks: - id: isort name: isort - entry: isort --profile google + entry: isort --profile black - repo: https://github.com/psf/black rev: 24.2.0 hooks: diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index a80fb15b9db..54586516a7a 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -17,23 +17,25 @@ import logging import multiprocessing import time -from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union -import grpc +import kubeflow.katib.katib_api_pb2 as katib_api_pb2 from kubeflow.katib import models from kubeflow.katib.api_client import ApiClient from kubeflow.katib.constants import constants -import kubeflow.katib.katib_api_pb2 as katib_api_pb2 from kubeflow.katib.utils import utils -from kubernetes import client -from kubernetes import config +from kubernetes import client, config + +import grpc logger = logging.getLogger(__name__) if TYPE_CHECKING: - from kubeflow.storage_initializer.hugging_face import HuggingFaceDatasetParams - from kubeflow.storage_initializer.hugging_face import HuggingFaceModelParams - from kubeflow.storage_initializer.hugging_face import HuggingFaceTrainerParams + from kubeflow.storage_initializer.hugging_face import ( + HuggingFaceDatasetParams, + 
HuggingFaceModelParams, + HuggingFaceTrainerParams, + ) from kubeflow.storage_initializer.s3 import S3DatasetParams @@ -420,7 +422,7 @@ class name in this argument. ) # Add metrics collector to the Katib Experiment. - # Up to now, We only support parameter `kind`, of which default value is + # Up to now, we only support parameter `kind`, of which default value is # `StdOut`, to specify the kind of metrics collector. experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( collector=models.V1beta1CollectorSpec( @@ -496,23 +498,19 @@ class name in this argument. raise ValueError("One of the required parameters is None") try: - from kubeflow.storage_initializer.constants import VOLUME_PATH_DATASET - from kubeflow.storage_initializer.constants import VOLUME_PATH_MODEL - from kubeflow.storage_initializer.hugging_face import ( - HuggingFaceDatasetParams, + from kubeflow.storage_initializer.constants import ( + VOLUME_PATH_DATASET, + VOLUME_PATH_MODEL, ) from kubeflow.storage_initializer.hugging_face import ( + HuggingFaceDatasetParams, HuggingFaceModelParams, ) from kubeflow.storage_initializer.s3 import S3DatasetParams - from kubeflow.training.constants.constants import STORAGE_INITIALIZER from kubeflow.training.constants.constants import ( + STORAGE_INITIALIZER, STORAGE_INITIALIZER_IMAGE, - ) - from kubeflow.training.constants.constants import ( STORAGE_INITIALIZER_VOLUME_MOUNT, - ) - from kubeflow.training.constants.constants import ( TRAINER_TRANSFORMER_IMAGE, ) except ImportError: From 48a3ee07ba965229185a0a8404e8f338472e5e25 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 21 Aug 2024 16:38:46 +0800 Subject: [PATCH 48/84] modify the set of metrics format Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 30 +++++++++---------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 
54586516a7a..6993919dea2 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -422,22 +422,10 @@ class name in this argument. ) # Add metrics collector to the Katib Experiment. - # Up to now, we only support parameter `kind`, of which default value is - # `StdOut`, to specify the kind of metrics collector. + # Up to now, we only support parameter `kind`, of which default value + # is `StdOut`, to specify the kind of metrics collector. experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( - collector=models.V1beta1CollectorSpec( - kind=metrics_collector_config["kind"] - ), - source=models.V1beta1SourceSpec( - filter=models.V1beta1FilterSpec( - metrics_format=[ - # For example: train_loss=0.846 - r"([\w|-]+)\s*=\s*([+-]?\d*(\.\d+)?([Ee][+-]?\d+)?)", - # For example: 'train_loss':0.846 - r"'([\w|-]+)'\s*:\s*([+-]?\d*(\.\d+)?([Ee][+-]?\d+)?)", - ] - ) - ), + collector=models.V1beta1CollectorSpec(kind=metrics_collector_config["kind"]) ) # Create Container and Pod specifications. @@ -515,10 +503,20 @@ class name in this argument. ) except ImportError: raise ImportError( - "Tune API dependencies not installed. " + "LLM dependencies for Tune API are not installed. " + "Run: pip install -U 'kubeflow-katib[huggingface]' " ) + # Add metrics format for the metrics collector. + experiment.spec.metrics_collector_spec.source=models.V1beta1SourceSpec( + filter=models.V1beta1FilterSpec( + metrics_format=[ + # For example: 'train_loss':0.846 + r"'([\w|-]+)'\s*:\s*([+-]?\d*(\.\d+)?([Ee][+-]?\d+)?)", + ] + ) + ) + # Create PVC for the Storage Initializer. # TODO (helenxie-bit): PVC Creation should be part of Katib Controller. 
try: From 0f8a8efcb6e22aab689da5270f6cdb13d406cf17 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 22 Aug 2024 05:26:53 +0800 Subject: [PATCH 49/84] update tune API Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 377 +++++++++++------- .../v1beta1/kubeflow/katib/utils/utils.py | 222 +++-------- 2 files changed, 278 insertions(+), 321 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 6993919dea2..497b8cd9723 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -12,15 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -import copy +import inspect import json import logging import multiprocessing +import textwrap import time -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union import kubeflow.katib.katib_api_pb2 as katib_api_pb2 -from kubeflow.katib import models +from kubeflow.katib import models, types from kubeflow.katib.api_client import ApiClient from kubeflow.katib.constants import constants from kubeflow.katib.utils import utils @@ -30,14 +31,6 @@ logger = logging.getLogger(__name__) -if TYPE_CHECKING: - from kubeflow.storage_initializer.hugging_face import ( - HuggingFaceDatasetParams, - HuggingFaceModelParams, - HuggingFaceTrainerParams, - ) - from kubeflow.storage_initializer.s3 import S3DatasetParams - class KatibClient(object): def __init__( @@ -171,15 +164,15 @@ def create_experiment( ) ) + # fmt: off def tune( self, # TODO (andreyvelich): How to be consistent with other APIs (name) ? 
name: str, - model_provider_parameters: Optional["HuggingFaceModelParams"] = None, - dataset_provider_parameters: Optional[ - Union["HuggingFaceDatasetParams", "S3DatasetParams"] - ] = None, - trainer_parameters: Optional["HuggingFaceTrainerParams"] = None, + model_provider_parameters: Optional["HuggingFaceModelParams"] = None, # noqa: F821 + dataset_provider_parameters: Optional[Union[ + "HuggingFaceDatasetParams", "S3DatasetParams"]] = None, # noqa: F821 + trainer_parameters: Optional["HuggingFaceTrainerParams"] = None, # noqa: F821 storage_config: Optional[Dict[str, Optional[Union[str, List[str]]]]] = { "size": constants.PVC_DEFAULT_SIZE, "storage_class": None, @@ -203,12 +196,16 @@ def tune( max_trial_count: int = None, parallel_trial_count: int = None, max_failed_trial_count: int = None, - resources_per_trial: Union[dict, client.V1ResourceRequirements, None] = None, + resources_per_trial: Union[ + dict, client.V1ResourceRequirements, types.TrainerResources, None + ] = None, retain_trials: bool = False, packages_to_install: List[str] = None, pip_index_url: str = "https://pypi.org/simple", metrics_collector_config: Dict[str, Any] = {"kind": "StdOut"}, ): + # fmt: on + """ Create HyperParameter Tuning Katib Experiment using one of the following options: @@ -296,8 +293,9 @@ class name in this argument. parallel_trial_count: Number of Trials that Experiment runs in parallel. max_failed_trial_count: Maximum number of Trials allowed to fail. resources_per_trial: A parameter that lets you specify how much resources - each trial container should have. You can either specify a - kubernetes.client.V1ResourceRequirements object (documented here: + each trial container should have. + For custom objective function, you can either specify a kubernetes.client. 
+ V1ResourceRequirements object (documented here: https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1ResourceRequirements.md) or a dictionary that includes one or more of the following keys: `cpu`, `memory`, or `gpu` (other keys will be ignored). Appropriate values @@ -313,6 +311,29 @@ class name in this argument. `nvidia.com/gpu`, i.e. an NVIDIA GPU. If you need a different type of GPU, pass in a V1ResourceRequirement instance instead, since it's more flexible. This parameter is optional and defaults to None. + + For external models and datasets, you can specify a types.TrainerResources object, + which includes `num_workers`, `num_procs_per_worker`, and `resources_per_worker`. + For example: + ``` + resources_per_trial = types.TrainerResources( + num_workers=4, + num_procs_per_worker=2, + resources_per_worker={ + "gpu": "2", + "cpu": "5", + "memory": "10Gi" + } + ) + ``` + - num_workers: Number of PyTorchJob workers. + - num_procs_per_worker: Number of processes per PyTorchJob worker for + `torchrun` CLI. You can use this parameter if you want to use more than 1 GPU + per PyTorchJob worker. + - resources_per_worker: A parameter that lets you specify how much resources + each PyTorchJob worker container should have. You can either specify + a kubernetes.client.V1ResourceRequirements object or a dictionary, same as + resources specified under the option of custom objective function. retain_trials: Whether Trials' resources (e.g. pods) are deleted after Succeeded state. packages_to_install: List of Python packages to install in addition to the base image packages. These packages are installed before @@ -402,41 +423,18 @@ class name in this argument. if max_failed_trial_count is not None: experiment.spec.max_failed_trial_count = max_failed_trial_count - # Add environment variables to the Katib Experiment. 
- env = [] - env_from = [] - if isinstance(env_per_trial, dict): - env = [ - client.V1EnvVar(name=str(k), value=str(v)) - for k, v in env_per_trial.items() - ] - elif env_per_trial: - for x in env_per_trial: - if isinstance(x, client.V1EnvVar): - env.append(x) - elif isinstance(x, client.V1EnvFromSource): - env_from.append(x) - else: - raise ValueError( - f"Incorrect value for env_per_trial: {env_per_trial}" - ) - - # Add metrics collector to the Katib Experiment. - # Up to now, we only support parameter `kind`, of which default value - # is `StdOut`, to specify the kind of metrics collector. - experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( - collector=models.V1beta1CollectorSpec(kind=metrics_collector_config["kind"]) - ) - - # Create Container and Pod specifications. # If users choose to use a custom objective function. if objective is not None: - if not base_image or not parameters: - raise ValueError("One of the required parameters is None.") - # Validate objective function. utils.validate_objective_function(objective) + # Extract objective function implementation. + objective_code = inspect.getsource(objective) + + # Objective function might be defined in some indented scope + # (e.g. in another function). We need to dedent the function code. + objective_code = textwrap.dedent(objective_code) + # Iterate over input parameters. input_params = {} experiment_params = [] @@ -459,21 +457,110 @@ class name in this argument. # Otherwise, add value to the function input. input_params[p_name] = p_value - container_spec = utils.get_container_spec( - name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, - base_image=base_image, - train_func=objective, - train_func_parameters=input_params, - packages_to_install=packages_to_install, - pip_index_url=pip_index_url, - resources=resources_per_trial, - env=env, - env_from=env_from, + # Wrap objective function to execute it from the file. 
For example: + # def objective(parameters): + # print(f'Parameters are {parameters}') + # objective({ + # 'lr': '${trialParameters.lr}', + # 'epochs': '${trialParameters.epochs}', + # 'is_dist': False + # }) + objective_code = f"{objective_code}\n{objective.__name__}({input_params})\n" + + # Prepare execute script template. + exec_script = textwrap.dedent( + """ + program_path=$(mktemp -d) + read -r -d '' SCRIPT << EOM\n + {objective_code} + EOM + printf "%s" "$SCRIPT" > $program_path/ephemeral_objective.py + python3 -u $program_path/ephemeral_objective.py""" ) - pod_spec = utils.get_pod_template_spec( - containers=[container_spec], - restart_policy="Never", + # Add objective code to the execute script. + exec_script = exec_script.format(objective_code=objective_code) + + # Install Python packages if that is required. + if packages_to_install is not None: + exec_script = ( + utils.get_script_for_python_packages( + packages_to_install, pip_index_url + ) + + exec_script + ) + + if isinstance(resources_per_trial, dict): + if "gpu" in resources_per_trial: + resources_per_trial["nvidia.com/gpu"] = resources_per_trial.pop( + "gpu" + ) + + resources_per_trial = client.V1ResourceRequirements( + requests=resources_per_trial, + limits=resources_per_trial, + ) + + env = [] + env_from = [] + if isinstance(env_per_trial, dict): + env = [ + client.V1EnvVar(name=str(k), value=str(v)) + for k, v in env_per_trial.items() + ] + elif env_per_trial: + for x in env_per_trial: + if isinstance(x, client.V1EnvVar): + env.append(x) + elif isinstance(x, client.V1EnvFromSource): + env_from.append(x) + else: + raise ValueError( + f"Incorrect value for env_per_trial: {env_per_trial}" + ) + + # Add metrics collector to the Katib Experiment. + # Up to now, we only support parameter `kind`, of which default value + # is `StdOut`, to specify the kind of metrics collector. 
+ experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( + collector=models.V1beta1CollectorSpec( + kind=metrics_collector_config["kind"] + ) + ) + + # Create Trial specification. + trial_spec = client.V1Job( + api_version="batch/v1", + kind="Job", + spec=client.V1JobSpec( + template=client.V1PodTemplateSpec( + metadata=models.V1ObjectMeta( + annotations={"sidecar.istio.io/inject": "false"} + ), + spec=client.V1PodSpec( + restart_policy="Never", + containers=[ + client.V1Container( + name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, + image=base_image, + command=["bash", "-c"], + args=[exec_script], + env=env if env else None, + env_from=env_from if env_from else None, + resources=resources_per_trial, + ) + ], + ), + ) + ), + ) + + # Create Trial template. + trial_template = models.V1beta1TrialTemplate( + primary_container_name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, + retain=retain_trials, + trial_parameters=trial_params, + trial_spec=trial_spec, ) # If users choose to use external models and datasets. @@ -495,34 +582,28 @@ class name in this argument. HuggingFaceModelParams, ) from kubeflow.storage_initializer.s3 import S3DatasetParams + from kubeflow.training import models as training_models from kubeflow.training.constants.constants import ( + JOB_PARAMETERS, + PYTORCHJOB_KIND, STORAGE_INITIALIZER, STORAGE_INITIALIZER_IMAGE, STORAGE_INITIALIZER_VOLUME_MOUNT, TRAINER_TRANSFORMER_IMAGE, ) + from kubeflow.training.utils import utils as training_utils except ImportError: raise ImportError( "LLM dependencies for Tune API are not installed. " + "Run: pip install -U 'kubeflow-katib[huggingface]' " ) - # Add metrics format for the metrics collector. - experiment.spec.metrics_collector_spec.source=models.V1beta1SourceSpec( - filter=models.V1beta1FilterSpec( - metrics_format=[ - # For example: 'train_loss':0.846 - r"'([\w|-]+)'\s*:\s*([+-]?\d*(\.\d+)?([Ee][+-]?\d+)?)", - ] - ) - ) - # Create PVC for the Storage Initializer. 
# TODO (helenxie-bit): PVC Creation should be part of Katib Controller. try: self.core_api.create_namespaced_persistent_volume_claim( namespace=namespace, - body=utils.get_pvc_spec( + body=training_utils.get_pvc_spec( pvc_name=name, namespace=namespace, storage_config=storage_config, @@ -559,64 +640,18 @@ class name in this argument. "or HuggingFaceDatasetParams." ) - # Iterate over input parameters. + # Iterate over input parameters and do substitutions. experiment_params = [] trial_params = [] - training_args = trainer_parameters.training_parameters - for ( - p_name, - p_value, - ) in trainer_parameters.training_parameters.to_dict().items(): - if not hasattr(training_args, p_name): - logger.warning( - f"Training parameter {p_name} is not supported by the current transformer." - ) - continue - if isinstance(p_value, models.V1beta1ParameterSpec): - old_attr = getattr(training_args, p_name, None) - if old_attr is not None: - value = f"${{trialParameters.{p_name}}}" - setattr(training_args, p_name, value) - p_value.name = p_name - experiment_params.append(p_value) - trial_params.append( - models.V1beta1TrialParameterSpec(name=p_name, reference=p_name) - ) - elif p_value is not None: - old_attr = getattr(training_args, p_name, None) - if old_attr is not None: - if isinstance(p_value, dict): - # Update the existing dictionary without nesting - value = copy.deepcopy(p_value) - else: - value = type(old_attr)(p_value) - setattr(training_args, p_name, value) - - lora_config = trainer_parameters.lora_config - for p_name, p_value in trainer_parameters.lora_config.__dict__.items(): - if not hasattr(lora_config, p_name): - logger.warning( - f"Training parameter {p_name} is not supported by the current peft." 
- ) - continue - if isinstance(p_value, models.V1beta1ParameterSpec): - old_attr = getattr(lora_config, p_name, None) - if old_attr is not None: - value = f"${{trialParameters.{p_name}}}" - setattr(lora_config, p_name, value) - p_value.name = p_name - experiment_params.append(p_value) - trial_params.append( - models.V1beta1TrialParameterSpec(name=p_name, reference=p_name) - ) - elif p_value is not None: - old_attr = getattr(lora_config, p_name, None) - if old_attr is not None: - value = type(old_attr)(p_value) - setattr(lora_config, p_name, value) + training_args = utils.parameter_substitution( + trainer_parameters.training_parameters, experiment_params, trial_params + ) + lora_config = utils.parameter_substitution( + trainer_parameters.lora_config, experiment_params, trial_params + ) - init_container_spec = utils.get_container_spec( + init_container_spec = training_utils.get_container_spec( name=STORAGE_INITIALIZER, base_image=STORAGE_INITIALIZER_IMAGE, args=[ @@ -634,11 +669,8 @@ class name in this argument. volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], ) - lora_config = json.dumps(lora_config.__dict__, cls=utils.SetEncoder) - training_args = json.dumps(training_args.to_dict()) - - container_spec = utils.get_container_spec( - name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, + container_spec = training_utils.get_container_spec( + name=JOB_PARAMETERS[PYTORCHJOB_KIND]["container"], base_image=TRAINER_TRANSFORMER_IMAGE, args=[ "--model_uri", @@ -655,7 +687,7 @@ class name in this argument. f"'{training_args}'", ], volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], - resources=resources_per_trial, + resources=resources_per_trial.resources_per_worker, ) storage_initializer_volume = models.V1Volume( @@ -665,29 +697,74 @@ class name in this argument. 
), ) - pod_spec = utils.get_pod_template_spec( + # create worker pod spec + worker_pod_template_spec = training_utils.get_pod_template_spec( + containers=[container_spec], + volumes=[storage_initializer_volume], + ) + + # create master pod spec + master_pod_template_spec = training_utils.get_pod_template_spec( containers=[container_spec], init_containers=[init_container_spec], volumes=[storage_initializer_volume], - restart_policy="Never", ) - # Create Trial specification. - trial_spec = client.V1Job( - api_version="batch/v1", - kind="Job", - spec=client.V1JobSpec( - template=pod_spec, - ), - ) + # Create pytorchjob. + pytorchjob = training_models.KubeflowOrgV1PyTorchJob( + api_version="kubeflow.org/v1", + kind="PyTorchJob", + spec=training_models.KubeflowOrgV1PyTorchJobSpec( + run_policy=training_models.KubeflowOrgV1RunPolicy( + clean_pod_policy=None + ), + pytorch_replica_specs={}, + ), + ) - # Create Trial template. - trial_template = models.V1beta1TrialTemplate( - primary_container_name=constants.DEFAULT_PRIMARY_CONTAINER_NAME, - retain=retain_trials, - trial_parameters=trial_params, - trial_spec=trial_spec, - ) + if resources_per_trial.num_procs_per_worker: + pytorchjob.spec.nproc_per_node = str( + resources_per_trial.num_procs_per_worker + ) + + pytorchjob.spec.pytorch_replica_specs["Master"] = ( + training_models.KubeflowOrgV1ReplicaSpec( + replicas=1, + template=master_pod_template_spec, + ) + ) + + if resources_per_trial.num_workers > 1: + pytorchjob.spec.pytorch_replica_specs["Worker"] = ( + training_models.KubeflowOrgV1ReplicaSpec( + replicas=resources_per_trial.num_workers - 1, + template=worker_pod_template_spec, + ) + ) + + # Add metrics collector to the Katib Experiment. + # Specify metrics format for the collector. 
+ experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( + collector=models.V1beta1CollectorSpec( + kind=metrics_collector_config["kind"] + ), + source=models.V1beta1SourceSpec( + filter=models.V1beta1FilterSpec( + metrics_format=[ + # For example: 'train_loss':0.846 + r"'([\w|-]+)'\s*:\s*([+-]?\d*(\.\d+)?([Ee][+-]?\d+)?)", + ] + ) + ), + ) + + # Create Trial template. + trial_template = models.V1beta1TrialTemplate( + primary_container_name=JOB_PARAMETERS[PYTORCHJOB_KIND]["container"], + retain=retain_trials, + trial_parameters=trial_params, + trial_spec=pytorchjob, + ) # Add parameters to the Katib Experiment. experiment.spec.parameters = experiment_params diff --git a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py index e743dae8816..3e2dc4459e3 100644 --- a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py +++ b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py @@ -12,15 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy import inspect import json +import logging import os import textwrap -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, List, Union from kubeflow.katib import models from kubeflow.katib.constants import constants +logger = logging.getLogger(__name__) + def is_running_in_k8s(): return os.path.isdir("/var/run/secrets/kubernetes.io/") @@ -130,176 +134,6 @@ def __init__(self, obj): self.data = json.dumps(obj) -def get_command_using_train_func( - train_func: Optional[Callable], - train_func_parameters: Optional[Dict[str, Any]] = None, - packages_to_install: Optional[List[str]] = None, - pip_index_url: str = "https://pypi.org/simple", -) -> Tuple[List[str], List[str]]: - """ - Get container args and command from the given training function and parameters. - """ - # Check if function is callable. 
- if not callable(train_func): - raise ValueError( - f"Training function must be callable, got function type: {type(train_func)}" - ) - - # Extract function implementation. - func_code = inspect.getsource(train_func) - - # Function might be defined in some indented scope (e.g. in another function). - # We need to dedent the function code. - func_code = textwrap.dedent(func_code) - - # Wrap function code to execute it from the file. For example: - # def train(parameters): - # print('Start Training...') - # train({'lr': 0.01}) - if train_func_parameters is None: - func_code = f"{func_code}\n{train_func.__name__}()\n" - else: - func_code = f"{func_code}\n{train_func.__name__}({train_func_parameters})\n" - - # Prepare execute script template. - exec_script = textwrap.dedent( - """ - program_path=$(mktemp -d) - read -r -d '' SCRIPT << EOM\n - {func_code} - EOM - printf "%s" \"$SCRIPT\" > \"$program_path/ephemeral_script.py\" - python3 -u \"$program_path/ephemeral_script.py\"""" - ) - - # Add function code to the execute script. - exec_script = exec_script.format(func_code=func_code) - - # Install Python packages if that is required. - if packages_to_install is not None: - exec_script = ( - get_script_for_python_packages(packages_to_install, pip_index_url) - + exec_script - ) - - # Return container command and args to execute training function. 
- return ["bash", "-c"], [exec_script] - - -def get_container_spec( - name: str, - base_image: str, - train_func: Optional[Callable] = None, - train_func_parameters: Optional[Dict[str, Any]] = None, - packages_to_install: Optional[List[str]] = None, - pip_index_url: str = "https://pypi.org/simple", - args: Optional[List[str]] = None, - resources: Union[dict, models.V1ResourceRequirements, None] = None, - volume_mounts: Optional[List[models.V1VolumeMount]] = None, - env: Optional[List[models.V1EnvVar]] = None, - env_from: Optional[List[models.V1EnvFromSource]] = None, -) -> models.V1Container: - """ - Get container spec for the given parameters. - """ - - if name is None or base_image is None: - raise ValueError("Container name or base image cannot be none") - - # Create initial container spec. - container_spec = models.V1Container( - name=name, image=base_image, args=args, volume_mounts=volume_mounts - ) - - # If training function is set, override container command and args to execute the function. - if train_func is not None: - container_spec.command, container_spec.args = get_command_using_train_func( - train_func=train_func, - train_func_parameters=train_func_parameters, - packages_to_install=packages_to_install, - pip_index_url=pip_index_url, - ) - - # Convert dict to the Kubernetes container resources if that is required. - if isinstance(resources, dict): - # Convert all keys in resources to lowercase. - resources = {k.lower(): v for k, v in resources.items()} - if "gpu" in resources: - resources["nvidia.com/gpu"] = resources.pop("gpu") - - resources = models.V1ResourceRequirements( - requests=resources, - limits=resources, - ) - - # Add resources to the container spec. - container_spec.resources = resources - - # Add environment variables to the container spec. 
- if env: - container_spec.env = env - if env_from: - container_spec.env_from = env_from - - return container_spec - - -def get_pod_template_spec( - containers: List[models.V1Container], - init_containers: Optional[List[models.V1Container]] = None, - volumes: Optional[List[models.V1Volume]] = None, - restart_policy: Optional[str] = None, -) -> models.V1PodTemplateSpec: - """ - Get Pod template spec for the given parameters. - """ - - # Create Pod template spec. If the value is None, Pod doesn't have that parameter - pod_template_spec = models.V1PodTemplateSpec( - metadata=models.V1ObjectMeta(annotations={"sidecar.istio.io/inject": "false"}), - spec=models.V1PodSpec( - init_containers=init_containers, - containers=containers, - volumes=volumes, - restart_policy=restart_policy, - ), - ) - - return pod_template_spec - - -def get_pvc_spec( - pvc_name: str, - namespace: str, - storage_config: Dict[str, Optional[Union[str, List[str]]]], -): - if pvc_name is None or namespace is None: - raise ValueError("One of the required storage config argument is None") - - if "size" not in storage_config: - storage_config["size"] = constants.PVC_DEFAULT_SIZE - - if "access_modes" not in storage_config: - storage_config["access_modes"] = constants.PVC_DEFAULT_ACCESS_MODES - - pvc_spec = models.V1PersistentVolumeClaim( - api_version="v1", - kind="PersistentVolumeClaim", - metadata={"name": pvc_name, "namespace": namespace}, - spec=models.V1PersistentVolumeClaimSpec( - access_modes=storage_config["access_modes"], - resources=models.V1ResourceRequirements( - requests={"storage": storage_config["size"]} - ), - ), - ) - - if "storage_class" in storage_config: - pvc_spec.spec.storage_class_name = storage_config["storage_class"] - - return pvc_spec - - class SetEncoder(json.JSONEncoder): def default(self, obj): if isinstance(obj, set): @@ -307,3 +141,49 @@ def default(self, obj): if isinstance(obj, type): return obj.__name__ return json.JSONEncoder.default(self, obj) + + +def 
parameter_substitution( + parameters: Union["TrainingArguments", "LoraConfig"], # noqa: F821 + experiment_params: List[models.V1beta1ParameterSpec], + trial_params: List[models.V1beta1TrialParameterSpec], +): + from peft import LoraConfig # noqa: F401 + from transformers import TrainingArguments # noqa: F401 + + if isinstance(parameters, TrainingArguments): + parameters_dict = parameters.to_dict() + else: + parameters_dict = parameters.__dict__ + + for p_name, p_value in parameters_dict.items(): + if not hasattr(parameters, p_name): + logger.warning(f"Training parameter {p_name} is not supported.") + continue + + if isinstance(p_value, models.V1beta1ParameterSpec): + old_attr = getattr(parameters, p_name, None) + if old_attr is not None: + value = f"${{trialParameters.{p_name}}}" + setattr(parameters, p_name, value) + p_value.name = p_name + experiment_params.append(p_value) + trial_params.append( + models.V1beta1TrialParameterSpec(name=p_name, reference=p_name) + ) + elif p_value is not None: + old_attr = getattr(parameters, p_name, None) + if old_attr is not None: + if isinstance(p_value, dict): + # Update the existing dictionary without nesting + value = copy.deepcopy(p_value) + else: + value = type(old_attr)(p_value) + setattr(parameters, p_name, value) + + if isinstance(parameters, TrainingArguments): + parameters = json.dumps(parameters.to_dict()) + else: + parameters = json.dumps(parameters.__dict__, cls=SetEncoder) + + return parameters From 3bc3d87d8a9bc7e28d083672358f455f2e529301 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 22 Aug 2024 05:31:15 +0800 Subject: [PATCH 50/84] add types.TrainerResources class Signed-off-by: helenxie-bit --- .../v1beta1/kubeflow/katib/types/__init__.py | 7 + .../kubeflow/katib/types/trainer_resources.py | 135 ++++++++++++++++++ 2 files changed, 142 insertions(+) create mode 100644 sdk/python/v1beta1/kubeflow/katib/types/__init__.py create mode 100644 sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py diff 
--git a/sdk/python/v1beta1/kubeflow/katib/types/__init__.py b/sdk/python/v1beta1/kubeflow/katib/types/__init__.py new file mode 100644 index 00000000000..73f1110cd5a --- /dev/null +++ b/sdk/python/v1beta1/kubeflow/katib/types/__init__.py @@ -0,0 +1,7 @@ +from __future__ import absolute_import + +# Import types into type package. +from kubeflow.katib.types.trainer_resources import TrainerResources + +# Import Kubernetes models. +from kubernetes.client import * diff --git a/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py b/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py new file mode 100644 index 00000000000..2789afc5485 --- /dev/null +++ b/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py @@ -0,0 +1,135 @@ +import pprint + +import six +from kubeflow.katib.configuration import Configuration + + +class TrainerResources(object): + def __init__( + self, + num_workers=None, + num_procs_per_worker=None, + resources_per_worker=None, + local_vars_configuration=None, + ): + if local_vars_configuration is None: + local_vars_configuration = Configuration() + self.local_vars_configuration = local_vars_configuration + + self._num_workers = None + self._num_procs_per_worker = None + self._resources_per_worker = None + + if num_workers is not None: + self.num_workers = num_workers + if num_procs_per_worker is not None: + self.num_procs_per_worker = num_procs_per_worker + if resources_per_worker is not None: + self.resources_per_worker = resources_per_worker + + @property + def num_workers(self): + """Gets the number of workers of distributed training. + Number of workers is setting number of workers. + :return: The number of workers of distributed training. + :rtype: int + """ + return self._num_workers + + @num_workers.setter + def num_workers(self, num_workers): + """Sets the number of workers of distributed training. + Number of workers is setting number of workers. + :param num_workers: The number of workers of distributed training. 
+ :type: int + """ + + self._num_workers = num_workers + + @property + def num_procs_per_worker(self): + """Gets the number of processes per worker of distributed training. + Number of processes per worker is the setting number of processes per worker. + :return: The number of processed per worker of distributed training. + :rtype: int + """ + return self._num_procs_per_worker + + @num_procs_per_worker.setter + def num_procs_per_worker(self, num_procs_per_worker): + """Sets the number of processes per worker of distributed training. + Number of processes per worker is the setting number of processes per worker. + :param num_procs_per_worker: The number of processes per worker of distributed training. + :type: int + """ + + self._num_procs_per_worker = num_procs_per_worker + + @property + def resources_per_worker(self): + """Gets the resources per worker of distributed training. + Resources per worker is the setting resources per worker. + :return: The resources per worker of distributed training. + :rtype: dict or V1ResourceRequirements + """ + return self._resources_per_worker + + @resources_per_worker.setter + def resources_per_worker(self, resources_per_worker): + """Sets the resources per worker of distributed training. + Resources per worker is the setting resources per worker. + :param resources_per_worker: The resources per worker of distributed training. 
+ :type: dict or V1ResourceRequirements + """ + + self._resources_per_worker = resources_per_worker + + def to_dict(self): + """Returns the resources properties as a dict""" + result = {} + + for attr, _ in six.iteritems(self.__dict__): + value = getattr(self, attr) + if isinstance(value, list): + result[attr] = list( + map(lambda x: x.to_dict() if hasattr(x, "to_dict") else x, value) + ) + elif hasattr(value, "to_dict"): + result[attr] = value.to_dict() + elif isinstance(value, dict): + result[attr] = dict( + map( + lambda item: ( + (item[0], item[1].to_dict()) + if hasattr(item[1], "to_dict") + else item + ), + value.items(), + ) + ) + else: + result[attr] = value + + return result + + def to_str(self): + """Returns the string representation of the model""" + return pprint.pformat(self.to_dict()) + + def __repr__(self): + """For `print` and `pprint`""" + return self.to_str() + + def __eq__(self, other): + """Returns true if both objects are equal""" + if not isinstance(other, TrainerResources): + return False + + return self.to_dict() == other.to_dict() + + def __ne__(self, other): + """Returns true if both objects are not equal""" + if not isinstance(other, TrainerResources): + return True + + return self.to_dict() != other.to_dict() From 4f6fc35d7d14a0812f3b9490d04177f39cd18daf Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 22 Aug 2024 05:35:41 +0800 Subject: [PATCH 51/84] fix flake8 error Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/types/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sdk/python/v1beta1/kubeflow/katib/types/__init__.py b/sdk/python/v1beta1/kubeflow/katib/types/__init__.py index 73f1110cd5a..f837d705039 100644 --- a/sdk/python/v1beta1/kubeflow/katib/types/__init__.py +++ b/sdk/python/v1beta1/kubeflow/katib/types/__init__.py @@ -1,3 +1,8 @@ +# coding: utf-8 + +# flake8: noqa + + from __future__ import absolute_import # Import types into type package. 
From 038aeda6a76fdc5d543dba5fe165223e02d10364 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 22 Aug 2024 16:45:38 +0800 Subject: [PATCH 52/84] rerun tests Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 497b8cd9723..6ef37ca828b 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -743,7 +743,7 @@ class name in this argument. ) # Add metrics collector to the Katib Experiment. - # Specify metrics format for the collector. + # Specify metrics format for the collector, for example: 'train_loss':0.846 experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( collector=models.V1beta1CollectorSpec( kind=metrics_collector_config["kind"] @@ -751,7 +751,6 @@ class name in this argument. source=models.V1beta1SourceSpec( filter=models.V1beta1FilterSpec( metrics_format=[ - # For example: 'train_loss':0.846 r"'([\w|-]+)'\s*:\s*([+-]?\d*(\.\d+)?([Ee][+-]?\d+)?)", ] ) From 62a668219bdcb82f82d68de6c120f6f2f2a57736 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 23 Aug 2024 08:15:40 +0800 Subject: [PATCH 53/84] rerun tests Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 6ef37ca828b..824d58a0aa8 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -743,7 +743,7 @@ class name in this argument. ) # Add metrics collector to the Katib Experiment. - # Specify metrics format for the collector, for example: 'train_loss':0.846 + # Specify metrics format for the collector. 
For example: 'train_loss':0.846 experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( collector=models.V1beta1CollectorSpec( kind=metrics_collector_config["kind"] From d7dd567adafc2fb1ce9b926b36f5c9895f7ddd05 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 23 Aug 2024 08:18:25 +0800 Subject: [PATCH 54/84] resolve conflict Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 824d58a0aa8..ee6e6997bd6 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -20,6 +20,7 @@ import time from typing import Any, Callable, Dict, List, Optional, Union +import grpc import kubeflow.katib.katib_api_pb2 as katib_api_pb2 from kubeflow.katib import models, types from kubeflow.katib.api_client import ApiClient @@ -27,8 +28,6 @@ from kubeflow.katib.utils import utils from kubernetes import client, config -import grpc - logger = logging.getLogger(__name__) From fe390514efcc6657fc586b607d06c10dfb186d64 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 23 Aug 2024 21:17:16 +0800 Subject: [PATCH 55/84] rerun tests Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index bb4671c5345..d46d6a18946 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -743,7 +743,7 @@ class name in this argument. ) # Add metrics collector to the Katib Experiment. - # Specify metrics format for the collector. 
For example: 'train_loss':0.846 + # Specify metrics format for the collector, for example: 'train_loss':0.846 experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( collector=models.V1beta1CollectorSpec( kind=metrics_collector_config["kind"] From d20ea35f8ba590335383699c5c905e59cfc32990 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 23 Aug 2024 21:26:55 +0800 Subject: [PATCH 56/84] rerun tests Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index d46d6a18946..bb4671c5345 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -743,7 +743,7 @@ class name in this argument. ) # Add metrics collector to the Katib Experiment. - # Specify metrics format for the collector, for example: 'train_loss':0.846 + # Specify metrics format for the collector. For example: 'train_loss':0.846 experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( collector=models.V1beta1CollectorSpec( kind=metrics_collector_config["kind"] From ef27bf6fac4760927aec5ae8a614376c11a86ded Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 23 Aug 2024 21:29:45 +0800 Subject: [PATCH 57/84] rerun tests Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index bb4671c5345..d46d6a18946 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -743,7 +743,7 @@ class name in this argument. ) # Add metrics collector to the Katib Experiment. - # Specify metrics format for the collector. 
For example: 'train_loss':0.846 + # Specify metrics format for the collector, for example: 'train_loss':0.846 experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( collector=models.V1beta1CollectorSpec( kind=metrics_collector_config["kind"] From 466ca394de17329996346b1ead48afd8c51793ba Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 23 Aug 2024 21:36:39 +0800 Subject: [PATCH 58/84] rerun tests Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index d46d6a18946..bb4671c5345 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -743,7 +743,7 @@ class name in this argument. ) # Add metrics collector to the Katib Experiment. - # Specify metrics format for the collector, for example: 'train_loss':0.846 + # Specify metrics format for the collector. For example: 'train_loss':0.846 experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( collector=models.V1beta1CollectorSpec( kind=metrics_collector_config["kind"] From 741df8a92511b836fafed639ef4fa9855ba2a17e Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 23 Aug 2024 21:58:45 +0800 Subject: [PATCH 59/84] rerun tests Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index bb4671c5345..d46d6a18946 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -743,7 +743,7 @@ class name in this argument. ) # Add metrics collector to the Katib Experiment. - # Specify metrics format for the collector. 
For example: 'train_loss':0.846 + # Specify metrics format for the collector, for example: 'train_loss':0.846 experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( collector=models.V1beta1CollectorSpec( kind=metrics_collector_config["kind"] From e131636716158963a9d9b74615e9d3a13769aa2b Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 24 Aug 2024 06:56:55 +0800 Subject: [PATCH 60/84] rerun tests Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index d46d6a18946..bb4671c5345 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -743,7 +743,7 @@ class name in this argument. ) # Add metrics collector to the Katib Experiment. - # Specify metrics format for the collector, for example: 'train_loss':0.846 + # Specify metrics format for the collector. For example: 'train_loss':0.846 experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( collector=models.V1beta1CollectorSpec( kind=metrics_collector_config["kind"] From fe1348f49406e6ba98a25e1eba400b52a88f2502 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 24 Aug 2024 07:18:43 +0800 Subject: [PATCH 61/84] rerun tests Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index bb4671c5345..d46d6a18946 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -743,7 +743,7 @@ class name in this argument. ) # Add metrics collector to the Katib Experiment. - # Specify metrics format for the collector. 
For example: 'train_loss':0.846 + # Specify metrics format for the collector, for example: 'train_loss':0.846 experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( collector=models.V1beta1CollectorSpec( kind=metrics_collector_config["kind"] From 2484e491a1473c707a9bf1072de743f11e52bdcc Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 24 Aug 2024 07:20:47 +0800 Subject: [PATCH 62/84] rerun tests Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index d46d6a18946..bb4671c5345 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -743,7 +743,7 @@ class name in this argument. ) # Add metrics collector to the Katib Experiment. - # Specify metrics format for the collector, for example: 'train_loss':0.846 + # Specify metrics format for the collector. For example: 'train_loss':0.846 experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( collector=models.V1beta1CollectorSpec( kind=metrics_collector_config["kind"] From f0453b07a1b3d903c9e54b5e49a14118a14bad2a Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 24 Aug 2024 07:29:12 +0800 Subject: [PATCH 63/84] rerun tests Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index bb4671c5345..d46d6a18946 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -743,7 +743,7 @@ class name in this argument. ) # Add metrics collector to the Katib Experiment. - # Specify metrics format for the collector. 
For example: 'train_loss':0.846 + # Specify metrics format for the collector, for example: 'train_loss':0.846 experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( collector=models.V1beta1CollectorSpec( kind=metrics_collector_config["kind"] From 64ccbc7f2f13f51c440a924ce3d5b16e89f167c9 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 24 Aug 2024 07:35:34 +0800 Subject: [PATCH 64/84] rerun tests Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index d46d6a18946..bb4671c5345 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -743,7 +743,7 @@ class name in this argument. ) # Add metrics collector to the Katib Experiment. - # Specify metrics format for the collector, for example: 'train_loss':0.846 + # Specify metrics format for the collector. 
For example: 'train_loss':0.846 experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( collector=models.V1beta1CollectorSpec( kind=metrics_collector_config["kind"] From 1ad05e6863403a9faebfe752b4926704ba8f091c Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 28 Aug 2024 07:16:36 +0800 Subject: [PATCH 65/84] delete properties of 'TrainerResources' Signed-off-by: helenxie-bit --- .../kubeflow/katib/types/trainer_resources.py | 107 ------------------ 1 file changed, 107 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py b/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py index 2789afc5485..1c32f8061eb 100644 --- a/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py +++ b/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py @@ -26,110 +26,3 @@ def __init__( self.num_procs_per_worker = num_procs_per_worker if resources_per_worker is not None: self.resources_per_worker = resources_per_worker - - @property - def num_workers(self): - """Gets the number of workers of distributed training. - Number of workers is setting number of workers. - :return: The number of workers of distributed training. - :rtype: int - """ - return self._num_workers - - @num_workers.setter - def num_workers(self, num_workers): - """Sets the number of workers of distributed training. - Number of workers is setting number of workers. - :param num_workers: The number of workers of distributed training. - :type: int - """ - - self._num_workers = num_workers - - @property - def num_procs_per_worker(self): - """Gets the number of processes per worker of distributed training. - Number of processes per worker is the setting number of processes per worker. - :return: The number of processed per worker of distributed training. 
- :rtype: int - """ - return self._num_procs_per_worker - - @num_procs_per_worker.setter - def num_procs_per_worker(self, num_procs_per_worker): - """Sets the number of processes per worker of distributed training. - Number of processes per worker is the setting number of processes per worker. - :param num_procs_per_worker: The number of processes per worker of distributed training. - :type: int - """ - - self._num_procs_per_worker = num_procs_per_worker - - @property - def resources_per_worker(self): - """Gets the resources per worker of distributed training. - Resources per worker is the setting resources per worker. - :return: The resources per worker of distributed training. - :rtype: dict or V1ResourceRequirements - """ - return self._resources_per_worker - - @resources_per_worker.setter - def resources_per_worker(self, resources_per_worker): - """Sets the resources per worker of distributed training. - Resources per worker is the setting resources per worker. - :param resources_per_worker: The resources per worker of distributed training. 
- :type: dict or V1ResourceRequirements - """ - - self._resources_per_worker = resources_per_worker - - def to_dict(self): - """Returns the resources properties as a dict""" - result = {} - - for attr, _ in six.iteritems(self.__dict__): - value = getattr(self, attr) - if isinstance(value, list): - result[attr] = list( - map(lambda x: x.to_dict() if hasattr(x, "to_dict") else x, value) - ) - elif hasattr(value, "to_dict"): - result[attr] = value.to_dict() - elif isinstance(value, dict): - result[attr] = dict( - map( - lambda item: ( - (item[0], item[1].to_dict()) - if hasattr(item[1], "to_dict") - else item - ), - value.items(), - ) - ) - else: - result[attr] = value - - return result - - def to_str(self): - """Returns the string representation of the model""" - return pprint.pformat(self.to_dict()) - - def __repr__(self): - """For `print` and `pprint`""" - return self.to_str() - - def __eq__(self, other): - """Returns true if both objects are equal""" - if not isinstance(other, TrainerResources): - return False - - return self.to_dict() == other.to_dict() - - def __ne__(self, other): - """Returns true if both objects are not equal""" - if not isinstance(other, TrainerResources): - return True - - return self.to_dict() != other.to_dict() From 1b054ac5462e70dc61f8b19f227d55efcd7b2839 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 28 Aug 2024 07:19:57 +0800 Subject: [PATCH 66/84] fix format error Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py b/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py index 1c32f8061eb..6a802467226 100644 --- a/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py +++ b/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py @@ -1,6 +1,3 @@ -import pprint - -import six from kubeflow.katib.configuration import Configuration From 
5394113dbc2b2a31bf62fd7f935ca68a71a23645 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 29 Aug 2024 09:12:19 +0800 Subject: [PATCH 67/84] update types Signed-off-by: helenxie-bit --- hack/gen-python-sdk/post_gen.py | 2 ++ .../v1beta1/kubeflow/katib/api/katib_client.py | 8 +++++--- sdk/python/v1beta1/kubeflow/katib/types/__init__.py | 12 ------------ 3 files changed, 7 insertions(+), 15 deletions(-) delete mode 100644 sdk/python/v1beta1/kubeflow/katib/types/__init__.py diff --git a/hack/gen-python-sdk/post_gen.py b/hack/gen-python-sdk/post_gen.py index 70eab3a2595..c71de8642d6 100644 --- a/hack/gen-python-sdk/post_gen.py +++ b/hack/gen-python-sdk/post_gen.py @@ -41,6 +41,8 @@ def _rewrite_helper(input_file, output_file, rewrite_rules): if output_file == "sdk/python/v1beta1/kubeflow/katib/__init__.py": lines.append("# Import Katib API client.\n") lines.append("from kubeflow.katib.api.katib_client import KatibClient\n") + lines.append("# Import Katib TrainerResources class.\n") + lines.append("from kubeflow.katib.types.trainer_resources import TrainerResources\n") lines.append("# Import Katib report metrics functions\n") lines.append("from kubeflow.katib.api.report_metrics import report_metrics\n") lines.append("# Import Katib helper functions.\n") diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index bb4671c5345..04bcc50e858 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -20,15 +20,17 @@ import time from typing import Any, Callable, Dict, List, Optional, Union -import grpc import kubeflow.katib.katib_api_pb2 as katib_api_pb2 import kubeflow.katib.katib_api_pb2_grpc as katib_api_pb2_grpc -from kubeflow.katib import models, types +from kubeflow.katib import models from kubeflow.katib.api_client import ApiClient from kubeflow.katib.constants import constants +from kubeflow.katib.types.trainer_resources 
import TrainerResources from kubeflow.katib.utils import utils from kubernetes import client, config +import grpc + logger = logging.getLogger(__name__) @@ -197,7 +199,7 @@ def tune( parallel_trial_count: int = None, max_failed_trial_count: int = None, resources_per_trial: Union[ - dict, client.V1ResourceRequirements, types.TrainerResources, None + dict, client.V1ResourceRequirements, TrainerResources, None ] = None, retain_trials: bool = False, packages_to_install: List[str] = None, diff --git a/sdk/python/v1beta1/kubeflow/katib/types/__init__.py b/sdk/python/v1beta1/kubeflow/katib/types/__init__.py deleted file mode 100644 index f837d705039..00000000000 --- a/sdk/python/v1beta1/kubeflow/katib/types/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -# coding: utf-8 - -# flake8: noqa - - -from __future__ import absolute_import - -# Import types into type package. -from kubeflow.katib.types.trainer_resources import TrainerResources - -# Import Kubernetes models. -from kubernetes.client import * From dc3a104a7bc6016443acc7ef6f84e18f9c53b50a Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 29 Aug 2024 09:15:50 +0800 Subject: [PATCH 68/84] fix format Signed-off-by: helenxie-bit --- hack/gen-python-sdk/post_gen.py | 4 +++- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 3 +-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/hack/gen-python-sdk/post_gen.py b/hack/gen-python-sdk/post_gen.py index c71de8642d6..1803bb20430 100644 --- a/hack/gen-python-sdk/post_gen.py +++ b/hack/gen-python-sdk/post_gen.py @@ -42,7 +42,9 @@ def _rewrite_helper(input_file, output_file, rewrite_rules): lines.append("# Import Katib API client.\n") lines.append("from kubeflow.katib.api.katib_client import KatibClient\n") lines.append("# Import Katib TrainerResources class.\n") - lines.append("from kubeflow.katib.types.trainer_resources import TrainerResources\n") + lines.append( + "from kubeflow.katib.types.trainer_resources import TrainerResources\n" + ) lines.append("# 
Import Katib report metrics functions\n") lines.append("from kubeflow.katib.api.report_metrics import report_metrics\n") lines.append("# Import Katib helper functions.\n") diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 04bcc50e858..cc610ed6149 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -20,6 +20,7 @@ import time from typing import Any, Callable, Dict, List, Optional, Union +import grpc import kubeflow.katib.katib_api_pb2 as katib_api_pb2 import kubeflow.katib.katib_api_pb2_grpc as katib_api_pb2_grpc from kubeflow.katib import models @@ -29,8 +30,6 @@ from kubeflow.katib.utils import utils from kubernetes import client, config -import grpc - logger = logging.getLogger(__name__) From dc007b17a3a23be6c24cea72ea80bb0058a7d1c8 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 29 Aug 2024 09:34:46 +0800 Subject: [PATCH 69/84] add import of 'TrainerResources' in '__init__.py' of katib Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sdk/python/v1beta1/kubeflow/katib/__init__.py b/sdk/python/v1beta1/kubeflow/katib/__init__.py index 7aef4c9897d..bafe7befea3 100644 --- a/sdk/python/v1beta1/kubeflow/katib/__init__.py +++ b/sdk/python/v1beta1/kubeflow/katib/__init__.py @@ -71,6 +71,8 @@ # Import Katib API client. from kubeflow.katib.api.katib_client import KatibClient +# Import Katib TrainerResources class. +from kubeflow.katib.types.trainer_resources import TrainerResources # Import Katib report metrics functions from kubeflow.katib.api.report_metrics import report_metrics # Import Katib helper functions. 
From 3d7c9c2c2bd183c74e95265feaa7e3a22324da50 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 29 Aug 2024 12:51:57 +0800 Subject: [PATCH 70/84] rerun tests Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index cc610ed6149..67acb6bcd71 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -744,7 +744,7 @@ class name in this argument. ) # Add metrics collector to the Katib Experiment. - # Specify metrics format for the collector. For example: 'train_loss':0.846 + # Specify metrics format for the collector, for example: 'train_loss':0.846 experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( collector=models.V1beta1CollectorSpec( kind=metrics_collector_config["kind"] From 96db205ce46e945181cbce23dd0f15c4858913d4 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 29 Aug 2024 15:55:52 +0800 Subject: [PATCH 71/84] revert changes and rerun tests Signed-off-by: helenxie-bit --- hack/gen-python-sdk/post_gen.py | 8 ++++---- sdk/python/v1beta1/kubeflow/katib/__init__.py | 2 +- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 6 +++--- sdk/python/v1beta1/kubeflow/katib/types/__init__.py | 7 +++++++ 4 files changed, 15 insertions(+), 8 deletions(-) create mode 100644 sdk/python/v1beta1/kubeflow/katib/types/__init__.py diff --git a/hack/gen-python-sdk/post_gen.py b/hack/gen-python-sdk/post_gen.py index 1803bb20430..8c66b9b6fb5 100644 --- a/hack/gen-python-sdk/post_gen.py +++ b/hack/gen-python-sdk/post_gen.py @@ -41,10 +41,10 @@ def _rewrite_helper(input_file, output_file, rewrite_rules): if output_file == "sdk/python/v1beta1/kubeflow/katib/__init__.py": lines.append("# Import Katib API client.\n") lines.append("from kubeflow.katib.api.katib_client import 
KatibClient\n") - lines.append("# Import Katib TrainerResources class.\n") - lines.append( - "from kubeflow.katib.types.trainer_resources import TrainerResources\n" - ) + # lines.append("# Import Katib TrainerResources class.\n") + # lines.append( + # "from kubeflow.katib.types.trainer_resources import TrainerResources\n" + # ) lines.append("# Import Katib report metrics functions\n") lines.append("from kubeflow.katib.api.report_metrics import report_metrics\n") lines.append("# Import Katib helper functions.\n") diff --git a/sdk/python/v1beta1/kubeflow/katib/__init__.py b/sdk/python/v1beta1/kubeflow/katib/__init__.py index bafe7befea3..df1985c8596 100644 --- a/sdk/python/v1beta1/kubeflow/katib/__init__.py +++ b/sdk/python/v1beta1/kubeflow/katib/__init__.py @@ -72,7 +72,7 @@ # Import Katib API client. from kubeflow.katib.api.katib_client import KatibClient # Import Katib TrainerResources class. -from kubeflow.katib.types.trainer_resources import TrainerResources +# from kubeflow.katib.types.trainer_resources import TrainerResources # Import Katib report metrics functions from kubeflow.katib.api.report_metrics import report_metrics # Import Katib helper functions. 
diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 67acb6bcd71..5242d6c501d 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -23,10 +23,10 @@ import grpc import kubeflow.katib.katib_api_pb2 as katib_api_pb2 import kubeflow.katib.katib_api_pb2_grpc as katib_api_pb2_grpc -from kubeflow.katib import models +from kubeflow.katib import models, types from kubeflow.katib.api_client import ApiClient from kubeflow.katib.constants import constants -from kubeflow.katib.types.trainer_resources import TrainerResources +# from kubeflow.katib.types.trainer_resources import TrainerResources from kubeflow.katib.utils import utils from kubernetes import client, config @@ -198,7 +198,7 @@ def tune( parallel_trial_count: int = None, max_failed_trial_count: int = None, resources_per_trial: Union[ - dict, client.V1ResourceRequirements, TrainerResources, None + dict, client.V1ResourceRequirements, types.TrainerResources, None ] = None, retain_trials: bool = False, packages_to_install: List[str] = None, diff --git a/sdk/python/v1beta1/kubeflow/katib/types/__init__.py b/sdk/python/v1beta1/kubeflow/katib/types/__init__.py new file mode 100644 index 00000000000..a38761478a0 --- /dev/null +++ b/sdk/python/v1beta1/kubeflow/katib/types/__init__.py @@ -0,0 +1,7 @@ +from __future__ import absolute_import + +# Import types into type package +from kubeflow.katib.types.trainer_resources import TrainerResources + +# Import Kubernetes models. 
+from kubernetes.client import * \ No newline at end of file From 1a56c072efc865dd28792bfb5f488c303b91d894 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 29 Aug 2024 16:15:10 +0800 Subject: [PATCH 72/84] check pvc and pv status of katib deployments Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/types/__init__.py | 2 +- test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/types/__init__.py b/sdk/python/v1beta1/kubeflow/katib/types/__init__.py index a38761478a0..a99fbea74b3 100644 --- a/sdk/python/v1beta1/kubeflow/katib/types/__init__.py +++ b/sdk/python/v1beta1/kubeflow/katib/types/__init__.py @@ -4,4 +4,4 @@ from kubeflow.katib.types.trainer_resources import TrainerResources # Import Kubernetes models. -from kubernetes.client import * \ No newline at end of file +from kubernetes.client import * diff --git a/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh b/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh index d0b05caf712..52f4933e87f 100755 --- a/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh +++ b/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh @@ -64,6 +64,15 @@ fi echo "Deploying Katib" cd ../../../../../ && WITH_DATABASE_TYPE=$WITH_DATABASE_TYPE make deploy && cd - +echo "Get PVC" +kubectl get pvc +echo "Describe PVC" +kubectl describe pvc katib-mysql +echo "Get PV" +kubectl get pv +echo "Get StorageClass" +kubectl get storageclass + # Wait until all Katib pods is running. 
TIMEOUT=120s From da2b6e03b503c87c63bbfca7c0c98818c562ea5b Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 29 Aug 2024 16:24:55 +0800 Subject: [PATCH 73/84] check pvc and pv status of katib deployments Signed-off-by: helenxie-bit --- test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh b/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh index 52f4933e87f..2edb788030b 100755 --- a/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh +++ b/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh @@ -65,9 +65,7 @@ echo "Deploying Katib" cd ../../../../../ && WITH_DATABASE_TYPE=$WITH_DATABASE_TYPE make deploy && cd - echo "Get PVC" -kubectl get pvc -echo "Describe PVC" -kubectl describe pvc katib-mysql +kubectl get pvc -n kubeflow echo "Get PV" kubectl get pv echo "Get StorageClass" From 970a5921202f50ab4eea03b729ead1f05231712e Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 29 Aug 2024 17:31:25 +0800 Subject: [PATCH 74/84] recommit changes Signed-off-by: helenxie-bit --- hack/gen-python-sdk/post_gen.py | 8 ++++---- sdk/python/v1beta1/kubeflow/katib/__init__.py | 2 +- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 6 +++--- sdk/python/v1beta1/kubeflow/katib/types/__init__.py | 7 ------- test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh | 7 ------- 5 files changed, 8 insertions(+), 22 deletions(-) delete mode 100644 sdk/python/v1beta1/kubeflow/katib/types/__init__.py diff --git a/hack/gen-python-sdk/post_gen.py b/hack/gen-python-sdk/post_gen.py index 8c66b9b6fb5..1803bb20430 100644 --- a/hack/gen-python-sdk/post_gen.py +++ b/hack/gen-python-sdk/post_gen.py @@ -41,10 +41,10 @@ def _rewrite_helper(input_file, output_file, rewrite_rules): if output_file == "sdk/python/v1beta1/kubeflow/katib/__init__.py": lines.append("# Import Katib API client.\n") lines.append("from kubeflow.katib.api.katib_client import KatibClient\n") - # lines.append("# 
Import Katib TrainerResources class.\n") - # lines.append( - # "from kubeflow.katib.types.trainer_resources import TrainerResources\n" - # ) + lines.append("# Import Katib TrainerResources class.\n") + lines.append( + "from kubeflow.katib.types.trainer_resources import TrainerResources\n" + ) lines.append("# Import Katib report metrics functions\n") lines.append("from kubeflow.katib.api.report_metrics import report_metrics\n") lines.append("# Import Katib helper functions.\n") diff --git a/sdk/python/v1beta1/kubeflow/katib/__init__.py b/sdk/python/v1beta1/kubeflow/katib/__init__.py index df1985c8596..bafe7befea3 100644 --- a/sdk/python/v1beta1/kubeflow/katib/__init__.py +++ b/sdk/python/v1beta1/kubeflow/katib/__init__.py @@ -72,7 +72,7 @@ # Import Katib API client. from kubeflow.katib.api.katib_client import KatibClient # Import Katib TrainerResources class. -# from kubeflow.katib.types.trainer_resources import TrainerResources +from kubeflow.katib.types.trainer_resources import TrainerResources # Import Katib report metrics functions from kubeflow.katib.api.report_metrics import report_metrics # Import Katib helper functions. 
diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 5242d6c501d..67acb6bcd71 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -23,10 +23,10 @@ import grpc import kubeflow.katib.katib_api_pb2 as katib_api_pb2 import kubeflow.katib.katib_api_pb2_grpc as katib_api_pb2_grpc -from kubeflow.katib import models, types +from kubeflow.katib import models from kubeflow.katib.api_client import ApiClient from kubeflow.katib.constants import constants -# from kubeflow.katib.types.trainer_resources import TrainerResources +from kubeflow.katib.types.trainer_resources import TrainerResources from kubeflow.katib.utils import utils from kubernetes import client, config @@ -198,7 +198,7 @@ def tune( parallel_trial_count: int = None, max_failed_trial_count: int = None, resources_per_trial: Union[ - dict, client.V1ResourceRequirements, types.TrainerResources, None + dict, client.V1ResourceRequirements, TrainerResources, None ] = None, retain_trials: bool = False, packages_to_install: List[str] = None, diff --git a/sdk/python/v1beta1/kubeflow/katib/types/__init__.py b/sdk/python/v1beta1/kubeflow/katib/types/__init__.py deleted file mode 100644 index a99fbea74b3..00000000000 --- a/sdk/python/v1beta1/kubeflow/katib/types/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from __future__ import absolute_import - -# Import types into type package -from kubeflow.katib.types.trainer_resources import TrainerResources - -# Import Kubernetes models. 
-from kubernetes.client import * diff --git a/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh b/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh index 2edb788030b..d0b05caf712 100755 --- a/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh +++ b/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh @@ -64,13 +64,6 @@ fi echo "Deploying Katib" cd ../../../../../ && WITH_DATABASE_TYPE=$WITH_DATABASE_TYPE make deploy && cd - -echo "Get PVC" -kubectl get pvc -n kubeflow -echo "Get PV" -kubectl get pv -echo "Get StorageClass" -kubectl get storageclass - # Wait until all Katib pods is running. TIMEOUT=120s From e529ec4fe5651be45735fb840d8482699e516893 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 30 Aug 2024 07:11:15 +0800 Subject: [PATCH 75/84] update minikube version when setup Signed-off-by: helenxie-bit --- .github/workflows/template-setup-e2e-test/action.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/template-setup-e2e-test/action.yaml b/.github/workflows/template-setup-e2e-test/action.yaml index c1b988f8b57..75ee040aea2 100644 --- a/.github/workflows/template-setup-e2e-test/action.yaml +++ b/.github/workflows/template-setup-e2e-test/action.yaml @@ -37,7 +37,7 @@ runs: version: ${{ inputs.kubernetes-version }} - name: Setup Minikube Cluster - uses: medyagh/setup-minikube@v0.0.16 + uses: medyagh/setup-minikube@v0.0.18 with: network-plugin: cni cni: flannel From 17f9deac15fa7cd4f6afa277ae4413b744ea83e8 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 30 Aug 2024 21:33:52 +0800 Subject: [PATCH 76/84] delete the code that disables formatting for the tune function Signed-off-by: helenxie-bit --- .../v1beta1/kubeflow/katib/api/katib_client.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 67acb6bcd71..e73bfa2ae42 100644 --- 
a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -165,14 +165,16 @@ def create_experiment( ) ) - # fmt: off def tune( self, # TODO (andreyvelich): How to be consistent with other APIs (name) ? name: str, - model_provider_parameters: Optional["HuggingFaceModelParams"] = None, # noqa: F821 - dataset_provider_parameters: Optional[Union[ - "HuggingFaceDatasetParams", "S3DatasetParams"]] = None, # noqa: F821 + model_provider_parameters: Optional[ + "HuggingFaceModelParams" # noqa: F821 + ] = None, + dataset_provider_parameters: Optional[ + Union["HuggingFaceDatasetParams", "S3DatasetParams"] # noqa: F821 + ] = None, trainer_parameters: Optional["HuggingFaceTrainerParams"] = None, # noqa: F821 storage_config: Optional[Dict[str, Optional[Union[str, List[str]]]]] = { "size": constants.PVC_DEFAULT_SIZE, @@ -205,8 +207,6 @@ def tune( pip_index_url: str = "https://pypi.org/simple", metrics_collector_config: Dict[str, Any] = {"kind": "StdOut"}, ): - # fmt: on - """ Create HyperParameter Tuning Katib Experiment using one of the following options: From 1a2c1ad49ea1d61eec54359457f18adf11e5f6fa Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 30 Aug 2024 21:46:15 +0800 Subject: [PATCH 77/84] update according to andrey's feedback Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 21 ++++++++----------- sdk/python/v1beta1/setup.py | 2 +- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index e73bfa2ae42..540fc4c2b03 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -199,8 +199,8 @@ def tune( max_trial_count: int = None, parallel_trial_count: int = None, max_failed_trial_count: int = None, - resources_per_trial: Union[ - dict, client.V1ResourceRequirements, TrainerResources, None + 
resources_per_trial: Optional[ + Union[dict, client.V1ResourceRequirements, TrainerResources] ] = None, retain_trials: bool = False, packages_to_install: List[str] = None, @@ -351,13 +351,6 @@ class name in this argument. RuntimeError: Failed to create Katib Experiment. """ - print( - "Thank you for using `tune` API for LLM hyperparameter optimization. This feature " - "is in the alpha stage. Kubeflow community is looking for your feedback. Please " - "share your experience via #kubeflow-katib Slack channel or the Kubeflow Katib " - "GitHub." - ) - if ( ( model_provider_parameters is not None @@ -599,6 +592,13 @@ class name in this argument. + "Run: pip install -U 'kubeflow-katib[huggingface]' " ) + print( + "Thank you for using `tune` API for LLM hyperparameter optimization. This feature " + "is in the alpha stage. Kubeflow community is looking for your feedback. Please " + "share your experience via #kubeflow-katib Slack channel or the Kubeflow Katib " + "GitHub." + ) + # Create PVC for the Storage Initializer. # TODO (helenxie-bit): PVC Creation should be part of Katib Controller. try: @@ -746,9 +746,6 @@ class name in this argument. # Add metrics collector to the Katib Experiment. 
# Specify metrics format for the collector, for example: 'train_loss':0.846 experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( - collector=models.V1beta1CollectorSpec( - kind=metrics_collector_config["kind"] - ), source=models.V1beta1SourceSpec( filter=models.V1beta1FilterSpec( metrics_format=[ diff --git a/sdk/python/v1beta1/setup.py b/sdk/python/v1beta1/setup.py index 72d12899364..78ae02aa739 100644 --- a/sdk/python/v1beta1/setup.py +++ b/sdk/python/v1beta1/setup.py @@ -86,6 +86,6 @@ ], install_requires=REQUIRES, extras_require={ - "huggingface": ["kubeflow-training[huggingface]"], + "huggingface": ["kubeflow-training[huggingface]==1.8.0"], }, ) From 5494925a4fdf339a2c8535911801caf907da040b Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 30 Aug 2024 22:32:23 +0800 Subject: [PATCH 78/84] add helper function in utils Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 112 ++++--------- .../v1beta1/kubeflow/katib/utils/utils.py | 148 +++++++++++++----- 2 files changed, 138 insertions(+), 122 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 540fc4c2b03..a085dd00023 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -12,15 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -import inspect import json import logging import multiprocessing -import textwrap import time from typing import Any, Callable, Dict, List, Optional, Union -import grpc import kubeflow.katib.katib_api_pb2 as katib_api_pb2 import kubeflow.katib.katib_api_pb2_grpc as katib_api_pb2_grpc from kubeflow.katib import models @@ -30,6 +27,8 @@ from kubeflow.katib.utils import utils from kubernetes import client, config +import grpc + logger = logging.getLogger(__name__) @@ -417,72 +416,26 @@ class name in this argument. 
if max_failed_trial_count is not None: experiment.spec.max_failed_trial_count = max_failed_trial_count + # Add metrics collector to the Katib Experiment. + # Up to now, we only support parameter `kind`, of which default value + # is `StdOut`, to specify the kind of metrics collector. + experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( + collector=models.V1beta1CollectorSpec(kind=metrics_collector_config["kind"]) + ) + # If users choose to use a custom objective function. if objective is not None: - # Validate objective function. - utils.validate_objective_function(objective) - - # Extract objective function implementation. - objective_code = inspect.getsource(objective) - - # Objective function might be defined in some indented scope - # (e.g. in another function). We need to dedent the function code. - objective_code = textwrap.dedent(objective_code) - - # Iterate over input parameters. - input_params = {} + # Iterate over input parameters and do substitutions. experiment_params = [] trial_params = [] - for p_name, p_value in parameters.items(): - # If input parameter value is Katib Experiment parameter sample. - if isinstance(p_value, models.V1beta1ParameterSpec): - # Wrap value for the function input. - input_params[p_name] = f"${{trialParameters.{p_name}}}" - - # Add value to the Katib Experiment parameters. - p_value.name = p_name - experiment_params.append(p_value) - - # Add value to the Katib Experiment's Trial parameters. - trial_params.append( - models.V1beta1TrialParameterSpec(name=p_name, reference=p_name) - ) - else: - # Otherwise, add value to the function input. - input_params[p_name] = p_value - - # Wrap objective function to execute it from the file. 
For example: - # def objective(parameters): - # print(f'Parameters are {parameters}') - # objective({ - # 'lr': '${trialParameters.lr}', - # 'epochs': '${trialParameters.epochs}', - # 'is_dist': False - # }) - objective_code = f"{objective_code}\n{objective.__name__}({input_params})\n" - - # Prepare execute script template. - exec_script = textwrap.dedent( - """ - program_path=$(mktemp -d) - read -r -d '' SCRIPT << EOM\n - {objective_code} - EOM - printf "%s" "$SCRIPT" > $program_path/ephemeral_objective.py - python3 -u $program_path/ephemeral_objective.py""" + input_params = utils.parameter_substitution( + parameters, experiment_params, trial_params ) - # Add objective code to the execute script. - exec_script = exec_script.format(objective_code=objective_code) - - # Install Python packages if that is required. - if packages_to_install is not None: - exec_script = ( - utils.get_script_for_python_packages( - packages_to_install, pip_index_url - ) - + exec_script - ) + # Get the execution script from the objective function. + exec_script = utils.get_exec_script_from_objective( + objective, input_params, packages_to_install, pip_index_url + ) if isinstance(resources_per_trial, dict): if "gpu" in resources_per_trial: @@ -513,15 +466,6 @@ class name in this argument. f"Incorrect value for env_per_trial: {env_per_trial}" ) - # Add metrics collector to the Katib Experiment. - # Up to now, we only support parameter `kind`, of which default value - # is `StdOut`, to specify the kind of metrics collector. - experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( - collector=models.V1beta1CollectorSpec( - kind=metrics_collector_config["kind"] - ) - ) - # Create Trial specification. trial_spec = client.V1Job( api_version="batch/v1", @@ -644,7 +588,6 @@ class name in this argument. # Iterate over input parameters and do substitutions. 
experiment_params = [] trial_params = [] - training_args = utils.parameter_substitution( trainer_parameters.training_parameters, experiment_params, trial_params ) @@ -652,6 +595,7 @@ class name in this argument. trainer_parameters.lora_config, experiment_params, trial_params ) + # Create the init and the primary container. init_container_spec = training_utils.get_container_spec( name=STORAGE_INITIALIZER, base_image=STORAGE_INITIALIZER_IMAGE, @@ -691,6 +635,7 @@ class name in this argument. resources=resources_per_trial.resources_per_worker, ) + # Create the worker and the master pod. storage_initializer_volume = models.V1Volume( name=STORAGE_INITIALIZER, persistent_volume_claim=models.V1PersistentVolumeClaimVolumeSource( @@ -698,20 +643,18 @@ class name in this argument. ), ) - # create worker pod spec worker_pod_template_spec = training_utils.get_pod_template_spec( containers=[container_spec], volumes=[storage_initializer_volume], ) - # create master pod spec master_pod_template_spec = training_utils.get_pod_template_spec( containers=[container_spec], init_containers=[init_container_spec], volumes=[storage_initializer_volume], ) - # Create pytorchjob. + # Create PyTorchJob. pytorchjob = training_models.KubeflowOrgV1PyTorchJob( api_version="kubeflow.org/v1", kind="PyTorchJob", @@ -743,7 +686,14 @@ class name in this argument. ) ) - # Add metrics collector to the Katib Experiment. + # Create Trial template. + trial_template = models.V1beta1TrialTemplate( + primary_container_name=JOB_PARAMETERS[PYTORCHJOB_KIND]["container"], + retain=retain_trials, + trial_parameters=trial_params, + trial_spec=pytorchjob, + ) + # Specify metrics format for the collector, for example: 'train_loss':0.846 experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( source=models.V1beta1SourceSpec( @@ -755,14 +705,6 @@ class name in this argument. ), ) - # Create Trial template. 
- trial_template = models.V1beta1TrialTemplate( - primary_container_name=JOB_PARAMETERS[PYTORCHJOB_KIND]["container"], - retain=retain_trials, - trial_parameters=trial_params, - trial_spec=pytorchjob, - ) - # Add parameters to the Katib Experiment. experiment.spec.parameters = experiment_params diff --git a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py index 3e2dc4459e3..9c5fe6760ad 100644 --- a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py +++ b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py @@ -18,7 +18,7 @@ import logging import os import textwrap -from typing import Any, Callable, List, Union +from typing import Any, Callable, Dict, List, Optional, Union from kubeflow.katib import models from kubeflow.katib.constants import constants @@ -144,46 +144,120 @@ def default(self, obj): def parameter_substitution( - parameters: Union["TrainingArguments", "LoraConfig"], # noqa: F821 + parameters: Union[Dict[str, Any], "TrainingArguments", "LoraConfig"], # noqa: F821 experiment_params: List[models.V1beta1ParameterSpec], trial_params: List[models.V1beta1TrialParameterSpec], ): - from peft import LoraConfig # noqa: F401 - from transformers import TrainingArguments # noqa: F401 + if isinstance(parameters, dict): + for p_name, p_value in parameters.items(): + # If input parameter value is Katib Experiment parameter sample. + if isinstance(p_value, models.V1beta1ParameterSpec): + # Wrap value for the function input. + parameters[p_name] = f"${{trialParameters.{p_name}}}" + + # Add value to the Katib Experiment parameters. + p_value.name = p_name + experiment_params.append(p_value) + + # Add value to the Katib Experiment's Trial parameters. + trial_params.append( + models.V1beta1TrialParameterSpec(name=p_name, reference=p_name) + ) + else: + # Otherwise, add value to the function input. 
+ parameters[p_name] = p_value - if isinstance(parameters, TrainingArguments): - parameters_dict = parameters.to_dict() else: - parameters_dict = parameters.__dict__ - - for p_name, p_value in parameters_dict.items(): - if not hasattr(parameters, p_name): - logger.warning(f"Training parameter {p_name} is not supported.") - continue - - if isinstance(p_value, models.V1beta1ParameterSpec): - old_attr = getattr(parameters, p_name, None) - if old_attr is not None: - value = f"${{trialParameters.{p_name}}}" - setattr(parameters, p_name, value) - p_value.name = p_name - experiment_params.append(p_value) - trial_params.append( - models.V1beta1TrialParameterSpec(name=p_name, reference=p_name) - ) - elif p_value is not None: - old_attr = getattr(parameters, p_name, None) - if old_attr is not None: - if isinstance(p_value, dict): - # Update the existing dictionary without nesting - value = copy.deepcopy(p_value) - else: - value = type(old_attr)(p_value) - setattr(parameters, p_name, value) - - if isinstance(parameters, TrainingArguments): - parameters = json.dumps(parameters.to_dict()) - else: - parameters = json.dumps(parameters.__dict__, cls=SetEncoder) + from peft import LoraConfig # noqa: F401 + from transformers import TrainingArguments # noqa: F401 + + if isinstance(parameters, TrainingArguments): + parameters_dict = parameters.to_dict() + else: + parameters_dict = parameters.__dict__ + + for p_name, p_value in parameters_dict.items(): + if not hasattr(parameters, p_name): + logger.warning(f"Training parameter {p_name} is not supported.") + continue + + if isinstance(p_value, models.V1beta1ParameterSpec): + old_attr = getattr(parameters, p_name, None) + if old_attr is not None: + value = f"${{trialParameters.{p_name}}}" + setattr(parameters, p_name, value) + p_value.name = p_name + experiment_params.append(p_value) + trial_params.append( + models.V1beta1TrialParameterSpec(name=p_name, reference=p_name) + ) + elif p_value is not None: + old_attr = getattr(parameters, 
p_name, None) + if old_attr is not None: + if isinstance(p_value, dict): + # Update the existing dictionary without nesting + value = copy.deepcopy(p_value) + else: + value = type(old_attr)(p_value) + setattr(parameters, p_name, value) + + if isinstance(parameters, TrainingArguments): + parameters = json.dumps(parameters.to_dict()) + else: + parameters = json.dumps(parameters.__dict__, cls=SetEncoder) return parameters + + +def get_exec_script_from_objective( + objective: Callable, + input_params: Dict[str, Any] = None, + packages_to_install: Optional[List[str]] = None, + pip_index_url: str = "https://pypi.org/simple", +): + """ + Get executable script for container args from the given objective function and parameters. + """ + # Validate objective function. + validate_objective_function(objective) + + # Extract objective function implementation. + objective_code = inspect.getsource(objective) + + # Objective function might be defined in some indented scope + # (e.g. in another function). We need to dedent the function code. + objective_code = textwrap.dedent(objective_code) + + # Wrap objective function to execute it from the file. For example: + # def objective(parameters): + # print(f'Parameters are {parameters}') + # objective({ + # 'lr': '${trialParameters.lr}', + # 'epochs': '${trialParameters.epochs}', + # 'is_dist': False + # }) + objective_code = f"{objective_code}\n{objective.__name__}({input_params})\n" + + # Prepare execute script template. + exec_script = textwrap.dedent( + """ + program_path=$(mktemp -d) + read -r -d '' SCRIPT << EOM\n + {objective_code} + EOM + printf "%s" "$SCRIPT" > $program_path/ephemeral_objective.py + python3 -u $program_path/ephemeral_objective.py""" + ) + + # Add objective code to the execute script. + exec_script = exec_script.format(objective_code=objective_code) + + # Install Python packages if that is required. 
+ if packages_to_install is not None: + exec_script = ( + get_script_for_python_packages(packages_to_install, pip_index_url) + + exec_script + ) + + # Return executable script to execute objective function. + return exec_script From e1e710e806917bef853be5b892b36e1f75b17176 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 30 Aug 2024 22:33:45 +0800 Subject: [PATCH 79/84] fix format Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index a085dd00023..0aef2f11f43 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -18,6 +18,7 @@ import time from typing import Any, Callable, Dict, List, Optional, Union +import grpc import kubeflow.katib.katib_api_pb2 as katib_api_pb2 import kubeflow.katib.katib_api_pb2_grpc as katib_api_pb2_grpc from kubeflow.katib import models @@ -27,8 +28,6 @@ from kubeflow.katib.utils import utils from kubernetes import client, config -import grpc - logger = logging.getLogger(__name__) From c2df96757ce2fb39b5debf60ad7ed1873ce31ae5 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 30 Aug 2024 22:55:45 +0800 Subject: [PATCH 80/84] rerun tests Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 0aef2f11f43..f456ae27464 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -693,7 +693,7 @@ class name in this argument. trial_spec=pytorchjob, ) - # Specify metrics format for the collector, for example: 'train_loss':0.846 + # Specify metrics format for the collector. 
For example: 'train_loss':0.846 experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( source=models.V1beta1SourceSpec( filter=models.V1beta1FilterSpec( From 9f69329fa6e6a3b63c49ed2355dddde4fa760018 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 31 Aug 2024 06:51:34 +0800 Subject: [PATCH 81/84] move metrics_collector_spec back & update helper functions & add return type for helper functions Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 22 ++-- .../v1beta1/kubeflow/katib/utils/utils.py | 124 +++++++++--------- 2 files changed, 77 insertions(+), 69 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index f456ae27464..96b0dbd8aea 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -415,19 +415,21 @@ class name in this argument. if max_failed_trial_count is not None: experiment.spec.max_failed_trial_count = max_failed_trial_count - # Add metrics collector to the Katib Experiment. - # Up to now, we only support parameter `kind`, of which default value - # is `StdOut`, to specify the kind of metrics collector. - experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( - collector=models.V1beta1CollectorSpec(kind=metrics_collector_config["kind"]) - ) - # If users choose to use a custom objective function. if objective is not None: + # Add metrics collector to the Katib Experiment. + # Up to now, we only support parameter `kind`, of which default value + # is `StdOut`, to specify the kind of metrics collector. + experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( + collector=models.V1beta1CollectorSpec( + kind=metrics_collector_config["kind"] + ) + ) + # Iterate over input parameters and do substitutions. 
experiment_params = [] trial_params = [] - input_params = utils.parameter_substitution( + input_params = utils.get_trial_substitutions_from_dict( parameters, experiment_params, trial_params ) @@ -587,10 +589,10 @@ class name in this argument. # Iterate over input parameters and do substitutions. experiment_params = [] trial_params = [] - training_args = utils.parameter_substitution( + training_args = utils.get_trial_substitutions_from_trainer( trainer_parameters.training_parameters, experiment_params, trial_params ) - lora_config = utils.parameter_substitution( + lora_config = utils.get_trial_substitutions_from_trainer( trainer_parameters.lora_config, experiment_params, trial_params ) diff --git a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py index 9c5fe6760ad..28f3126bbfa 100644 --- a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py +++ b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py @@ -143,68 +143,74 @@ def default(self, obj): return json.JSONEncoder.default(self, obj) -def parameter_substitution( - parameters: Union[Dict[str, Any], "TrainingArguments", "LoraConfig"], # noqa: F821 +def get_trial_substitutions_from_dict( + parameters: Dict[str, Any], experiment_params: List[models.V1beta1ParameterSpec], trial_params: List[models.V1beta1TrialParameterSpec], -): - if isinstance(parameters, dict): - for p_name, p_value in parameters.items(): - # If input parameter value is Katib Experiment parameter sample. - if isinstance(p_value, models.V1beta1ParameterSpec): - # Wrap value for the function input. - parameters[p_name] = f"${{trialParameters.{p_name}}}" - - # Add value to the Katib Experiment parameters. - p_value.name = p_name - experiment_params.append(p_value) - - # Add value to the Katib Experiment's Trial parameters. - trial_params.append( - models.V1beta1TrialParameterSpec(name=p_name, reference=p_name) - ) - else: - # Otherwise, add value to the function input. 
- parameters[p_name] = p_value +) -> Dict[str, str]: + for p_name, p_value in parameters.items(): + # If input parameter value is Katib Experiment parameter sample. + if isinstance(p_value, models.V1beta1ParameterSpec): + # Wrap value for the function input. + parameters[p_name] = f"${{trialParameters.{p_name}}}" + + # Add value to the Katib Experiment parameters. + p_value.name = p_name + experiment_params.append(p_value) + + # Add value to the Katib Experiment's Trial parameters. + trial_params.append( + models.V1beta1TrialParameterSpec(name=p_name, reference=p_name) + ) + else: + # Otherwise, add value to the function input. + parameters[p_name] = p_value - else: - from peft import LoraConfig # noqa: F401 - from transformers import TrainingArguments # noqa: F401 + return parameters - if isinstance(parameters, TrainingArguments): - parameters_dict = parameters.to_dict() - else: - parameters_dict = parameters.__dict__ - - for p_name, p_value in parameters_dict.items(): - if not hasattr(parameters, p_name): - logger.warning(f"Training parameter {p_name} is not supported.") - continue - - if isinstance(p_value, models.V1beta1ParameterSpec): - old_attr = getattr(parameters, p_name, None) - if old_attr is not None: - value = f"${{trialParameters.{p_name}}}" - setattr(parameters, p_name, value) - p_value.name = p_name - experiment_params.append(p_value) - trial_params.append( - models.V1beta1TrialParameterSpec(name=p_name, reference=p_name) - ) - elif p_value is not None: - old_attr = getattr(parameters, p_name, None) - if old_attr is not None: - if isinstance(p_value, dict): - # Update the existing dictionary without nesting - value = copy.deepcopy(p_value) - else: - value = type(old_attr)(p_value) - setattr(parameters, p_name, value) - - if isinstance(parameters, TrainingArguments): - parameters = json.dumps(parameters.to_dict()) - else: - parameters = json.dumps(parameters.__dict__, cls=SetEncoder) + +def get_trial_substitutions_from_trainer( + parameters: 
Union["TrainingArguments", "LoraConfig"], # noqa: F821 + experiment_params: List[models.V1beta1ParameterSpec], + trial_params: List[models.V1beta1TrialParameterSpec], +) -> Dict[str, str]: + from peft import LoraConfig # noqa: F401 + from transformers import TrainingArguments # noqa: F401 + + if isinstance(parameters, TrainingArguments): + parameters_dict = parameters.to_dict() + else: + parameters_dict = parameters.__dict__ + + for p_name, p_value in parameters_dict.items(): + if not hasattr(parameters, p_name): + logger.warning(f"Training parameter {p_name} is not supported.") + continue + + if isinstance(p_value, models.V1beta1ParameterSpec): + old_attr = getattr(parameters, p_name, None) + if old_attr is not None: + value = f"${{trialParameters.{p_name}}}" + setattr(parameters, p_name, value) + p_value.name = p_name + experiment_params.append(p_value) + trial_params.append( + models.V1beta1TrialParameterSpec(name=p_name, reference=p_name) + ) + elif p_value is not None: + old_attr = getattr(parameters, p_name, None) + if old_attr is not None: + if isinstance(p_value, dict): + # Update the existing dictionary without nesting + value = copy.deepcopy(p_value) + else: + value = type(old_attr)(p_value) + setattr(parameters, p_name, value) + + if isinstance(parameters, TrainingArguments): + parameters = json.dumps(parameters.to_dict()) + else: + parameters = json.dumps(parameters.__dict__, cls=SetEncoder) return parameters @@ -214,7 +220,7 @@ def get_exec_script_from_objective( input_params: Dict[str, Any] = None, packages_to_install: Optional[List[str]] = None, pip_index_url: str = "https://pypi.org/simple", -): +) -> str: """ Get executable script for container args from the given objective function and parameters. 
""" From 237438647c1306471992deca19fba57d37a7720c Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 31 Aug 2024 07:11:07 +0800 Subject: [PATCH 82/84] rerun tests Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 96b0dbd8aea..cd88312644f 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -695,7 +695,7 @@ class name in this argument. trial_spec=pytorchjob, ) - # Specify metrics format for the collector. For example: 'train_loss':0.846 + # Specify metrics format for the collector, for example: 'train_loss':0.846 experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( source=models.V1beta1SourceSpec( filter=models.V1beta1FilterSpec( From 233b582466d1fd6e0cc75a401071309f2d383abb Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 31 Aug 2024 07:26:44 +0800 Subject: [PATCH 83/84] fix some typos Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index cd88312644f..05fd1405a3f 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -311,11 +311,11 @@ class name in this argument. GPU, pass in a V1ResourceRequirement instance instead, since it's more flexible. This parameter is optional and defaults to None. - For external models and datasets, you can specify a types.TrainerResources object, + For external models and datasets, you can specify a TrainerResources object, which includes `num_workers`, `num_procs_per_worker`, and `resources_per_worker`. 
For example: ``` - resources_per_trial = types.TrainerResources( + resources_per_trial = TrainerResources( num_workers=4, num_procs_per_worker=2, resources_per_worker={ @@ -338,7 +338,6 @@ class name in this argument. to the base image packages. These packages are installed before executing the objective function. pip_index_url: The PyPI url from which to install Python packages. - metrics_collector_config: Specify the config of metrics collector, metrics_collector_config: Specify the config of metrics collector, for example, `metrics_collector_config = {"kind": "Push"}`. Currently, we only support `StdOut` and `Push` metrics collector. @@ -544,6 +543,17 @@ class name in this argument. "GitHub." ) + # Specify metrics format for the collector, for example: 'train_loss':0.846 + experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( + source=models.V1beta1SourceSpec( + filter=models.V1beta1FilterSpec( + metrics_format=[ + r"'([\w|-]+)'\s*:\s*([+-]?\d*(\.\d+)?([Ee][+-]?\d+)?)", + ] + ) + ), + ) + # Create PVC for the Storage Initializer. # TODO (helenxie-bit): PVC Creation should be part of Katib Controller. try: @@ -695,17 +705,6 @@ class name in this argument. trial_spec=pytorchjob, ) - # Specify metrics format for the collector, for example: 'train_loss':0.846 - experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( - source=models.V1beta1SourceSpec( - filter=models.V1beta1FilterSpec( - metrics_format=[ - r"'([\w|-]+)'\s*:\s*([+-]?\d*(\.\d+)?([Ee][+-]?\d+)?)", - ] - ) - ), - ) - # Add parameters to the Katib Experiment. 
experiment.spec.parameters = experiment_params From faa0f7f1151eda30cad2b72fced647213c1ec91e Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 2 Sep 2024 21:20:03 +0800 Subject: [PATCH 84/84] simplify the definition of 'TrainerResources' Signed-off-by: helenxie-bit --- .../kubeflow/katib/types/trainer_resources.py | 21 +++---------------- 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py b/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py index 6a802467226..87bbbbf67fc 100644 --- a/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py +++ b/sdk/python/v1beta1/kubeflow/katib/types/trainer_resources.py @@ -1,25 +1,10 @@ -from kubeflow.katib.configuration import Configuration - - class TrainerResources(object): def __init__( self, num_workers=None, num_procs_per_worker=None, resources_per_worker=None, - local_vars_configuration=None, ): - if local_vars_configuration is None: - local_vars_configuration = Configuration() - self.local_vars_configuration = local_vars_configuration - - self._num_workers = None - self._num_procs_per_worker = None - self._resources_per_worker = None - - if num_workers is not None: - self.num_workers = num_workers - if num_procs_per_worker is not None: - self.num_procs_per_worker = num_procs_per_worker - if resources_per_worker is not None: - self.resources_per_worker = resources_per_worker + self.num_workers = num_workers + self.num_procs_per_worker = num_procs_per_worker + self.resources_per_worker = resources_per_worker