From 1d9be470dbf9499c6a94070572956ff844b1c66e Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Mon, 24 Jun 2024 23:01:56 +0800 Subject: [PATCH 1/7] chore: add report_metrics. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- .../katib/api/katib_report_metrics.py | 78 +++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 sdk/python/v1beta1/kubeflow/katib/api/katib_report_metrics.py diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_report_metrics.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_report_metrics.py new file mode 100644 index 00000000000..7e326d0f6ee --- /dev/null +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_report_metrics.py @@ -0,0 +1,78 @@ +# Copyright 2024 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from datetime import datetime +from typing import Any, Dict + +import grpc +import pytz +import kubeflow.katib.katib_api_pb2 as katib_api_pb2 +from kubeflow.katib.constants import constants +from kubeflow.katib.utils import utils + +def report_metrics( + metrics: Dict[str, Any], + db_manager_address: str = constants.DEFAULT_DB_MANAGER_ADDRESS, + timeout: int = constants.DEFAULT_TIMEOUT, +): + """Push Metrics Directly to Katib DB + + [!!!] Trial name should always be passed into Katib Trials as env variable `KATIB_TRIAL_NAME`. + + Args: + metrics: Dict of metrics pushed to Katib DB. + For examle, `metrics = {"loss": 0.01, "accuracy": 0.99}`. + db-manager-address: Address for the Katib DB Manager in this format: `ip-address:port`. + timeout: Optional, gRPC API Server timeout in seconds to report metrics. + + Raises: + ValueError: The Trial name is not passed to environment variables. + RuntimeError: Unable to push Trial metrics to Katib DB. + """ + + namespace = utils.get_current_k8s_namespace() + name = os.getenv("KATIB_TRIAL_NAME") + if name is None: + raise ValueError( + "The Trial name is not passed to environment variables" + ) + + db_manager_address = db_manager_address.split(":") + channel = grpc.beta.implementations.insecure_channel( + db_manager_address[0], int(db_manager_address[1]) + ) + + with katib_api_pb2.beta_create_DBManager_stub(channel) as client: + try: + timestamp = datetime.now(pytz.UTC).isoformat(timespec="nanoseconds") + client.ReportObservationLog( + request=katib_api_pb2.ReportObservationLogRequest( + trial_name=name, + observation_logs=katib_api_pb2.ObservationLog( + metric_logs=[ + katib_api_pb2.MetricLog( + time_stamp=timestamp, + metric=katib_api_pb2.Metric(name=name,value=value) + ) + for name, value in metrics.items() + ] + ) + ), + timeout=timeout, + ) + except Exception as e: + raise RuntimeError( + f"Unable to push metrics to Katib DB for Trial {namespace}/{name}. Exception: {e}" + ) \ No newline at end of file From 01d3e2d2263d85017eed259ca9408701f43c4f06 Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Mon, 1 Jul 2024 02:27:09 +0000 Subject: [PATCH 2/7] fix: modify the code according to the first review. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- sdk/python/v1beta1/kubeflow/katib/__init__.py | 2 ++ .../v1beta1/kubeflow/katib/api/katib_report_metrics.py | 10 +++++----- .../v1beta1/kubeflow/katib/constants/constants.py | 3 +++ 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/__init__.py b/sdk/python/v1beta1/kubeflow/katib/__init__.py index a949c8567e1..7bb4961c420 100644 --- a/sdk/python/v1beta1/kubeflow/katib/__init__.py +++ b/sdk/python/v1beta1/kubeflow/katib/__init__.py @@ -71,6 +71,8 @@ # Import Katib API client. from kubeflow.katib.api.katib_client import KatibClient +# Import Katib report metrics functions +from kubeflow.katib.api.katib_report_metrics import report_metrics # Import Katib helper functions. import kubeflow.katib.api.search as search # Import Katib helper constants. diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_report_metrics.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_report_metrics.py index 7e326d0f6ee..84b0358e777 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_report_metrics.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_report_metrics.py @@ -13,11 +13,10 @@ # limitations under the License. import os -from datetime import datetime +from datetime import datetime, timezone from typing import Any, Dict import grpc -import pytz import kubeflow.katib.katib_api_pb2 as katib_api_pb2 from kubeflow.katib.constants import constants from kubeflow.katib.utils import utils @@ -29,7 +28,7 @@ def report_metrics( ): """Push Metrics Directly to Katib DB - [!!!] Trial name should always be passed into Katib Trials as env variable `KATIB_TRIAL_NAME`. + Katib always pass Trial name as env variable `KATIB_TRIAL_NAME` to the training container. Args: metrics: Dict of metrics pushed to Katib DB. @@ -56,7 +55,7 @@ def report_metrics( with katib_api_pb2.beta_create_DBManager_stub(channel) as client: try: - timestamp = datetime.now(pytz.UTC).isoformat(timespec="nanoseconds") + timestamp = datetime.now(timezone.utc).strftime(constants.RFC3339_FORMAT) client.ReportObservationLog( request=katib_api_pb2.ReportObservationLogRequest( trial_name=name, @@ -75,4 +74,5 @@ def report_metrics( except Exception as e: raise RuntimeError( f"Unable to push metrics to Katib DB for Trial {namespace}/{name}. Exception: {e}" - ) \ No newline at end of file + ) + \ No newline at end of file diff --git a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py index 74aa30e4b26..5bdb9911f76 100644 --- a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py +++ b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py @@ -17,6 +17,9 @@ # How long to wait in seconds for requests to the Kubernetes or gRPC API Server. DEFAULT_TIMEOUT = 120 +# RFC3339 time format +RFC3339_FORMAT = "%Y-%m-%dT%H:%M:%SZ" + # Global CRD version KATIB_VERSION = os.environ.get("EXPERIMENT_VERSION", "v1beta1") From 3fb2e1d161a0eedea27bc1843e582dd2a4eb0fe8 Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Tue, 2 Jul 2024 02:23:26 +0000 Subject: [PATCH 3/7] chore: add validation for metrics value & rename katib_report_metrics.py to report_metrics.py. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- sdk/python/v1beta1/kubeflow/katib/__init__.py | 2 +- .../v1beta1/kubeflow/katib/api/katib_client.py | 1 + .../{katib_report_metrics.py => report_metrics.py} | 9 ++++++++- sdk/python/v1beta1/kubeflow/katib/utils/utils.py | 12 +++++++++++- 4 files changed, 21 insertions(+), 3 deletions(-) rename sdk/python/v1beta1/kubeflow/katib/api/{katib_report_metrics.py => report_metrics.py} (91%) diff --git a/sdk/python/v1beta1/kubeflow/katib/__init__.py b/sdk/python/v1beta1/kubeflow/katib/__init__.py index 7bb4961c420..8a663f3556c 100644 --- a/sdk/python/v1beta1/kubeflow/katib/__init__.py +++ b/sdk/python/v1beta1/kubeflow/katib/__init__.py @@ -72,7 +72,7 @@ # Import Katib API client. from kubeflow.katib.api.katib_client import KatibClient # Import Katib report metrics functions -from kubeflow.katib.api.katib_report_metrics import report_metrics +from sdk.python.v1beta1.kubeflow.katib.api.report_metrics import report_metrics # Import Katib helper functions. import kubeflow.katib.api.search as search # Import Katib helper constants. diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 7988dbaa898..94930d17829 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -171,6 +171,7 @@ def tune( env_per_trial: Optional[ Union[Dict[str, str], List[Union[client.V1EnvVar, client.V1EnvFromSource]]] ] = None, + algorithm_name: str = "random", algorithm_settings: Union[ dict, List[models.V1beta1AlgorithmSetting], None diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_report_metrics.py b/sdk/python/v1beta1/kubeflow/katib/api/report_metrics.py similarity index 91% rename from sdk/python/v1beta1/kubeflow/katib/api/katib_report_metrics.py rename to sdk/python/v1beta1/kubeflow/katib/api/report_metrics.py index 84b0358e777..b8d7abf2e51 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_report_metrics.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/report_metrics.py @@ -41,6 +41,7 @@ def report_metrics( RuntimeError: Unable to push Trial metrics to Katib DB. """ + # Get Trial's namespace and name namespace = utils.get_current_k8s_namespace() name = os.getenv("KATIB_TRIAL_NAME") if name is None: @@ -48,11 +49,17 @@ def report_metrics( "The Trial name is not passed to environment variables" ) + # Get channel for grpc call to db manager db_manager_address = db_manager_address.split(":") channel = grpc.beta.implementations.insecure_channel( db_manager_address[0], int(db_manager_address[1]) ) + + # Validate metrics value in dict + for value in metrics.values(): + utils.validate_metrics_value(value) + # Dial katib db manager to report metrics with katib_api_pb2.beta_create_DBManager_stub(channel) as client: try: timestamp = datetime.now(timezone.utc).strftime(constants.RFC3339_FORMAT) @@ -63,7 +70,7 @@ def report_metrics( metric_logs=[ katib_api_pb2.MetricLog( time_stamp=timestamp, - metric=katib_api_pb2.Metric(name=name,value=value) + metric=katib_api_pb2.Metric(name=name,value=str(value)) ) for name, value in metrics.items() ] diff --git a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py index 97c46772611..cae99b6cae3 100644 --- a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py +++ b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py @@ -15,7 +15,7 @@ import json import os import textwrap -from typing import Callable +from typing import Callable, Any import inspect from kubeflow.katib import models @@ -72,6 +72,16 @@ def print_experiment_status(experiment: models.V1beta1Experiment): print(f"Current Optimal Trial:\n {experiment.status.current_optimal_trial}") print(f"Experiment conditions:\n {experiment.status.conditions}") +def validate_metrics_value(value: Any): + """Validate if the metrics value can be converted to type `float`.""" + try: + float(value) + except Exception: + raise ValueError( + f"Invalid value {value} for metrics value. " + "The metrics value should have or can be converted to type `float`. " + ) + def validate_objective_function(objective: Callable): From 609ef676bc504b304924515a4125638cc6a2c1ab Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Tue, 2 Jul 2024 02:33:21 +0000 Subject: [PATCH 4/7] fix: update import path in __init__.py. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- sdk/python/v1beta1/kubeflow/katib/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/__init__.py b/sdk/python/v1beta1/kubeflow/katib/__init__.py index 8a663f3556c..7aef4c9897d 100644 --- a/sdk/python/v1beta1/kubeflow/katib/__init__.py +++ b/sdk/python/v1beta1/kubeflow/katib/__init__.py @@ -72,7 +72,7 @@ # Import Katib API client. from kubeflow.katib.api.katib_client import KatibClient # Import Katib report metrics functions -from sdk.python.v1beta1.kubeflow.katib.api.report_metrics import report_metrics +from kubeflow.katib.api.report_metrics import report_metrics # Import Katib helper functions. import kubeflow.katib.api.search as search # Import Katib helper constants. From af69f36ab0b947e845c39b00ee13edae1be9dff1 Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Tue, 2 Jul 2024 02:51:22 +0000 Subject: [PATCH 5/7] fix: delete blank line. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 94930d17829..7988dbaa898 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -171,7 +171,6 @@ def tune( env_per_trial: Optional[ Union[Dict[str, str], List[Union[client.V1EnvVar, client.V1EnvFromSource]]] ] = None, - algorithm_name: str = "random", algorithm_settings: Union[ dict, List[models.V1beta1AlgorithmSetting], None From ec8431f290a7f96db256bac94574c801985a41e4 Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Tue, 2 Jul 2024 14:20:14 +0000 Subject: [PATCH 6/7] fix: update RuntimeError doc string & correct spelling error & add new line. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- sdk/python/v1beta1/kubeflow/katib/api/report_metrics.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/report_metrics.py b/sdk/python/v1beta1/kubeflow/katib/api/report_metrics.py index b8d7abf2e51..0375cf95b29 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/report_metrics.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/report_metrics.py @@ -28,7 +28,7 @@ def report_metrics( ): """Push Metrics Directly to Katib DB - Katib always pass Trial name as env variable `KATIB_TRIAL_NAME` to the training container. + Katib always passes Trial name as env variable `KATIB_TRIAL_NAME` to the training container. Args: metrics: Dict of metrics pushed to Katib DB. @@ -38,7 +38,8 @@ def report_metrics( Raises: ValueError: The Trial name is not passed to environment variables. - RuntimeError: Unable to push Trial metrics to Katib DB. + RuntimeError: Unable to push Trial metrics to Katib DB or + metrics value has incorrect format (cannot be converted to type `float`). """ # Get Trial's namespace and name @@ -82,4 +83,5 @@ def report_metrics( raise RuntimeError( f"Unable to push metrics to Katib DB for Trial {namespace}/{name}. Exception: {e}" ) + \ No newline at end of file From fa1866beead0df69fdea35ec1053ab25179c886d Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Tue, 2 Jul 2024 15:35:27 +0000 Subject: [PATCH 7/7] fix: delete blank in the last line. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- sdk/python/v1beta1/kubeflow/katib/api/report_metrics.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/report_metrics.py b/sdk/python/v1beta1/kubeflow/katib/api/report_metrics.py index 0375cf95b29..c506b85c5ff 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/report_metrics.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/report_metrics.py @@ -83,5 +83,3 @@ def report_metrics( raise RuntimeError( f"Unable to push metrics to Katib DB for Trial {namespace}/{name}. Exception: {e}" ) - - \ No newline at end of file