From 829092a3793873f96dc82c464639884b092dfb7d Mon Sep 17 00:00:00 2001 From: Gary Miguel Date: Wed, 18 Dec 2024 14:55:04 -0800 Subject: [PATCH 1/5] Support old-style TensorFlow events (tensorboard) Fixes: https://github.com/kubeflow/katib/issues/2466 Signed-off-by: Gary Miguel --- .../tfevent_loader.py | 46 +++++++++++++------ 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/pkg/metricscollector/v1beta1/tfevent-metricscollector/tfevent_loader.py b/pkg/metricscollector/v1beta1/tfevent-metricscollector/tfevent_loader.py index f41597f9237..0422af2d2bb 100644 --- a/pkg/metricscollector/v1beta1/tfevent-metricscollector/tfevent_loader.py +++ b/pkg/metricscollector/v1beta1/tfevent-metricscollector/tfevent_loader.py @@ -30,11 +30,23 @@ import rfc3339 import tensorflow as tf from tensorboard.backend.event_processing.event_accumulator import EventAccumulator -from tensorboard.backend.event_processing.tag_types import TENSORS +from tensorboard.backend.event_processing.tag_types import SCALARS, TENSORS from pkg.metricscollector.v1beta1.common import const +def _should_consider(tag: str, metric_name: str, tfefile: str) -> bool: + tfefile_parent_dir = ( + os.path.dirname(metric_name) + if len(metric_name.split("/")) >= 2 + else os.path.dirname(tfefile) + ) + basedir_name = os.path.dirname(tfefile) + return tag.startswith(metric_name.split("/")[-1]) and basedir_name.endswith( + tfefile_parent_dir + ) + + class TFEventFileParser: def __init__(self, metric_names): self.metric_names = metric_names @@ -47,21 +59,15 @@ def find_all_files(directory): def parse_summary(self, tfefile): metric_logs = [] - event_accumulator = EventAccumulator(tfefile, size_guidance={TENSORS: 0}) + event_accumulator = EventAccumulator( + tfefile, size_guidance={SCALARS: 0, TENSORS: 0} + ) event_accumulator.Reload() - for tag in event_accumulator.Tags()[TENSORS]: + tags = event_accumulator.Tags() + for tag in tags[TENSORS]: for m in self.metric_names: - tfefile_parent_dir = ( - os.path.dirname(m) - if len(m.split("/")) >= 2 - else os.path.dirname(tfefile) - ) - basedir_name = os.path.dirname(tfefile) - if not tag.startswith(m.split("/")[-1]) or not basedir_name.endswith( - tfefile_parent_dir - ): + if not _should_consider(tag, m, tfefile): continue - for tensor in event_accumulator.Tensors(tag): ml = api_pb2.MetricLog( time_stamp=rfc3339.rfc3339( @@ -72,7 +78,19 @@ def parse_summary(self, tfefile): ), ) metric_logs.append(ml) - + # support old-style tensorboard metrics too + for tag in tags[SCALARS]: + for m in self.metric_names: + if not _should_consider(tag, m, tfefile): + continue + for scalar in event_accumulator.Scalars(tag): + ml = api_pb2.MetricLog( + time_stamp=rfc3339.rfc3339( + datetime.fromtimestamp(scalar.wall_time) + ), + metric=api_pb2.Metric(name=m, value=str(scalar.value)), + ) + metric_logs.append(ml) return metric_logs From afb9f9e110d243a6407b4839249e8b334ae88ecd Mon Sep 17 00:00:00 2001 From: Gary Miguel Date: Wed, 5 Feb 2025 20:57:46 -0800 Subject: [PATCH 2/5] format Signed-off-by: Gary Miguel --- .../v1beta1/tfevent-metricscollector/tfevent_loader.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/metricscollector/v1beta1/tfevent-metricscollector/tfevent_loader.py b/pkg/metricscollector/v1beta1/tfevent-metricscollector/tfevent_loader.py index 0422af2d2bb..8f102550c44 100644 --- a/pkg/metricscollector/v1beta1/tfevent-metricscollector/tfevent_loader.py +++ b/pkg/metricscollector/v1beta1/tfevent-metricscollector/tfevent_loader.py @@ -42,9 +42,8 @@ def _should_consider(tag: str, metric_name: str, tfefile: str) -> bool: else os.path.dirname(tfefile) ) basedir_name = os.path.dirname(tfefile) - return tag.startswith(metric_name.split("/")[-1]) and basedir_name.endswith( - tfefile_parent_dir - ) + return (tag.startswith(metric_name.split("/")[-1]) and + basedir_name.endswith(tfefile_parent_dir)) class TFEventFileParser: @@ -91,6 +90,7 @@ def parse_summary(self, tfefile): metric=api_pb2.Metric(name=m, value=str(scalar.value)), ) metric_logs.append(ml) + return metric_logs From 9f0e32fefd8168db33a989648d7cc7cd124b31a7 Mon Sep 17 00:00:00 2001 From: Gary Miguel Date: Thu, 6 Feb 2025 09:44:41 -0800 Subject: [PATCH 3/5] test Signed-off-by: Gary Miguel --- .../test_tfevent_metricscollector.py | 54 +++++++++++++++---- test/unit/v1beta1/requirements.txt | 1 + 2 files changed, 45 insertions(+), 10 deletions(-) diff --git a/test/unit/v1beta1/metricscollector/test_tfevent_metricscollector.py b/test/unit/v1beta1/metricscollector/test_tfevent_metricscollector.py index 305954e9081..e385ddaaa97 100644 --- a/test/unit/v1beta1/metricscollector/test_tfevent_metricscollector.py +++ b/test/unit/v1beta1/metricscollector/test_tfevent_metricscollector.py @@ -13,35 +13,69 @@ # limitations under the License. import os +import tempfile import unittest +import tensorboardX + import utils +METRIC_DIR_NAMES = ("train", "test") +METRIC_NAMES = ("accuracy", "loss") +QUALIFIED_METRIC_NAMES = tuple( + f"{dir}/{metric}" + for dir in METRIC_DIR_NAMES + for metric in METRIC_NAMES +) + class TestTFEventMetricsCollector(unittest.TestCase): def test_parse_file(self): current_dir = os.path.dirname(os.path.abspath(__file__)) logs_dir = os.path.join(current_dir, "testdata/tfevent-metricscollector/logs") - # Metric format is "{{dirname}}/{{metrics name}}" - metric_names = ["train/accuracy", "train/loss", "test/loss", "test/accuracy"] - metric_logs = utils.get_metric_logs(logs_dir, metric_names) + + metric_logs = utils.get_metric_logs(logs_dir, QUALIFIED_METRIC_NAMES) self.assertEqual(20, len(metric_logs)) for log in metric_logs: actual = log["metric"]["name"] - self.assertIn(actual, metric_names) + self.assertIn(actual, QUALIFIED_METRIC_NAMES) + + train_metric_logs = utils.get_metric_logs( + os.path.join(logs_dir, "train"), METRIC_NAMES) + self.assertEqual(10, len(train_metric_logs)) + + for log in train_metric_logs: + actual = log["metric"]["name"] + self.assertIn(actual, METRIC_NAMES) - # Metric format is "{{metrics name}}" - metric_names = ["accuracy", "loss"] - metrics_file_dir = os.path.join(logs_dir, "train") - metric_logs = utils.get_metric_logs(metrics_file_dir, metric_names) - self.assertEqual(10, len(metric_logs)) + def test_parse_file_with_tensorboardX(self): + logs_dir = tempfile.mkdtemp() + num_iters = 3 + + for dir_name in METRIC_DIR_NAMES: + with tensorboardX.SummaryWriter(os.path.join(logs_dir, dir_name)) as writer: + for metric_name in METRIC_NAMES: + for iter in range(num_iters): + writer.add_scalar(metric_name, 0.1, iter) + + + metric_logs = utils.get_metric_logs(logs_dir, QUALIFIED_METRIC_NAMES) + self.assertEqual(num_iters * len(QUALIFIED_METRIC_NAMES), len(metric_logs)) for log in metric_logs: actual = log["metric"]["name"] - self.assertIn(actual, metric_names) + self.assertIn(actual, QUALIFIED_METRIC_NAMES) + + train_metric_logs = utils.get_metric_logs( + os.path.join(logs_dir, "train"), METRIC_NAMES) + self.assertEqual(num_iters * len(METRIC_NAMES), len(train_metric_logs)) + + for log in train_metric_logs: + actual = log["metric"]["name"] + self.assertIn(actual, METRIC_NAMES) if __name__ == '__main__': diff --git a/test/unit/v1beta1/requirements.txt b/test/unit/v1beta1/requirements.txt index e6cc18aa541..7c1a6614f56 100644 --- a/test/unit/v1beta1/requirements.txt +++ b/test/unit/v1beta1/requirements.txt @@ -1,3 +1,4 @@ grpcio-testing==1.64.1 pytest==7.2.0 +tensorboardX==2.6.2.2 kubeflow-training[huggingface]==1.8.1 From 67774654addc66b595d2c338b684f7bad373b358 Mon Sep 17 00:00:00 2001 From: Gary Miguel Date: Thu, 6 Feb 2025 09:48:02 -0800 Subject: [PATCH 4/5] don't continue loops Signed-off-by: Gary Miguel --- .../tfevent_loader.py | 42 +++++++++---------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/pkg/metricscollector/v1beta1/tfevent-metricscollector/tfevent_loader.py b/pkg/metricscollector/v1beta1/tfevent-metricscollector/tfevent_loader.py index 8f102550c44..3f35d83e895 100644 --- a/pkg/metricscollector/v1beta1/tfevent-metricscollector/tfevent_loader.py +++ b/pkg/metricscollector/v1beta1/tfevent-metricscollector/tfevent_loader.py @@ -65,31 +65,29 @@ def parse_summary(self, tfefile): tags = event_accumulator.Tags() for tag in tags[TENSORS]: for m in self.metric_names: - if not _should_consider(tag, m, tfefile): - continue - for tensor in event_accumulator.Tensors(tag): - ml = api_pb2.MetricLog( - time_stamp=rfc3339.rfc3339( - datetime.fromtimestamp(tensor.wall_time) - ), - metric=api_pb2.Metric( - name=m, value=str(tf.make_ndarray(tensor.tensor_proto)) - ), - ) - metric_logs.append(ml) + if _should_consider(tag, m, tfefile): + for tensor in event_accumulator.Tensors(tag): + ml = api_pb2.MetricLog( + time_stamp=rfc3339.rfc3339( + datetime.fromtimestamp(tensor.wall_time) + ), + metric=api_pb2.Metric( + name=m, value=str(tf.make_ndarray(tensor.tensor_proto)) + ), + ) + metric_logs.append(ml) # support old-style tensorboard metrics too for tag in tags[SCALARS]: for m in self.metric_names: - if not _should_consider(tag, m, tfefile): - continue - for scalar in event_accumulator.Scalars(tag): - ml = api_pb2.MetricLog( - time_stamp=rfc3339.rfc3339( - datetime.fromtimestamp(scalar.wall_time) - ), - metric=api_pb2.Metric(name=m, value=str(scalar.value)), - ) - metric_logs.append(ml) + if _should_consider(tag, m, tfefile): + for scalar in event_accumulator.Scalars(tag): + ml = api_pb2.MetricLog( + time_stamp=rfc3339.rfc3339( + datetime.fromtimestamp(scalar.wall_time) + ), + metric=api_pb2.Metric(name=m, value=str(scalar.value)), + ) + metric_logs.append(ml) return metric_logs From 7b4ae33713bb862a51dd71a990838b44443bf0eb Mon Sep 17 00:00:00 2001 From: Gary Miguel Date: Tue, 11 Feb 2025 16:59:35 -0800 Subject: [PATCH 5/5] format Signed-off-by: Gary Miguel --- .../v1beta1/tfevent-metricscollector/tfevent_loader.py | 5 +++-- .../metricscollector/test_tfevent_metricscollector.py | 2 -- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/pkg/metricscollector/v1beta1/tfevent-metricscollector/tfevent_loader.py b/pkg/metricscollector/v1beta1/tfevent-metricscollector/tfevent_loader.py index 3f35d83e895..d50bbe19b7d 100644 --- a/pkg/metricscollector/v1beta1/tfevent-metricscollector/tfevent_loader.py +++ b/pkg/metricscollector/v1beta1/tfevent-metricscollector/tfevent_loader.py @@ -42,8 +42,9 @@ def _should_consider(tag: str, metric_name: str, tfefile: str) -> bool: else os.path.dirname(tfefile) ) basedir_name = os.path.dirname(tfefile) - return (tag.startswith(metric_name.split("/")[-1]) and - basedir_name.endswith(tfefile_parent_dir)) + return tag.startswith(metric_name.split("/")[-1]) and basedir_name.endswith( + tfefile_parent_dir + ) class TFEventFileParser: diff --git a/test/unit/v1beta1/metricscollector/test_tfevent_metricscollector.py b/test/unit/v1beta1/metricscollector/test_tfevent_metricscollector.py index e385ddaaa97..8fd8ae0d1cc 100644 --- a/test/unit/v1beta1/metricscollector/test_tfevent_metricscollector.py +++ b/test/unit/v1beta1/metricscollector/test_tfevent_metricscollector.py @@ -17,10 +17,8 @@ import unittest import tensorboardX - import utils - METRIC_DIR_NAMES = ("train", "test") METRIC_NAMES = ("accuracy", "loss") QUALIFIED_METRIC_NAMES = tuple(