From 535c1e49171eea712d04f28769c7ebf697e675f9 Mon Sep 17 00:00:00 2001 From: "Kruglov, Oleg" Date: Mon, 23 Sep 2024 10:57:42 -0700 Subject: [PATCH 01/13] Add incremental algorithms support --- configs/incremental.json | 99 ++++++++++++++++++++++++ sklbench/benchmarks/sklearn_estimator.py | 36 +++++++-- sklbench/report/implementation.py | 10 ++- test-configuration-linux.yml | 5 ++ test-configuration-win.yml | 4 + 5 files changed, 144 insertions(+), 10 deletions(-) create mode 100644 configs/incremental.json diff --git a/configs/incremental.json b/configs/incremental.json new file mode 100644 index 00000000..5f7a5477 --- /dev/null +++ b/configs/incremental.json @@ -0,0 +1,99 @@ +{ + "PARAMETERS_SETS": { + "common": {"bench": {"n_runs": 10, "time_limit": 60}}, + "covariance data": { + "data": [ + { + "source": "make_blobs", + "generation_kwargs": { + "centers": 1, + "n_samples": 1000, + "n_features": [16, 64] + }, + "split_kwargs": {"ignore": true} + } + ] + }, + "basic_statistics data": { + "data": { + "source": "make_blobs", + "generation_kwargs": { + "centers": 1, + "n_samples": 10000, + "n_features": [16, 64] + }, + "split_kwargs": {"ignore": true} + } + }, + "linear_regression data": { + "data": { + "source": "make_regression", + "split_kwargs": {"train_size": 0.2, "test_size": 0.8}, + "generation_kwargs": { + "n_samples": 5000, + "n_features": [40, 100], + "n_informative": 5, + "noise": 2.0 + } + } + }, + "pca data": { + "data": { + "source": "make_blobs", + "generation_kwargs": { + "centers": 1, + "n_samples": 1000, + "n_features": [16, 64] + }, + "split_kwargs": {"ignore": true} + } + }, + "covariance": { + "algorithm": [ + { + "estimator": "IncrementalEmpiricalCovariance", + "library": "sklearnex.covariance", + "estimator_methods": {"training": "partial_fit"}, + "num_batches": {"training": 2} + } + ] + }, + "basic_statistics": { + "algorithm": [ + { + "estimator": "IncrementalBasicStatistics", + "library": "sklearnex.basic_statistics", + "num_batches": 
{"training": 2} + } + ] + }, + "linear_regression": { + "algorithm": [ + { + "estimator": "IncrementalLinearRegression", + "library": "sklearnex.linear_model", + "num_batches": {"training": 2} + } + ] + }, + "pca": { + "algorithm": [ + { + "estimator": "IncrementalPCA", + "library": "sklearnex.preview.decomposition", + "num_batches": {"training": 2} + } + ] + } + }, + "TEMPLATES": { + "covariance": {"SETS": ["common", "covariance", "covariance data"]}, + "basic_statistics": { + "SETS": ["common", "basic_statistics", "basic_statistics data"] + }, + "linear_regression": { + "SETS": ["common", "linear_regression", "linear_regression data"] + }, + "pca": {"SETS": ["common", "pca", "pca data"]} + } +} diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index f9c0a75e..4cdde86d 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -74,7 +74,7 @@ def get_estimator(library_name: str, estimator_name: str): def get_estimator_methods(bench_case: BenchCase) -> Dict[str, List[str]]: # default estimator methods estimator_methods = { - "training": ["fit"], + "training": ["partial_fit", "fit"], "inference": ["predict", "predict_proba", "transform"], } for stage in estimator_methods.keys(): @@ -334,7 +334,9 @@ def verify_patching(stream: io.StringIO, function_name) -> bool: return acceleration_lines > 0 and fallback_lines == 0 -def create_online_function(method_instance, data_args, batch_size): +def create_online_function( + estimator_instance, method_instance, data_args, num_batches, batch_size +): n_batches = data_args[0].shape[0] // batch_size if "y" in list(inspect.signature(method_instance).parameters): @@ -345,6 +347,7 @@ def ndarray_function(x, y): x[i * batch_size : (i + 1) * batch_size], y[i * batch_size : (i + 1) * batch_size], ) + estimator_instance._onedal_finalize_fit() def dataframe_function(x, y): for i in range(n_batches): @@ -352,16 +355,19 @@ def dataframe_function(x, 
y): x.iloc[i * batch_size : (i + 1) * batch_size], y.iloc[i * batch_size : (i + 1) * batch_size], ) + estimator_instance._onedal_finalize_fit() else: def ndarray_function(x): for i in range(n_batches): method_instance(x[i * batch_size : (i + 1) * batch_size]) + estimator_instance._onedal_finalize_fit() def dataframe_function(x): for i in range(n_batches): method_instance(x.iloc[i * batch_size : (i + 1) * batch_size]) + estimator_instance._onedal_finalize_fit() if "ndarray" in str(type(data_args[0])): return ndarray_function @@ -414,12 +420,28 @@ def measure_sklearn_estimator( data_args = (x_train,) else: data_args = (x_test,) - batch_size = get_bench_case_value( - bench_case, f"algorithm:batch_size:{stage}" - ) - if batch_size is not None: + + if method == "partial_fit": + num_batches = get_bench_case_value(bench_case, "data:num_batches") + batch_size = get_bench_case_value(bench_case, "data:batch_size") + + if batch_size is None: + if num_batches is None: + num_batches = 5 + batch_size = ( + data_args[0].shape[0] + num_batches - 1 + ) // num_batches + if num_batches is None: + num_batches = ( + data_args[0].shape[0] + batch_size - 1 + ) // batch_size + method_instance = create_online_function( - method_instance, data_args, batch_size + estimator_instance, + method_instance, + data_args, + num_batches, + batch_size, ) # daal4py model builders enabling branch if enable_modelbuilders and stage == "inference": diff --git a/sklbench/report/implementation.py b/sklbench/report/implementation.py index b577ab55..df15b5eb 100644 --- a/sklbench/report/implementation.py +++ b/sklbench/report/implementation.py @@ -16,7 +16,7 @@ import argparse import json -from typing import Dict, List +from typing import Dict, Hashable, List import openpyxl as xl import pandas as pd @@ -239,6 +239,7 @@ def get_result_tables_as_df( bench_cases = pd.DataFrame( [flatten_dict(bench_case) for bench_case in results["bench_cases"]] ) + bench_cases = bench_cases.map(lambda x: str(x) if not 
isinstance(x, Hashable) else x) if compatibility_mode: bench_cases = transform_results_to_compatible(bench_cases) @@ -248,7 +249,7 @@ def get_result_tables_as_df( bench_cases.drop(columns=[column], inplace=True) diffby_columns.remove(column) - return split_df_by_columns(bench_cases, splitby_columns) + return split_df_by_columns(bench_cases, splitby_columns, False) def get_summary_from_df(df: pd.DataFrame, df_name: str) -> pd.DataFrame: @@ -258,7 +259,10 @@ def get_summary_from_df(df: pd.DataFrame, df_name: str) -> pd.DataFrame: # only relative improvements are included in summary currently if len(column) > 1 and column[1] == f"{metric_name} relative improvement": metric_columns.append(column) - summary = df[metric_columns].aggregate(geomean_wrapper, axis=0).to_frame().T + if metric_columns: + summary = df[metric_columns].aggregate(geomean_wrapper, axis=0).to_frame().T + else: + summary = pd.DataFrame() summary.index = pd.Index([df_name]) return summary diff --git a/test-configuration-linux.yml b/test-configuration-linux.yml index a37769ce..722d1008 100644 --- a/test-configuration-linux.yml +++ b/test-configuration-linux.yml @@ -45,6 +45,11 @@ steps: conda activate bench-env python -m sklbench --report -l DEBUG --report -c configs/sklearn_example.json displayName: Sklearn example run + - script: | + source /usr/share/miniconda/etc/profile.d/conda.sh + conda activate bench-env + python -m sklbench --report -l DEBUG --report -c configs/incremental.json + displayName: Incremental algorithms example run - script: | source /usr/share/miniconda/etc/profile.d/conda.sh conda activate bench-env diff --git a/test-configuration-win.yml b/test-configuration-win.yml index a1eddaeb..82c3152a 100644 --- a/test-configuration-win.yml +++ b/test-configuration-win.yml @@ -43,6 +43,10 @@ steps: call activate bench-env python -m sklbench --report -l DEBUG --report -c configs/sklearn_example.json displayName: Sklearn example run + - script: | + call activate bench-env + python -m 
sklbench --report -l DEBUG --report -c configs/incremental.json + displayName: Incremental algorithms example run - script: | call activate bench-env python -m sklbench --report -l DEBUG --report -c configs/xgboost_example.json From d6952ac74715dcb0910626f9e5dce1c2eb1a3827 Mon Sep 17 00:00:00 2001 From: "Kruglov, Oleg" Date: Mon, 23 Sep 2024 11:49:37 -0700 Subject: [PATCH 02/13] Fix win yml --- test-configuration-win.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test-configuration-win.yml b/test-configuration-win.yml index 82c3152a..f3ac1595 100644 --- a/test-configuration-win.yml +++ b/test-configuration-win.yml @@ -43,7 +43,7 @@ steps: call activate bench-env python -m sklbench --report -l DEBUG --report -c configs/sklearn_example.json displayName: Sklearn example run - - script: | + - script: | call activate bench-env python -m sklbench --report -l DEBUG --report -c configs/incremental.json displayName: Incremental algorithms example run From 03a152a13c62eef3fa66b61109b76874d4e9b2b1 Mon Sep 17 00:00:00 2001 From: "Kruglov, Oleg" Date: Tue, 24 Sep 2024 02:46:36 -0700 Subject: [PATCH 03/13] Remove samples/ms info --- sklbench/benchmarks/sklearn_estimator.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index 4cdde86d..7e616273 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -458,10 +458,6 @@ def measure_sklearn_estimator( metrics[method]["time std[ms]"], _, ) = measure_case(bench_case, method_instance, *data_args) - if batch_size is not None: - metrics[method]["throughput[samples/ms]"] = ( - (data_args[0].shape[0] // batch_size) * batch_size - ) / metrics[method]["time[ms]"] if ensure_sklearnex_patching: full_method_name = f"{estimator_class.__name__}.{method}" sklearnex_logging_stream.seek(0) From 3ac5c236eb6255892e607a6122d4d2187e4c5451 Mon Sep 17 00:00:00 2001 From: "Kruglov, Oleg" Date: Tue, 
24 Sep 2024 06:45:42 -0700 Subject: [PATCH 04/13] Remove BS from config (need to add after pip version update) --- configs/incremental.json | 3 --- 1 file changed, 3 deletions(-) diff --git a/configs/incremental.json b/configs/incremental.json index 5f7a5477..c9ffb19c 100644 --- a/configs/incremental.json +++ b/configs/incremental.json @@ -88,9 +88,6 @@ }, "TEMPLATES": { "covariance": {"SETS": ["common", "covariance", "covariance data"]}, - "basic_statistics": { - "SETS": ["common", "basic_statistics", "basic_statistics data"] - }, "linear_regression": { "SETS": ["common", "linear_regression", "linear_regression data"] }, From 9461fad69a00ecbf69a3e5fcef662fb1bafd4253 Mon Sep 17 00:00:00 2001 From: "Kruglov, Oleg" Date: Wed, 25 Sep 2024 02:00:29 -0700 Subject: [PATCH 05/13] Add condition for finalize --- sklbench/benchmarks/sklearn_estimator.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index 7e616273..52f5bf4e 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -347,7 +347,8 @@ def ndarray_function(x, y): x[i * batch_size : (i + 1) * batch_size], y[i * batch_size : (i + 1) * batch_size], ) - estimator_instance._onedal_finalize_fit() + if hasattr(estimator_instance, "_onedal_finalize_fit"): + estimator_instance._onedal_finalize_fit() def dataframe_function(x, y): for i in range(n_batches): @@ -355,19 +356,22 @@ def dataframe_function(x, y): x.iloc[i * batch_size : (i + 1) * batch_size], y.iloc[i * batch_size : (i + 1) * batch_size], ) - estimator_instance._onedal_finalize_fit() + if hasattr(estimator_instance, "_onedal_finalize_fit"): + estimator_instance._onedal_finalize_fit() else: def ndarray_function(x): for i in range(n_batches): method_instance(x[i * batch_size : (i + 1) * batch_size]) - estimator_instance._onedal_finalize_fit() + if hasattr(estimator_instance, "_onedal_finalize_fit"): + 
estimator_instance._onedal_finalize_fit() def dataframe_function(x): for i in range(n_batches): method_instance(x.iloc[i * batch_size : (i + 1) * batch_size]) - estimator_instance._onedal_finalize_fit() + if hasattr(estimator_instance, "_onedal_finalize_fit"): + estimator_instance._onedal_finalize_fit() if "ndarray" in str(type(data_args[0])): return ndarray_function From b82d772f26c1af7d261b78bf94ae97280c23c9e2 Mon Sep 17 00:00:00 2001 From: "Kruglov, Oleg" Date: Wed, 25 Sep 2024 09:51:39 -0700 Subject: [PATCH 06/13] Fix num_batches usage --- sklbench/benchmarks/sklearn_estimator.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index 52f5bf4e..3f8b1641 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -337,12 +337,11 @@ def verify_patching(stream: io.StringIO, function_name) -> bool: def create_online_function( estimator_instance, method_instance, data_args, num_batches, batch_size ): - n_batches = data_args[0].shape[0] // batch_size if "y" in list(inspect.signature(method_instance).parameters): def ndarray_function(x, y): - for i in range(n_batches): + for i in range(num_batches): method_instance( x[i * batch_size : (i + 1) * batch_size], y[i * batch_size : (i + 1) * batch_size], @@ -351,7 +350,7 @@ def ndarray_function(x, y): estimator_instance._onedal_finalize_fit() def dataframe_function(x, y): - for i in range(n_batches): + for i in range(num_batches): method_instance( x.iloc[i * batch_size : (i + 1) * batch_size], y.iloc[i * batch_size : (i + 1) * batch_size], @@ -362,13 +361,13 @@ def dataframe_function(x, y): else: def ndarray_function(x): - for i in range(n_batches): + for i in range(num_batches): method_instance(x[i * batch_size : (i + 1) * batch_size]) if hasattr(estimator_instance, "_onedal_finalize_fit"): estimator_instance._onedal_finalize_fit() def dataframe_function(x): - for i in 
range(n_batches): + for i in range(num_batches): method_instance(x.iloc[i * batch_size : (i + 1) * batch_size]) if hasattr(estimator_instance, "_onedal_finalize_fit"): estimator_instance._onedal_finalize_fit() From b5ad233d539803da41ae41f98e7997f68394ec35 Mon Sep 17 00:00:00 2001 From: "Kruglov, Oleg" Date: Fri, 27 Sep 2024 02:29:40 -0700 Subject: [PATCH 07/13] Reduce config --- configs/incremental.json | 40 +++++++++------------------------------- 1 file changed, 9 insertions(+), 31 deletions(-) diff --git a/configs/incremental.json b/configs/incremental.json index c9ffb19c..f09927ee 100644 --- a/configs/incremental.json +++ b/configs/incremental.json @@ -1,7 +1,7 @@ { "PARAMETERS_SETS": { "common": {"bench": {"n_runs": 10, "time_limit": 60}}, - "covariance data": { + "unlabeled dataset": { "data": [ { "source": "make_blobs", @@ -14,18 +14,7 @@ } ] }, - "basic_statistics data": { - "data": { - "source": "make_blobs", - "generation_kwargs": { - "centers": 1, - "n_samples": 10000, - "n_features": [16, 64] - }, - "split_kwargs": {"ignore": true} - } - }, - "linear_regression data": { + "labeled dataset": { "data": { "source": "make_regression", "split_kwargs": {"train_size": 0.2, "test_size": 0.8}, @@ -37,22 +26,11 @@ } } }, - "pca data": { - "data": { - "source": "make_blobs", - "generation_kwargs": { - "centers": 1, - "n_samples": 1000, - "n_features": [16, 64] - }, - "split_kwargs": {"ignore": true} - } - }, "covariance": { "algorithm": [ { "estimator": "IncrementalEmpiricalCovariance", - "library": "sklearnex.covariance", + "library": "sklearnex", "estimator_methods": {"training": "partial_fit"}, "num_batches": {"training": 2} } @@ -62,7 +40,7 @@ "algorithm": [ { "estimator": "IncrementalBasicStatistics", - "library": "sklearnex.basic_statistics", + "library": "sklearnex", "num_batches": {"training": 2} } ] @@ -71,7 +49,7 @@ "algorithm": [ { "estimator": "IncrementalLinearRegression", - "library": "sklearnex.linear_model", + "library": "sklearnex", "num_batches": 
{"training": 2} } ] @@ -80,17 +58,17 @@ "algorithm": [ { "estimator": "IncrementalPCA", - "library": "sklearnex.preview.decomposition", + "library": "sklearnex.preview", "num_batches": {"training": 2} } ] } }, "TEMPLATES": { - "covariance": {"SETS": ["common", "covariance", "covariance data"]}, + "covariance": {"SETS": ["common", "covariance", "unlabeled dataset"]}, "linear_regression": { - "SETS": ["common", "linear_regression", "linear_regression data"] + "SETS": ["common", "linear_regression", "labeled dataset"] }, - "pca": {"SETS": ["common", "pca", "pca data"]} + "pca": {"SETS": ["common", "pca", "unlabeled dataset"]} } } From fc4ad2b12ffefebdc3fe3f7103d24fc997cdad0f Mon Sep 17 00:00:00 2001 From: "Kruglov, Oleg" Date: Fri, 27 Sep 2024 04:53:32 -0700 Subject: [PATCH 08/13] Add covariance module to incremental config --- configs/incremental.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/incremental.json b/configs/incremental.json index f09927ee..d36e2a16 100644 --- a/configs/incremental.json +++ b/configs/incremental.json @@ -30,7 +30,7 @@ "algorithm": [ { "estimator": "IncrementalEmpiricalCovariance", - "library": "sklearnex", + "library": "sklearnex.covariance", "estimator_methods": {"training": "partial_fit"}, "num_batches": {"training": 2} } From 040802dc7229b4713b5ccab4de4248505e762b65 Mon Sep 17 00:00:00 2001 From: "Kruglov, Oleg" Date: Fri, 4 Oct 2024 02:49:02 -0700 Subject: [PATCH 09/13] Rename example config --- .../{incremental.json => sklearnex_incremental_example.json} | 0 test-configuration-linux.yml | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename configs/{incremental.json => sklearnex_incremental_example.json} (100%) diff --git a/configs/incremental.json b/configs/sklearnex_incremental_example.json similarity index 100% rename from configs/incremental.json rename to configs/sklearnex_incremental_example.json diff --git a/test-configuration-linux.yml b/test-configuration-linux.yml index 
722d1008..d8c1a64e 100644 --- a/test-configuration-linux.yml +++ b/test-configuration-linux.yml @@ -48,7 +48,7 @@ steps: - script: | source /usr/share/miniconda/etc/profile.d/conda.sh conda activate bench-env - python -m sklbench --report -l DEBUG --report -c configs/incremental.json + python -m sklbench --report -l DEBUG --report -c configs/sklearnex_incremental_example.json displayName: Incremental algorithms example run - script: | source /usr/share/miniconda/etc/profile.d/conda.sh From 69cc4c1754024b2817fe87b3a0d89a926b45658b Mon Sep 17 00:00:00 2001 From: "Kruglov, Oleg" Date: Fri, 4 Oct 2024 03:54:18 -0700 Subject: [PATCH 10/13] Remove bs mentioning in config (need to be added later) --- configs/sklearnex_incremental_example.json | 9 --------- 1 file changed, 9 deletions(-) diff --git a/configs/sklearnex_incremental_example.json b/configs/sklearnex_incremental_example.json index d36e2a16..37b2c7fb 100644 --- a/configs/sklearnex_incremental_example.json +++ b/configs/sklearnex_incremental_example.json @@ -36,15 +36,6 @@ } ] }, - "basic_statistics": { - "algorithm": [ - { - "estimator": "IncrementalBasicStatistics", - "library": "sklearnex", - "num_batches": {"training": 2} - } - ] - }, "linear_regression": { "algorithm": [ { From f275062098635b049f2ff822c524c44f7b62422a Mon Sep 17 00:00:00 2001 From: "Kruglov, Oleg" Date: Fri, 4 Oct 2024 08:36:17 -0700 Subject: [PATCH 11/13] Fix num_batches and batch_size reading from config --- configs/sklearnex_incremental_example.json | 6 +----- sklbench/benchmarks/sklearn_estimator.py | 8 ++++++-- sklbench/report/implementation.py | 2 ++ 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/configs/sklearnex_incremental_example.json b/configs/sklearnex_incremental_example.json index 37b2c7fb..1fbbcafa 100644 --- a/configs/sklearnex_incremental_example.json +++ b/configs/sklearnex_incremental_example.json @@ -56,10 +56,6 @@ } }, "TEMPLATES": { - "covariance": {"SETS": ["common", "covariance", "unlabeled dataset"]}, 
- "linear_regression": { - "SETS": ["common", "linear_regression", "labeled dataset"] - }, - "pca": {"SETS": ["common", "pca", "unlabeled dataset"]} + "covariance": {"SETS": ["common", "covariance", "unlabeled dataset"]} } } diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index 3f8b1641..c4f94c47 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -425,8 +425,12 @@ def measure_sklearn_estimator( data_args = (x_test,) if method == "partial_fit": - num_batches = get_bench_case_value(bench_case, "data:num_batches") - batch_size = get_bench_case_value(bench_case, "data:batch_size") + num_batches = get_bench_case_value( + bench_case, f"algorithm:num_batches:{stage}" + ) + batch_size = get_bench_case_value( + bench_case, f"algorithm:batch_size:{stage}" + ) if batch_size is None: if num_batches is None: diff --git a/sklbench/report/implementation.py b/sklbench/report/implementation.py index df15b5eb..af0398dd 100644 --- a/sklbench/report/implementation.py +++ b/sklbench/report/implementation.py @@ -236,6 +236,7 @@ def get_result_tables_as_df( splitby_columns=["estimator", "method", "function"], compatibility_mode=False, ): + print(results["bench_cases"]) bench_cases = pd.DataFrame( [flatten_dict(bench_case) for bench_case in results["bench_cases"]] ) @@ -244,6 +245,7 @@ def get_result_tables_as_df( if compatibility_mode: bench_cases = transform_results_to_compatible(bench_cases) + print(bench_cases) for column in diffby_columns.copy(): if bench_cases[column].nunique() == 1: bench_cases.drop(columns=[column], inplace=True) From 5a9be80616e5dca5e50bd27145ce11c6316b4c2d Mon Sep 17 00:00:00 2001 From: "Kruglov, Oleg" Date: Fri, 4 Oct 2024 08:41:09 -0700 Subject: [PATCH 12/13] Revert accidentally pushed changes --- configs/sklearnex_incremental_example.json | 6 +++++- sklbench/report/implementation.py | 2 -- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git 
a/configs/sklearnex_incremental_example.json b/configs/sklearnex_incremental_example.json index 1fbbcafa..37b2c7fb 100644 --- a/configs/sklearnex_incremental_example.json +++ b/configs/sklearnex_incremental_example.json @@ -56,6 +56,10 @@ } }, "TEMPLATES": { - "covariance": {"SETS": ["common", "covariance", "unlabeled dataset"]} + "covariance": {"SETS": ["common", "covariance", "unlabeled dataset"]}, + "linear_regression": { + "SETS": ["common", "linear_regression", "labeled dataset"] + }, + "pca": {"SETS": ["common", "pca", "unlabeled dataset"]} } } diff --git a/sklbench/report/implementation.py b/sklbench/report/implementation.py index af0398dd..df15b5eb 100644 --- a/sklbench/report/implementation.py +++ b/sklbench/report/implementation.py @@ -236,7 +236,6 @@ def get_result_tables_as_df( splitby_columns=["estimator", "method", "function"], compatibility_mode=False, ): - print(results["bench_cases"]) bench_cases = pd.DataFrame( [flatten_dict(bench_case) for bench_case in results["bench_cases"]] ) @@ -245,7 +244,6 @@ def get_result_tables_as_df( if compatibility_mode: bench_cases = transform_results_to_compatible(bench_cases) - print(bench_cases) for column in diffby_columns.copy(): if bench_cases[column].nunique() == 1: bench_cases.drop(columns=[column], inplace=True) From 1d48f3a1b35668def560fb05c4b200783102cfda Mon Sep 17 00:00:00 2001 From: Ethan Glaser Date: Mon, 17 Mar 2025 22:30:21 -0700 Subject: [PATCH 13/13] remove batch_size logic from incremental benchmarking for num_batches --- configs/README.md | 1 + sklbench/benchmarks/sklearn_estimator.py | 57 ++++++++++-------------- sklbench/report/implementation.py | 1 + 3 files changed, 26 insertions(+), 33 deletions(-) diff --git a/configs/README.md b/configs/README.md index 8d3c5ac2..07c92dc1 100644 --- a/configs/README.md +++ b/configs/README.md @@ -117,6 +117,7 @@ Configs have the three highest parameter keys: |:---------------|:--------------|:--------|:------------| | `algorithm`:`estimator` | None | | Name 
of measured estimator. | | `algorithm`:`estimator_params` | Empty `dict` | | Parameters for estimator constructor. | +| `algorithm`:`num_batches`:`training` | 5 | | Number of `partial_fit` calls to benchmark. Each call receives the full set of specified samples as one batch (the samples are not divided by `num_batches`). For incremental estimators only. | | `algorithm`:`online_inference_mode` | False | | Enables online mode for inference methods of estimator (separate call for each sample). | | `algorithm`:`sklearn_context` | None | | Parameters for sklearn `config_context` used over estimator. | | `algorithm`:`sklearnex_context` | None | | Parameters for sklearnex `config_context` used over estimator. Updated by `sklearn_context` if set. | diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index c4f94c47..dd0ef1a5 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -334,27 +334,19 @@ def verify_patching(stream: io.StringIO, function_name) -> bool: return acceleration_lines > 0 and fallback_lines == 0 -def create_online_function( - estimator_instance, method_instance, data_args, num_batches, batch_size -): +def create_online_function(estimator_instance, method_instance, data_args, num_batches): if "y" in list(inspect.signature(method_instance).parameters): def ndarray_function(x, y): for i in range(num_batches): - method_instance( - x[i * batch_size : (i + 1) * batch_size], - y[i * batch_size : (i + 1) * batch_size], - ) + method_instance(x, y) if hasattr(estimator_instance, "_onedal_finalize_fit"): estimator_instance._onedal_finalize_fit() def dataframe_function(x, y): for i in range(num_batches): - method_instance( - x.iloc[i * batch_size : (i + 1) * batch_size], - y.iloc[i * batch_size : (i + 1) * batch_size], - ) + method_instance(x, y) if hasattr(estimator_instance, "_onedal_finalize_fit"): estimator_instance._onedal_finalize_fit() @@ -362,13 +354,13 @@ def 
dataframe_function(x, y): def ndarray_function(x): for i in range(num_batches): - method_instance(x[i * batch_size : (i + 1) * batch_size]) + method_instance(x) if hasattr(estimator_instance, "_onedal_finalize_fit"): estimator_instance._onedal_finalize_fit() def dataframe_function(x): for i in range(num_batches): - method_instance(x.iloc[i * batch_size : (i + 1) * batch_size]) + method_instance(x) if hasattr(estimator_instance, "_onedal_finalize_fit"): estimator_instance._onedal_finalize_fit() @@ -423,32 +415,20 @@ def measure_sklearn_estimator( data_args = (x_train,) else: data_args = (x_test,) + batch_size = get_bench_case_value( + bench_case, f"algorithm:batch_size:{stage}" + ) if method == "partial_fit": num_batches = get_bench_case_value( - bench_case, f"algorithm:num_batches:{stage}" - ) - batch_size = get_bench_case_value( - bench_case, f"algorithm:batch_size:{stage}" + bench_case, f"algorithm:num_batches:{stage}", 5 ) - if batch_size is None: - if num_batches is None: - num_batches = 5 - batch_size = ( - data_args[0].shape[0] + num_batches - 1 - ) // num_batches - if num_batches is None: - num_batches = ( - data_args[0].shape[0] + batch_size - 1 - ) // batch_size - method_instance = create_online_function( estimator_instance, method_instance, data_args, - num_batches, - batch_size, + num_batches ) # daal4py model builders enabling branch if enable_modelbuilders and stage == "inference": @@ -465,6 +445,10 @@ def measure_sklearn_estimator( metrics[method]["time std[ms]"], _, ) = measure_case(bench_case, method_instance, *data_args) + if batch_size is not None: + metrics[method]["throughput[samples/ms]"] = ( + (data_args[0].shape[0] // batch_size) * batch_size + ) / metrics[method]["time[ms]"] if ensure_sklearnex_patching: full_method_name = f"{estimator_class.__name__}.{method}" sklearnex_logging_stream.seek(0) @@ -561,9 +545,16 @@ def main(bench_case: BenchCase, filters: List[BenchCase]): for stage in estimator_methods.keys(): data_descs[stage].update( { - 
"batch_size": get_bench_case_value( - bench_case, f"algorithm:batch_size:{stage}" - ) + key: val + for key, val in { + "batch_size": get_bench_case_value( + bench_case, f"algorithm:batch_size:{stage}" + ), + "num_batches": get_bench_case_value( + bench_case, f"algorithm:num_batches:{stage}" + ) + }.items() + if val is not None } ) if "n_classes" in data_description: diff --git a/sklbench/report/implementation.py b/sklbench/report/implementation.py index 8e76479f..689396f1 100644 --- a/sklbench/report/implementation.py +++ b/sklbench/report/implementation.py @@ -94,6 +94,7 @@ "order", "n_classes", "n_clusters", + "num_batches", "batch_size", ]