diff --git a/configs/README.md b/configs/README.md index 8d3c5ac2..07c92dc1 100644 --- a/configs/README.md +++ b/configs/README.md @@ -117,6 +117,7 @@ Configs have the three highest parameter keys: |:---------------|:--------------|:--------|:------------| | `algorithm`:`estimator` | None | | Name of measured estimator. | | `algorithm`:`estimator_params` | Empty `dict` | | Parameters for estimator constructor. | +| `algorithm`:`num_batches`:`training` | 5 | | Number of batches used to benchmark the `partial_fit` function. Each batch contains the full number of samples specified (the samples are not divided by `num_batches`). For incremental estimators only. | | `algorithm`:`online_inference_mode` | False | | Enables online mode for inference methods of estimator (separate call for each sample). | | `algorithm`:`sklearn_context` | None | | Parameters for sklearn `config_context` used over estimator. | | `algorithm`:`sklearnex_context` | None | | Parameters for sklearnex `config_context` used over estimator. Updated by `sklearn_context` if set. 
| diff --git a/configs/sklearnex_incremental_example.json b/configs/sklearnex_incremental_example.json new file mode 100644 index 00000000..37b2c7fb --- /dev/null +++ b/configs/sklearnex_incremental_example.json @@ -0,0 +1,65 @@ +{ + "PARAMETERS_SETS": { + "common": {"bench": {"n_runs": 10, "time_limit": 60}}, + "unlabeled dataset": { + "data": [ + { + "source": "make_blobs", + "generation_kwargs": { + "centers": 1, + "n_samples": 1000, + "n_features": [16, 64] + }, + "split_kwargs": {"ignore": true} + } + ] + }, + "labeled dataset": { + "data": { + "source": "make_regression", + "split_kwargs": {"train_size": 0.2, "test_size": 0.8}, + "generation_kwargs": { + "n_samples": 5000, + "n_features": [40, 100], + "n_informative": 5, + "noise": 2.0 + } + } + }, + "covariance": { + "algorithm": [ + { + "estimator": "IncrementalEmpiricalCovariance", + "library": "sklearnex.covariance", + "estimator_methods": {"training": "partial_fit"}, + "num_batches": {"training": 2} + } + ] + }, + "linear_regression": { + "algorithm": [ + { + "estimator": "IncrementalLinearRegression", + "library": "sklearnex", + "num_batches": {"training": 2} + } + ] + }, + "pca": { + "algorithm": [ + { + "estimator": "IncrementalPCA", + "library": "sklearnex.preview", + "num_batches": {"training": 2} + } + ] + } + }, + "TEMPLATES": { + "covariance": {"SETS": ["common", "covariance", "unlabeled dataset"]}, + "linear_regression": { + "SETS": ["common", "linear_regression", "labeled dataset"] + }, + "pca": {"SETS": ["common", "pca", "unlabeled dataset"]} + } +} diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index f9c0a75e..dd0ef1a5 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -74,7 +74,7 @@ def get_estimator(library_name: str, estimator_name: str): def get_estimator_methods(bench_case: BenchCase) -> Dict[str, List[str]]: # default estimator methods estimator_methods = { - "training": ["fit"], + 
"training": ["partial_fit", "fit"], "inference": ["predict", "predict_proba", "transform"], } for stage in estimator_methods.keys(): @@ -334,34 +334,35 @@ def verify_patching(stream: io.StringIO, function_name) -> bool: return acceleration_lines > 0 and fallback_lines == 0 -def create_online_function(method_instance, data_args, batch_size): - n_batches = data_args[0].shape[0] // batch_size +def create_online_function(estimator_instance, method_instance, data_args, num_batches): if "y" in list(inspect.signature(method_instance).parameters): def ndarray_function(x, y): - for i in range(n_batches): - method_instance( - x[i * batch_size : (i + 1) * batch_size], - y[i * batch_size : (i + 1) * batch_size], - ) + for i in range(num_batches): + method_instance(x, y) + if hasattr(estimator_instance, "_onedal_finalize_fit"): + estimator_instance._onedal_finalize_fit() def dataframe_function(x, y): - for i in range(n_batches): - method_instance( - x.iloc[i * batch_size : (i + 1) * batch_size], - y.iloc[i * batch_size : (i + 1) * batch_size], - ) + for i in range(num_batches): + method_instance(x, y) + if hasattr(estimator_instance, "_onedal_finalize_fit"): + estimator_instance._onedal_finalize_fit() else: def ndarray_function(x): - for i in range(n_batches): - method_instance(x[i * batch_size : (i + 1) * batch_size]) + for i in range(num_batches): + method_instance(x) + if hasattr(estimator_instance, "_onedal_finalize_fit"): + estimator_instance._onedal_finalize_fit() def dataframe_function(x): - for i in range(n_batches): - method_instance(x.iloc[i * batch_size : (i + 1) * batch_size]) + for i in range(num_batches): + method_instance(x) + if hasattr(estimator_instance, "_onedal_finalize_fit"): + estimator_instance._onedal_finalize_fit() if "ndarray" in str(type(data_args[0])): return ndarray_function @@ -417,9 +418,17 @@ def measure_sklearn_estimator( batch_size = get_bench_case_value( bench_case, f"algorithm:batch_size:{stage}" ) - if batch_size is not None: + + if method 
== "partial_fit": + num_batches = get_bench_case_value( + bench_case, f"algorithm:num_batches:{stage}", 5 + ) + method_instance = create_online_function( - method_instance, data_args, batch_size + estimator_instance, + method_instance, + data_args, + num_batches ) # daal4py model builders enabling branch if enable_modelbuilders and stage == "inference": @@ -536,9 +545,16 @@ def main(bench_case: BenchCase, filters: List[BenchCase]): for stage in estimator_methods.keys(): data_descs[stage].update( { - "batch_size": get_bench_case_value( - bench_case, f"algorithm:batch_size:{stage}" - ) + key: val + for key, val in { + "batch_size": get_bench_case_value( + bench_case, f"algorithm:batch_size:{stage}" + ), + "num_batches": get_bench_case_value( + bench_case, f"algorithm:num_batches:{stage}" + ) + }.items() + if val is not None } ) if "n_classes" in data_description: diff --git a/sklbench/report/implementation.py b/sklbench/report/implementation.py index 28fa2bb0..689396f1 100644 --- a/sklbench/report/implementation.py +++ b/sklbench/report/implementation.py @@ -16,7 +16,7 @@ import argparse import json -from typing import Dict, List +from typing import Dict, Hashable, List import openpyxl as xl import pandas as pd @@ -94,6 +94,7 @@ "order", "n_classes", "n_clusters", + "num_batches", "batch_size", ] @@ -239,6 +240,7 @@ def get_result_tables_as_df( bench_cases = pd.DataFrame( [flatten_dict(bench_case) for bench_case in results["bench_cases"]] ) + bench_cases = bench_cases.map(lambda x: str(x) if not isinstance(x, Hashable) else x) if compatibility_mode: bench_cases = transform_results_to_compatible(bench_cases) @@ -248,7 +250,7 @@ def get_result_tables_as_df( bench_cases.drop(columns=[column], inplace=True) diffby_columns.remove(column) - return split_df_by_columns(bench_cases, splitby_columns) + return split_df_by_columns(bench_cases, splitby_columns, False) def get_summary_from_df(df: pd.DataFrame, df_name: str) -> pd.DataFrame: @@ -258,7 +260,10 @@ def 
get_summary_from_df(df: pd.DataFrame, df_name: str) -> pd.DataFrame: # only relative improvements are included in summary currently if len(column) > 1 and column[1] == f"{metric_name} relative improvement": metric_columns.append(column) - summary = df[metric_columns].aggregate(geomean_wrapper, axis=0).to_frame().T + if metric_columns: + summary = df[metric_columns].aggregate(geomean_wrapper, axis=0).to_frame().T + else: + summary = pd.DataFrame() summary.index = pd.Index([df_name]) return summary diff --git a/test-configuration-linux.yml b/test-configuration-linux.yml index a37769ce..d8c1a64e 100644 --- a/test-configuration-linux.yml +++ b/test-configuration-linux.yml @@ -45,6 +45,11 @@ steps: conda activate bench-env python -m sklbench --report -l DEBUG --report -c configs/sklearn_example.json displayName: Sklearn example run + - script: | + source /usr/share/miniconda/etc/profile.d/conda.sh + conda activate bench-env + python -m sklbench --report -l DEBUG --report -c configs/sklearnex_incremental_example.json + displayName: Incremental algorithms example run - script: | source /usr/share/miniconda/etc/profile.d/conda.sh conda activate bench-env diff --git a/test-configuration-win.yml b/test-configuration-win.yml index a1eddaeb..f3ac1595 100644 --- a/test-configuration-win.yml +++ b/test-configuration-win.yml @@ -43,6 +43,10 @@ steps: call activate bench-env python -m sklbench --report -l DEBUG --report -c configs/sklearn_example.json displayName: Sklearn example run + - script: | + call activate bench-env + python -m sklbench --report -l DEBUG --report -c configs/sklearnex_incremental_example.json + displayName: Incremental algorithms example run - script: | call activate bench-env python -m sklbench --report -l DEBUG --report -c configs/xgboost_example.json