ENH: Add incremental algorithms support #160

Open · wants to merge 14 commits into base: main
1 change: 1 addition & 0 deletions configs/README.md
@@ -117,6 +117,7 @@ Configs have the three highest parameter keys:
|:---------------|:--------------|:--------|:------------|
| `algorithm`:`estimator` | None | | Name of measured estimator. |
| `algorithm`:`estimator_params` | Empty `dict` | | Parameters for estimator constructor. |
+| `algorithm`:`training`:`num_batches` | 5 | | Number of batches used to benchmark the `partial_fit` method; each batch contains the full number of samples specified, not the sample count divided by `num_batches`. For incremental estimators only. |
Contributor:
1. Why isn't the same applied to inference?
2. Wrong order of keys:

Suggested change
-| `algorithm`:`training`:`num_batches` | 5 | | Number of batches used to benchmark the `partial_fit` method; each batch contains the full number of samples specified, not the sample count divided by `num_batches`. For incremental estimators only. |
+| `algorithm`:`num_batches`:`training` | 5 | | Number of batches used to benchmark the `partial_fit` method; each batch contains the full number of samples specified, not the sample count divided by `num_batches`. For incremental estimators only. |

| `algorithm`:`online_inference_mode` | False | | Enables online mode for inference methods of estimator (separate call for each sample). |
| `algorithm`:`sklearn_context` | None | | Parameters for sklearn `config_context` used over estimator. |
| `algorithm`:`sklearnex_context` | None | | Parameters for sklearnex `config_context` used over estimator. Updated by `sklearn_context` if set. |
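To make the `num_batches` semantics above concrete, here is a minimal standalone sketch (the estimator name comes from the example config below; the data shape is illustrative):

```python
import numpy as np
from sklearnex.covariance import IncrementalEmpiricalCovariance

# With n_samples=1000 and num_batches=2, the benchmark calls partial_fit
# twice, each time on the full 1000-sample array; the data is NOT split
# into chunks of 1000 / num_batches samples.
X = np.random.default_rng(0).standard_normal((1000, 16))
est = IncrementalEmpiricalCovariance()
num_batches = 2  # corresponds to `algorithm:num_batches:training`
for _ in range(num_batches):
    est.partial_fit(X)
```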
65 changes: 65 additions & 0 deletions configs/sklearnex_incremental_example.json
@@ -0,0 +1,65 @@
{
"PARAMETERS_SETS": {
"common": {"bench": {"n_runs": 10, "time_limit": 60}},
"unlabeled dataset": {
"data": [
{
"source": "make_blobs",
"generation_kwargs": {
"centers": 1,
"n_samples": 1000,
"n_features": [16, 64]
},
"split_kwargs": {"ignore": true}
}
]
},
"labeled dataset": {
"data": {
"source": "make_regression",
"split_kwargs": {"train_size": 0.2, "test_size": 0.8},
"generation_kwargs": {
"n_samples": 5000,
"n_features": [40, 100],
"n_informative": 5,
"noise": 2.0
}
}
},
"covariance": {
"algorithm": [
{
"estimator": "IncrementalEmpiricalCovariance",
"library": "sklearnex.covariance",
"estimator_methods": {"training": "partial_fit"},
Contributor:
Is there a reason `estimator_methods` is only specified for one algorithm?

"num_batches": {"training": 2}
}
]
},
"linear_regression": {
"algorithm": [
{
"estimator": "IncrementalLinearRegression",
"library": "sklearnex",
"num_batches": {"training": 2}
}
]
},
"pca": {
"algorithm": [
{
"estimator": "IncrementalPCA",
"library": "sklearnex.preview",
"num_batches": {"training": 2}
}
]
}
},
"TEMPLATES": {
"covariance": {"SETS": ["common", "covariance", "unlabeled dataset"]},
"linear_regression": {
"SETS": ["common", "linear_regression", "labeled dataset"]
},
"pca": {"SETS": ["common", "pca", "unlabeled dataset"]}
}
}
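As a quick sanity check of the example config above, a standalone sketch (assumes it is run from the repository root):

```python
import json

# Load the example config added by this PR and show how each template
# composes its parameter sets.
with open("configs/sklearnex_incremental_example.json") as f:
    config = json.load(f)

for name, template in config["TEMPLATES"].items():
    print(f"{name}: {template['SETS']}")
# covariance: ['common', 'covariance', 'unlabeled dataset']
# linear_regression: ['common', 'linear_regression', 'labeled dataset']
# pca: ['common', 'pca', 'unlabeled dataset']
```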
60 changes: 38 additions & 22 deletions sklbench/benchmarks/sklearn_estimator.py
@@ -74,7 +74,7 @@ def get_estimator(library_name: str, estimator_name: str):
def get_estimator_methods(bench_case: BenchCase) -> Dict[str, List[str]]:
# default estimator methods
estimator_methods = {
"training": ["fit"],
"training": ["partial_fit", "fit"],
Contributor:

I think `partial_fit` should be explicitly requested in the config, since most incremental estimators can work in both modes:

Suggested change
-        "training": ["partial_fit", "fit"],
+        "training": ["fit"],

"inference": ["predict", "predict_proba", "transform"],
}
for stage in estimator_methods.keys():
@@ -334,34 +334,35 @@ def verify_patching(stream: io.StringIO, function_name) -> bool:
return acceleration_lines > 0 and fallback_lines == 0


-def create_online_function(method_instance, data_args, batch_size):
-    n_batches = data_args[0].shape[0] // batch_size
+def create_online_function(estimator_instance, method_instance, data_args, num_batches):

     if "y" in list(inspect.signature(method_instance).parameters):

         def ndarray_function(x, y):
-            for i in range(n_batches):
-                method_instance(
-                    x[i * batch_size : (i + 1) * batch_size],
-                    y[i * batch_size : (i + 1) * batch_size],
-                )
+            for i in range(num_batches):
+                method_instance(x, y)
+            if hasattr(estimator_instance, "_onedal_finalize_fit"):
+                estimator_instance._onedal_finalize_fit()

         def dataframe_function(x, y):
-            for i in range(n_batches):
-                method_instance(
-                    x.iloc[i * batch_size : (i + 1) * batch_size],
-                    y.iloc[i * batch_size : (i + 1) * batch_size],
-                )
+            for i in range(num_batches):
+                method_instance(x, y)
+            if hasattr(estimator_instance, "_onedal_finalize_fit"):
+                estimator_instance._onedal_finalize_fit()

     else:

         def ndarray_function(x):
-            for i in range(n_batches):
-                method_instance(x[i * batch_size : (i + 1) * batch_size])
+            for i in range(num_batches):
+                method_instance(x)
+            if hasattr(estimator_instance, "_onedal_finalize_fit"):
+                estimator_instance._onedal_finalize_fit()

         def dataframe_function(x):
-            for i in range(n_batches):
-                method_instance(x.iloc[i * batch_size : (i + 1) * batch_size])
+            for i in range(num_batches):
+                method_instance(x)
+            if hasattr(estimator_instance, "_onedal_finalize_fit"):
+                estimator_instance._onedal_finalize_fit()
Comment on lines +364 to +365
Contributor:
Is it necessary to call `finalize_fit`? Wouldn't this happen automatically? We specifically have flexible logic here (i.e., the use of the `method_instance` variable), so let's avoid estimator-specific calls if possible.

Author:
`finalize_fit` is only called when a result attribute is accessed; without this explicit call, only `partial_fit` itself would be measured here.
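To illustrate the author's point, a toy sketch of the lazy-finalization pattern (the class below is hypothetical; in sklearnex the deferred work lives behind `_onedal_finalize_fit`):

```python
import numpy as np

class ToyIncrementalMean:
    """Accumulates partial sums cheaply; the expensive reduction
    is deferred until a result attribute is first accessed."""

    def __init__(self):
        self._sums, self._count = [], 0
        self._finalized = False

    def partial_fit(self, x):
        self._sums.append(x.sum(axis=0))  # cheap accumulation only
        self._count += x.shape[0]
        self._finalized = False
        return self

    def _finalize_fit(self):
        # the deferred (and potentially expensive) reduction
        self.mean_ = np.sum(self._sums, axis=0) / self._count
        self._finalized = True

    @property
    def mean(self):
        if not self._finalized:
            self._finalize_fit()
        return self.mean_

# Timing only the partial_fit loop would miss the deferred reduction,
# so the benchmark calls the finalize hook explicitly when present.
est = ToyIncrementalMean()
for _ in range(5):
    est.partial_fit(np.ones((1000, 4)))
if hasattr(est, "_finalize_fit"):
    est._finalize_fit()
```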


if "ndarray" in str(type(data_args[0])):
return ndarray_function
@@ -417,9 +418,17 @@ def measure_sklearn_estimator(
batch_size = get_bench_case_value(
bench_case, f"algorithm:batch_size:{stage}"
)
-            if batch_size is not None:
Contributor:
The batch size setting is required by inference measurements.


+            if method == "partial_fit":
+                num_batches = get_bench_case_value(
+                    bench_case, f"algorithm:num_batches:{stage}", 5
+                )

method_instance = create_online_function(
-                    method_instance, data_args, batch_size
+                    estimator_instance,
+                    method_instance,
+                    data_args,
+                    num_batches
)
# daal4py model builders enabling branch
if enable_modelbuilders and stage == "inference":
@@ -536,9 +545,16 @@ def main(bench_case: BenchCase, filters: List[BenchCase]):
     for stage in estimator_methods.keys():
         data_descs[stage].update(
             {
-                "batch_size": get_bench_case_value(
-                    bench_case, f"algorithm:batch_size:{stage}"
-                )
+                key: val
+                for key, val in {
+                    "batch_size": get_bench_case_value(
+                        bench_case, f"algorithm:batch_size:{stage}"
+                    ),
+                    "num_batches": get_bench_case_value(
+                        bench_case, f"algorithm:num_batches:{stage}"
+                    )
+                }.items()
+                if val is not None
             }
         )
if "n_classes" in data_description:
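The dict comprehension above follows a small filter-out-`None` idiom so that `batch_size` and `num_batches` only enter a stage's data description when actually configured; a standalone sketch of the same idiom (helper name hypothetical):

```python
# get_bench_case_value returns None when a key is not configured, so
# building the candidate dict first and dropping None values keeps the
# data description free of unset parameters.
def drop_none(candidates: dict) -> dict:
    return {key: val for key, val in candidates.items() if val is not None}

desc = drop_none({"batch_size": None, "num_batches": 2})
assert desc == {"num_batches": 2}  # "batch_size" is simply omitted
```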
11 changes: 8 additions & 3 deletions sklbench/report/implementation.py
@@ -16,7 +16,7 @@

import argparse
import json
-from typing import Dict, List
+from typing import Dict, Hashable, List

import openpyxl as xl
import pandas as pd
@@ -94,6 +94,7 @@
"order",
"n_classes",
"n_clusters",
"num_batches",
"batch_size",
]

@@ -239,6 +240,7 @@ def get_result_tables_as_df(
bench_cases = pd.DataFrame(
[flatten_dict(bench_case) for bench_case in results["bench_cases"]]
)
+    bench_cases = bench_cases.map(lambda x: str(x) if not isinstance(x, Hashable) else x)
Contributor:
What is the non-hashable object you are trying to convert?

Author:
The `result_options` parameter of basic statistics is a list.
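A minimal reproduction of the failure mode, assuming a list-valued column like the one described (column names illustrative; `DataFrame.map` requires pandas >= 2.1):

```python
from typing import Hashable

import pandas as pd

df = pd.DataFrame(
    {"estimator": ["BasicStatistics"], "result_options": [["mean", "variance"]]}
)

# Lists are unhashable, so de-duplicating or grouping on this column fails:
# df.drop_duplicates()  # TypeError: unhashable type: 'list'

# Stringifying non-hashable cells first makes the frame safe to process:
df = df.map(lambda x: str(x) if not isinstance(x, Hashable) else x)
print(df.drop_duplicates())
```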


if compatibility_mode:
bench_cases = transform_results_to_compatible(bench_cases)
@@ -248,7 +250,7 @@
bench_cases.drop(columns=[column], inplace=True)
diffby_columns.remove(column)

-    return split_df_by_columns(bench_cases, splitby_columns)
+    return split_df_by_columns(bench_cases, splitby_columns, False)


def get_summary_from_df(df: pd.DataFrame, df_name: str) -> pd.DataFrame:
@@ -258,7 +260,10 @@ def get_summary_from_df(df: pd.DataFrame, df_name: str) -> pd.DataFrame:
# only relative improvements are included in summary currently
if len(column) > 1 and column[1] == f"{metric_name} relative improvement":
metric_columns.append(column)
-    summary = df[metric_columns].aggregate(geomean_wrapper, axis=0).to_frame().T
+    if metric_columns:
+        summary = df[metric_columns].aggregate(geomean_wrapper, axis=0).to_frame().T
+    else:
+        summary = pd.DataFrame()
summary.index = pd.Index([df_name])
return summary

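For context, a sketch of the guarded summary aggregation, assuming `geomean_wrapper` behaves like `scipy.stats.gmean` (the sample data is illustrative):

```python
import pandas as pd
from scipy.stats import gmean

# Report tables use two-level columns: (library, metric description).
df = pd.DataFrame({("sklearnex", "time[ms] relative improvement"): [1.2, 0.9, 1.5]})
metric_columns = [c for c in df.columns if "relative improvement" in c[1]]

# When no relative-improvement columns exist (e.g. a single-library run),
# skip aggregation and fall back to an empty summary frame.
if metric_columns:
    summary = df[metric_columns].aggregate(gmean, axis=0).to_frame().T
else:
    summary = pd.DataFrame()
print(summary)
```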
5 changes: 5 additions & 0 deletions test-configuration-linux.yml
@@ -45,6 +45,11 @@ steps:
conda activate bench-env
python -m sklbench --report -l DEBUG --report -c configs/sklearn_example.json
displayName: Sklearn example run
+- script: |
+    source /usr/share/miniconda/etc/profile.d/conda.sh
+    conda activate bench-env
+    python -m sklbench --report -l DEBUG --report -c configs/sklearnex_incremental_example.json
+  displayName: Incremental algorithms example run
- script: |
source /usr/share/miniconda/etc/profile.d/conda.sh
conda activate bench-env
4 changes: 4 additions & 0 deletions test-configuration-win.yml
@@ -43,6 +43,10 @@ steps:
call activate bench-env
python -m sklbench --report -l DEBUG --report -c configs/sklearn_example.json
displayName: Sklearn example run
+- script: |
+    call activate bench-env
+    python -m sklbench --report -l DEBUG --report -c configs/sklearnex_incremental_example.json
+  displayName: Incremental algorithms example run
- script: |
call activate bench-env
python -m sklbench --report -l DEBUG --report -c configs/xgboost_example.json