ENH: Add incremental algorithms support #160

Open
wants to merge 14 commits into base: main
96 changes: 96 additions & 0 deletions configs/incremental.json
@@ -0,0 +1,96 @@
{
"PARAMETERS_SETS": {
"common": {"bench": {"n_runs": 10, "time_limit": 60}},
"covariance data": {
"data": [
{
"source": "make_blobs",
"generation_kwargs": {
"centers": 1,
"n_samples": 1000,
"n_features": [16, 64]
},
"split_kwargs": {"ignore": true}
}
]
},
"basic_statistics data": {
"data": {
"source": "make_blobs",
"generation_kwargs": {
"centers": 1,
"n_samples": 10000,
"n_features": [16, 64]
},
"split_kwargs": {"ignore": true}
}
},
"linear_regression data": {
"data": {
"source": "make_regression",
"split_kwargs": {"train_size": 0.2, "test_size": 0.8},
"generation_kwargs": {
"n_samples": 5000,
"n_features": [40, 100],
"n_informative": 5,
"noise": 2.0
}
}
},
"pca data": {
"data": {
"source": "make_blobs",
"generation_kwargs": {
"centers": 1,
"n_samples": 1000,
"n_features": [16, 64]
},
"split_kwargs": {"ignore": true}
}
},
"covariance": {
"algorithm": [
{
"estimator": "IncrementalEmpiricalCovariance",
"library": "sklearnex.covariance",
"estimator_methods": {"training": "partial_fit"},
"num_batches": {"training": 2}
}
]
},
"basic_statistics": {
"algorithm": [
{
"estimator": "IncrementalBasicStatistics",
"library": "sklearnex.basic_statistics",
"num_batches": {"training": 2}
}
]
},
"linear_regression": {
"algorithm": [
{
"estimator": "IncrementalLinearRegression",
"library": "sklearnex.linear_model",
"num_batches": {"training": 2}
}
]
},
"pca": {
"algorithm": [
{
"estimator": "IncrementalPCA",
"library": "sklearnex.preview.decomposition",
"num_batches": {"training": 2}
}
]
}
},
"TEMPLATES": {
"covariance": {"SETS": ["common", "covariance", "covariance data"]},
"linear_regression": {
"SETS": ["common", "linear_regression", "linear_regression data"]
},
"pca": {"SETS": ["common", "pca", "pca data"]}
}
}
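
For orientation, one of these benchmark cases boils down to roughly the following (a hand-written sketch of the covariance template, not the actual sklbench driver; the data-generation values are taken from the config above and the timing/reporting machinery is omitted):

```python
# Sketch: two partial_fit calls of sklearnex's IncrementalEmpiricalCovariance
# on make_blobs data, as described by the "covariance" template.
from sklearn.datasets import make_blobs
from sklearnex.covariance import IncrementalEmpiricalCovariance

X, _ = make_blobs(centers=1, n_samples=1000, n_features=16, random_state=0)

estimator = IncrementalEmpiricalCovariance()
num_batches = 2
batch_size = X.shape[0] // num_batches
for i in range(num_batches):
    estimator.partial_fit(X[i * batch_size : (i + 1) * batch_size])
```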
53 changes: 37 additions & 16 deletions sklbench/benchmarks/sklearn_estimator.py
@@ -74,7 +74,7 @@ def get_estimator(library_name: str, estimator_name: str):
def get_estimator_methods(bench_case: BenchCase) -> Dict[str, List[str]]:
# default estimator methods
estimator_methods = {
"training": ["fit"],
"training": ["partial_fit", "fit"],
Contributor:
I think partial_fit should be explicitly requested in the config, since most of the incremental estimators can work in both modes:

Suggested change:
-    "training": ["partial_fit", "fit"],
+    "training": ["fit"],

"inference": ["predict", "predict_proba", "transform"],
}
for stage in estimator_methods.keys():
@@ -334,34 +334,43 @@ def verify_patching(stream: io.StringIO, function_name) -> bool:
return acceleration_lines > 0 and fallback_lines == 0


def create_online_function(method_instance, data_args, batch_size):
n_batches = data_args[0].shape[0] // batch_size
def create_online_function(
estimator_instance, method_instance, data_args, num_batches, batch_size
):

if "y" in list(inspect.signature(method_instance).parameters):

def ndarray_function(x, y):
for i in range(n_batches):
for i in range(num_batches):
Contributor:
Leave old simple logic with batch_size only.

Author:
why?

Contributor:
Why change? It overcomplicates data slicing with extra parameter checks and calculations, also, it is more common to know batch size before partial_fit call in real world cases.

Author:
> Why change?

Adding a new feature which can be useful.

> It overcomplicates data slicing with extra parameter checks and calculations

It costs nothing, and doing the calculations in the code is better than doing them on a calculator before running the benchmarks.

> it is more common to know batch size before partial_fit call in real world cases.

But when benchmarking it is at least as common (I'd say more so) for the user to want to specify the exact number of partial_fit calls.

Contributor:
I don't think it makes sense to have both, since one would depend on the other

Contributor:
Upon further investigation and using this branch, I think the setup here makes sense

Contributor:
It just allows user to specify either num_batches or batch_size in the config. Original usage is unimpacted. I have no objections to this

method_instance(
x[i * batch_size : (i + 1) * batch_size],
y[i * batch_size : (i + 1) * batch_size],
)
if hasattr(estimator_instance, "_onedal_finalize_fit"):
estimator_instance._onedal_finalize_fit()

def dataframe_function(x, y):
for i in range(n_batches):
for i in range(num_batches):
method_instance(
x.iloc[i * batch_size : (i + 1) * batch_size],
y.iloc[i * batch_size : (i + 1) * batch_size],
)
if hasattr(estimator_instance, "_onedal_finalize_fit"):
estimator_instance._onedal_finalize_fit()

else:

def ndarray_function(x):
for i in range(n_batches):
for i in range(num_batches):
method_instance(x[i * batch_size : (i + 1) * batch_size])
if hasattr(estimator_instance, "_onedal_finalize_fit"):
estimator_instance._onedal_finalize_fit()

def dataframe_function(x):
for i in range(n_batches):
for i in range(num_batches):
method_instance(x.iloc[i * batch_size : (i + 1) * batch_size])
if hasattr(estimator_instance, "_onedal_finalize_fit"):
estimator_instance._onedal_finalize_fit()
Comment on lines +364 to +365
Contributor:
Is it necessary to call finalize_fit? Wouldn't this happen automatically? We specifically have flexible logic here (i.e., the use of the method_instance variable), so let's avoid specific calls if possible.

Author:
finalize_fit is only triggered when a result attribute is accessed; without this explicit call, only partial_fit would be measured here.


if "ndarray" in str(type(data_args[0])):
return ndarray_function
@@ -414,12 +423,28 @@ def measure_sklearn_estimator(
data_args = (x_train,)
else:
data_args = (x_test,)
batch_size = get_bench_case_value(
bench_case, f"algorithm:batch_size:{stage}"
)
if batch_size is not None:
Contributor:
Batch size setting is required by inference measurements.


if method == "partial_fit":
num_batches = get_bench_case_value(bench_case, "data:num_batches")
batch_size = get_bench_case_value(bench_case, "data:batch_size")
Contributor:
Instead of a separate branch for partial_fit, extend the online_inference_mode mechanism to partial fitting too.

Author:
Could you provide a link to the implementation of this mechanism? I was not able to find any usage of this parameter, only its setting in the config.

Contributor (@Alexsandruss, Oct 2, 2024):
Actually, online_inference_mode was removed as unnecessary before merge of refactor branch. This mode is enabled by batch_size != None only.

Contributor:
You can split batch size into two for training and inference.

Author (@olegkkruglov, Oct 4, 2024):
> Actually, online_inference_mode was removed as unnecessary before merge of refactor branch.

What should I extend then?


if batch_size is None:
if num_batches is None:
num_batches = 5
batch_size = (
data_args[0].shape[0] + num_batches - 1
) // num_batches
if num_batches is None:
num_batches = (
data_args[0].shape[0] + batch_size - 1
) // batch_size

method_instance = create_online_function(
method_instance, data_args, batch_size
estimator_instance,
method_instance,
data_args,
num_batches,
batch_size,
)
# daal4py model builders enabling branch
if enable_modelbuilders and stage == "inference":
@@ -436,10 +461,6 @@
metrics[method]["time std[ms]"],
_,
) = measure_case(bench_case, method_instance, *data_args)
if batch_size is not None:
metrics[method]["throughput[samples/ms]"] = (
(data_args[0].shape[0] // batch_size) * batch_size
) / metrics[method]["time[ms]"]
if ensure_sklearnex_patching:
full_method_name = f"{estimator_class.__name__}.{method}"
sklearnex_logging_stream.seek(0)
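The batching behaviour discussed in the review threads above can be summarised in a small standalone sketch (hypothetical helper names; it mirrors the ceiling-division defaults and per-batch slicing from the diff, but is not the sklbench code itself):

```python
# num_batches / batch_size resolution as done in measure_sklearn_estimator:
# whichever value is missing is derived from the other (num_batches defaults to 5).
def resolve_batching(n_samples, num_batches=None, batch_size=None):
    if batch_size is None:
        if num_batches is None:
            num_batches = 5
        batch_size = (n_samples + num_batches - 1) // num_batches  # ceiling division
    if num_batches is None:
        num_batches = (n_samples + batch_size - 1) // batch_size
    return num_batches, batch_size

# With the 10000-sample basic_statistics data and "num_batches": {"training": 2}:
assert resolve_batching(10000, num_batches=2) == (2, 5000)
# Or, as discussed above, specifying batch_size instead:
assert resolve_batching(10000, batch_size=3000) == (4, 3000)

# The online function built from these values then slices the data per batch
# and finalizes deferred computations if the estimator exposes the hook:
def online_fit(estimator, X, num_batches, batch_size):
    for i in range(num_batches):
        estimator.partial_fit(X[i * batch_size : (i + 1) * batch_size])
    if hasattr(estimator, "_onedal_finalize_fit"):
        estimator._onedal_finalize_fit()
```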
10 changes: 7 additions & 3 deletions sklbench/report/implementation.py
@@ -16,7 +16,7 @@

import argparse
import json
from typing import Dict, List
from typing import Dict, Hashable, List

import openpyxl as xl
import pandas as pd
@@ -239,6 +239,7 @@ def get_result_tables_as_df(
bench_cases = pd.DataFrame(
[flatten_dict(bench_case) for bench_case in results["bench_cases"]]
)
bench_cases = bench_cases.map(lambda x: str(x) if not isinstance(x, Hashable) else x)
Contributor:
What is the non-hashable object you are trying to convert?

Author:
The basic_statistics result_options parameter is a list.


if compatibility_mode:
bench_cases = transform_results_to_compatible(bench_cases)
@@ -248,7 +249,7 @@
bench_cases.drop(columns=[column], inplace=True)
diffby_columns.remove(column)

return split_df_by_columns(bench_cases, splitby_columns)
return split_df_by_columns(bench_cases, splitby_columns, False)


def get_summary_from_df(df: pd.DataFrame, df_name: str) -> pd.DataFrame:
@@ -258,7 +259,10 @@ def get_summary_from_df(df: pd.DataFrame, df_name: str) -> pd.DataFrame:
# only relative improvements are included in summary currently
if len(column) > 1 and column[1] == f"{metric_name} relative improvement":
metric_columns.append(column)
summary = df[metric_columns].aggregate(geomean_wrapper, axis=0).to_frame().T
if metric_columns:
summary = df[metric_columns].aggregate(geomean_wrapper, axis=0).to_frame().T
else:
summary = pd.DataFrame()
summary.index = pd.Index([df_name])
return summary

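The Hashable conversion above is needed because list-valued result cells break hashing-based pandas operations later in the reporting pipeline. A minimal illustration (assuming pandas >= 2.1 for DataFrame.map; the column values are hypothetical):

```python
# List cells (e.g. a result_options list) are unhashable, so operations such as
# drop_duplicates() raise TypeError until the lists are stringified.
from typing import Hashable
import pandas as pd

df = pd.DataFrame({
    "estimator": ["IncrementalBasicStatistics"] * 2,
    "result_options": [["mean", "variance"]] * 2,
})
# df.drop_duplicates()  # TypeError: unhashable type: 'list'

df = df.map(lambda x: str(x) if not isinstance(x, Hashable) else x)
print(df.drop_duplicates())  # now collapses to a single row
```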
5 changes: 5 additions & 0 deletions test-configuration-linux.yml
@@ -45,6 +45,11 @@ steps:
conda activate bench-env
python -m sklbench --report -l DEBUG --report -c configs/sklearn_example.json
displayName: Sklearn example run
- script: |
source /usr/share/miniconda/etc/profile.d/conda.sh
conda activate bench-env
python -m sklbench --report -l DEBUG --report -c configs/incremental.json
displayName: Incremental algorithms example run
- script: |
source /usr/share/miniconda/etc/profile.d/conda.sh
conda activate bench-env
4 changes: 4 additions & 0 deletions test-configuration-win.yml
@@ -43,6 +43,10 @@ steps:
call activate bench-env
python -m sklbench --report -l DEBUG --report -c configs/sklearn_example.json
displayName: Sklearn example run
- script: |
call activate bench-env
python -m sklbench --report -l DEBUG --report -c configs/incremental.json
displayName: Incremental algorithms example run
- script: |
call activate bench-env
python -m sklbench --report -l DEBUG --report -c configs/xgboost_example.json