-
Notifications
You must be signed in to change notification settings - Fork 73
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ENH: Add incremental algorithms support #160
base: main
Are you sure you want to change the base?
Changes from all commits
535c1e4
d6952ac
03a152a
3ac5c23
9461fad
b82d772
b5ad233
fc4ad2b
040802d
69cc4c1
f275062
5a9be80
66d977d
1d48f3a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
{ | ||
"PARAMETERS_SETS": { | ||
"common": {"bench": {"n_runs": 10, "time_limit": 60}}, | ||
"unlabeled dataset": { | ||
"data": [ | ||
{ | ||
"source": "make_blobs", | ||
"generation_kwargs": { | ||
"centers": 1, | ||
"n_samples": 1000, | ||
"n_features": [16, 64] | ||
}, | ||
"split_kwargs": {"ignore": true} | ||
} | ||
] | ||
}, | ||
"labeled dataset": { | ||
"data": { | ||
"source": "make_regression", | ||
"split_kwargs": {"train_size": 0.2, "test_size": 0.8}, | ||
"generation_kwargs": { | ||
"n_samples": 5000, | ||
"n_features": [40, 100], | ||
"n_informative": 5, | ||
"noise": 2.0 | ||
} | ||
} | ||
}, | ||
"covariance": { | ||
"algorithm": [ | ||
{ | ||
"estimator": "IncrementalEmpiricalCovariance", | ||
"library": "sklearnex.covariance", | ||
"estimator_methods": {"training": "partial_fit"}, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is there a reason `estimator_methods` is only specified for one algorithm? |
||
"num_batches": {"training": 2} | ||
} | ||
] | ||
}, | ||
"linear_regression": { | ||
"algorithm": [ | ||
{ | ||
"estimator": "IncrementalLinearRegression", | ||
"library": "sklearnex", | ||
"num_batches": {"training": 2} | ||
} | ||
] | ||
}, | ||
"pca": { | ||
"algorithm": [ | ||
{ | ||
"estimator": "IncrementalPCA", | ||
"library": "sklearnex.preview", | ||
"num_batches": {"training": 2} | ||
} | ||
] | ||
} | ||
}, | ||
"TEMPLATES": { | ||
"covariance": {"SETS": ["common", "covariance", "unlabeled dataset"]}, | ||
"linear_regression": { | ||
"SETS": ["common", "linear_regression", "labeled dataset"] | ||
}, | ||
"pca": {"SETS": ["common", "pca", "unlabeled dataset"]} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -74,7 +74,7 @@ def get_estimator(library_name: str, estimator_name: str): | |||||
def get_estimator_methods(bench_case: BenchCase) -> Dict[str, List[str]]: | ||||||
# default estimator methods | ||||||
estimator_methods = { | ||||||
"training": ["fit"], | ||||||
"training": ["partial_fit", "fit"], | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think
Suggested change
|
||||||
"inference": ["predict", "predict_proba", "transform"], | ||||||
} | ||||||
for stage in estimator_methods.keys(): | ||||||
|
@@ -334,34 +334,35 @@ def verify_patching(stream: io.StringIO, function_name) -> bool: | |||||
return acceleration_lines > 0 and fallback_lines == 0 | ||||||
|
||||||
|
||||||
def create_online_function(method_instance, data_args, batch_size): | ||||||
n_batches = data_args[0].shape[0] // batch_size | ||||||
def create_online_function(estimator_instance, method_instance, data_args, num_batches): | ||||||
|
||||||
if "y" in list(inspect.signature(method_instance).parameters): | ||||||
|
||||||
def ndarray_function(x, y): | ||||||
for i in range(n_batches): | ||||||
method_instance( | ||||||
x[i * batch_size : (i + 1) * batch_size], | ||||||
y[i * batch_size : (i + 1) * batch_size], | ||||||
) | ||||||
for i in range(num_batches): | ||||||
method_instance(x, y) | ||||||
if hasattr(estimator_instance, "_onedal_finalize_fit"): | ||||||
estimator_instance._onedal_finalize_fit() | ||||||
|
||||||
def dataframe_function(x, y): | ||||||
for i in range(n_batches): | ||||||
method_instance( | ||||||
x.iloc[i * batch_size : (i + 1) * batch_size], | ||||||
y.iloc[i * batch_size : (i + 1) * batch_size], | ||||||
) | ||||||
for i in range(num_batches): | ||||||
method_instance(x, y) | ||||||
if hasattr(estimator_instance, "_onedal_finalize_fit"): | ||||||
estimator_instance._onedal_finalize_fit() | ||||||
|
||||||
else: | ||||||
|
||||||
def ndarray_function(x): | ||||||
for i in range(n_batches): | ||||||
method_instance(x[i * batch_size : (i + 1) * batch_size]) | ||||||
for i in range(num_batches): | ||||||
method_instance(x) | ||||||
if hasattr(estimator_instance, "_onedal_finalize_fit"): | ||||||
estimator_instance._onedal_finalize_fit() | ||||||
|
||||||
def dataframe_function(x): | ||||||
for i in range(n_batches): | ||||||
method_instance(x.iloc[i * batch_size : (i + 1) * batch_size]) | ||||||
for i in range(num_batches): | ||||||
method_instance(x) | ||||||
if hasattr(estimator_instance, "_onedal_finalize_fit"): | ||||||
estimator_instance._onedal_finalize_fit() | ||||||
Comment on lines
+364
to
+365
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is it necessary to call `finalize_fit`? Wouldn't this happen automatically? We specifically have flexible logic here (i.e. use of the `method_instance` variable), so let's avoid specific calls if possible. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. `finalize_fit` is called only if a result attribute has been accessed. Without this call, only `partial_fit` would be measured here. |
||||||
|
||||||
if "ndarray" in str(type(data_args[0])): | ||||||
return ndarray_function | ||||||
|
@@ -417,9 +418,17 @@ def measure_sklearn_estimator( | |||||
batch_size = get_bench_case_value( | ||||||
bench_case, f"algorithm:batch_size:{stage}" | ||||||
) | ||||||
if batch_size is not None: | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The batch size setting is required by inference measurements. |
||||||
|
||||||
if method == "partial_fit": | ||||||
num_batches = get_bench_case_value( | ||||||
bench_case, f"algorithm:num_batches:{stage}", 5 | ||||||
) | ||||||
|
||||||
method_instance = create_online_function( | ||||||
method_instance, data_args, batch_size | ||||||
estimator_instance, | ||||||
method_instance, | ||||||
data_args, | ||||||
num_batches | ||||||
) | ||||||
# daal4py model builders enabling branch | ||||||
if enable_modelbuilders and stage == "inference": | ||||||
|
@@ -536,9 +545,16 @@ def main(bench_case: BenchCase, filters: List[BenchCase]): | |||||
for stage in estimator_methods.keys(): | ||||||
data_descs[stage].update( | ||||||
{ | ||||||
"batch_size": get_bench_case_value( | ||||||
bench_case, f"algorithm:batch_size:{stage}" | ||||||
) | ||||||
key: val | ||||||
for key, val in { | ||||||
"batch_size": get_bench_case_value( | ||||||
bench_case, f"algorithm:batch_size:{stage}" | ||||||
), | ||||||
"num_batches": get_bench_case_value( | ||||||
bench_case, f"algorithm:num_batches:{stage}" | ||||||
) | ||||||
}.items() | ||||||
if val is not None | ||||||
} | ||||||
) | ||||||
if "n_classes" in data_description: | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,7 +16,7 @@ | |
|
||
import argparse | ||
import json | ||
from typing import Dict, List | ||
from typing import Dict, Hashable, List | ||
|
||
import openpyxl as xl | ||
import pandas as pd | ||
|
@@ -94,6 +94,7 @@ | |
"order", | ||
"n_classes", | ||
"n_clusters", | ||
"num_batches", | ||
"batch_size", | ||
] | ||
|
||
|
@@ -239,6 +240,7 @@ def get_result_tables_as_df( | |
bench_cases = pd.DataFrame( | ||
[flatten_dict(bench_case) for bench_case in results["bench_cases"]] | ||
) | ||
bench_cases = bench_cases.map(lambda x: str(x) if not isinstance(x, Hashable) else x) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What is the non-hashable object you are trying to convert? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The basic statistics `result_options` parameter is a list. |
||
|
||
if compatibility_mode: | ||
bench_cases = transform_results_to_compatible(bench_cases) | ||
|
@@ -248,7 +250,7 @@ def get_result_tables_as_df( | |
bench_cases.drop(columns=[column], inplace=True) | ||
diffby_columns.remove(column) | ||
|
||
return split_df_by_columns(bench_cases, splitby_columns) | ||
return split_df_by_columns(bench_cases, splitby_columns, False) | ||
|
||
|
||
def get_summary_from_df(df: pd.DataFrame, df_name: str) -> pd.DataFrame: | ||
|
@@ -258,7 +260,10 @@ def get_summary_from_df(df: pd.DataFrame, df_name: str) -> pd.DataFrame: | |
# only relative improvements are included in summary currently | ||
if len(column) > 1 and column[1] == f"{metric_name} relative improvement": | ||
metric_columns.append(column) | ||
summary = df[metric_columns].aggregate(geomean_wrapper, axis=0).to_frame().T | ||
if metric_columns: | ||
summary = df[metric_columns].aggregate(geomean_wrapper, axis=0).to_frame().T | ||
else: | ||
summary = pd.DataFrame() | ||
summary.index = pd.Index([df_name]) | ||
return summary | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.