From b3500dd94fe02bbcb79af06b77655421edf5026a Mon Sep 17 00:00:00 2001 From: ethanglaser Date: Fri, 30 Aug 2024 23:27:21 +0000 Subject: [PATCH 001/110] Creating branch for large scale measurements --- configs/spmd/large_scale/basic_stats.json | 30 +++++++++++++++ configs/spmd/large_scale/covariance.json | 30 +++++++++++++++ configs/spmd/large_scale/dbscan.json | 32 ++++++++++++++++ configs/spmd/large_scale/kmeans.json | 32 ++++++++++++++++ configs/spmd/large_scale/knn.json | 43 ++++++++++++++++++++++ configs/spmd/large_scale/large_scale.json | 31 ++++++++++++++++ configs/spmd/large_scale/linear_model.json | 27 ++++++++++++++ configs/spmd/large_scale/logreg.json | 29 +++++++++++++++ configs/spmd/large_scale/pca.json | 30 +++++++++++++++ sklbench/benchmarks/sklearn_estimator.py | 8 ++-- sklbench/datasets/transformer.py | 9 +++-- sklbench/runner/commands_helper.py | 3 ++ 12 files changed, 298 insertions(+), 6 deletions(-) create mode 100644 configs/spmd/large_scale/basic_stats.json create mode 100644 configs/spmd/large_scale/covariance.json create mode 100644 configs/spmd/large_scale/dbscan.json create mode 100644 configs/spmd/large_scale/kmeans.json create mode 100644 configs/spmd/large_scale/knn.json create mode 100644 configs/spmd/large_scale/large_scale.json create mode 100644 configs/spmd/large_scale/linear_model.json create mode 100644 configs/spmd/large_scale/logreg.json create mode 100644 configs/spmd/large_scale/pca.json diff --git a/configs/spmd/large_scale/basic_stats.json b/configs/spmd/large_scale/basic_stats.json new file mode 100644 index 00000000..a9542017 --- /dev/null +++ b/configs/spmd/large_scale/basic_stats.json @@ -0,0 +1,30 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd basicstats parameters": { + "algorithm": { + "estimator": "BasicStatistics", + "estimator_methods": { "training": "compute" } + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "basicstats": { + "SETS": [ + "sklearnex spmd implementation", + "large scale default parameters", + "synthetic data", + "spmd basicstats parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/covariance.json b/configs/spmd/large_scale/covariance.json new file mode 100644 index 00000000..3280bf77 --- /dev/null +++ b/configs/spmd/large_scale/covariance.json @@ -0,0 +1,30 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd basicstats parameters": { + "algorithm": { + "estimator": "EmpiricalCovariance", + "estimator_methods": { "training": "fit" } + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "covariance": { + "SETS": [ + "sklearnex spmd implementation", + "large scale default parameters", + "synthetic data", + "spmd basicstats parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/dbscan.json b/configs/spmd/large_scale/dbscan.json new file mode 
100644 index 00000000..c46287d8 --- /dev/null +++ b/configs/spmd/large_scale/dbscan.json @@ -0,0 +1,32 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/dbscan.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd dbscan parameters": { + "algorithm": { + "estimator": "DBSCAN", + "estimator_methods": { + "training": "fit" + } + }, + "data": { + "dtype": "float64" + } + }, + "synthetic dataset": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 10, "centers": 10 }, "algorithm": { "eps": 5, "min_samples": 5 } } + ] + } + }, + "TEMPLATES": { + "dbscan": { + "SETS": [ + "common dbscan parameters", + "synthetic dataset", + "sklearnex spmd implementation", + "large scale default parameters", + "spmd dbscan parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/kmeans.json b/configs/spmd/large_scale/kmeans.json new file mode 100644 index 00000000..3b490f14 --- /dev/null +++ b/configs/spmd/large_scale/kmeans.json @@ -0,0 +1,32 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/kmeans.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd kmeans parameters": { + "algorithm": { + "estimator": "KMeans", + "estimator_params": { + "algorithm": "lloyd" + }, + "estimator_methods": { "training": "fit" } + }, + "bench": { + "mpi_params": {"n": 48} + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } } + ] + } + }, + "TEMPLATES": { + "kmeans": { + "SETS": [ + "synthetic data", + "sklearnex spmd implementation", + "large scale default parameters", + "spmd kmeans parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/knn.json b/configs/spmd/large_scale/knn.json new file mode 100644 index 00000000..8b82094d --- /dev/null +++ b/configs/spmd/large_scale/knn.json @@ -0,0 +1,43 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/knn.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd knn cls parameters": { + "algorithm": { + "estimator": "KNeighborsClassifier", + "estimator_params": { + "algorithm": "brute", + "metric": "minkowski", + "p": 2, + "weights": "uniform", + "n_neighbors": 5 + }, + "estimator_methods": { + "training": "fit", + "inference": "predict" + } + }, + "bench": { + "mpi_params": {} + } + }, + "synthetic classification data": { + "data": [ + { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, + { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, + { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, + { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 1000000, "n_features": 10, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + ] + } + }, + "TEMPLATES": { + "knn classifier": { + "SETS": [ + "common knn parameters", + "synthetic classification data", + "sklearnex spmd implementation", + "large scale 2k parameters", + "spmd knn cls parameters" + ] + } 
+ } +} diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json new file mode 100644 index 00000000..4b39d5e2 --- /dev/null +++ b/configs/spmd/large_scale/large_scale.json @@ -0,0 +1,31 @@ +{ + "PARAMETERS_SETS": { + "large scale default parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [1,2,3,4,5,6,7,8,9,10,11,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale 2k parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale impi parameters": { + "data": { + "dtype": "float64", + "distributed_split": "no" + }, + "bench": { + "mpi_params": {"n": [1,2,4,6,12,24], "ppn": 12} + } + } + } +} diff --git a/configs/spmd/large_scale/linear_model.json b/configs/spmd/large_scale/linear_model.json new file mode 100644 index 00000000..4c861caa --- /dev/null +++ b/configs/spmd/large_scale/linear_model.json @@ -0,0 +1,27 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/linear_model.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd linear parameters": { + "algorithm": { + "estimator": "LinearRegression", + "estimator_methods": { "training": "fit" } + } + }, + "synthetic data": { + "data": [ + { "source": "make_regression", "generation_kwargs": { "n_samples": 30005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 30000000, "test_size": 5000 } }, + { "source": "make_regression", "generation_kwargs": { "n_samples": 305000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 300000, "test_size": 5000 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale default parameters", + "synthetic data", + "spmd linear parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/logreg.json b/configs/spmd/large_scale/logreg.json new file mode 100644 index 00000000..c5ef6203 --- /dev/null +++ b/configs/spmd/large_scale/logreg.json @@ -0,0 +1,29 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/logreg.json", "../logreg.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd logreg2 parameters": { + "algorithm":{ + "estimator": "LogisticRegression", + "estimator_methods": { "inference": "predict" }, + "estimator_params": { "max_iter": 20 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 10, "n_classes": 2 } }, + { "source": "make_classification", "split_kwargs": { "train_size": 100000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 101000, "n_features": 1000, "n_classes": 2 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale 2k parameters", + "spmd logreg parameters", + "synthetic data", + "spmd logreg2 parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/pca.json 
b/configs/spmd/large_scale/pca.json new file mode 100644 index 00000000..35c1942a --- /dev/null +++ b/configs/spmd/large_scale/pca.json @@ -0,0 +1,30 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/pca.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd pca parameters": { + "algorithm": { + "estimator": "PCA", + "estimator_methods": { "training": "fit", "inference": "" } + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale default parameters", + "synthetic data", + "spmd pca parameters" + ] + } + } +} diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index f9c0a75e..42f8725b 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -134,6 +134,9 @@ def get_subset_metrics_of_estimator( and isinstance(iterations[0], Union[Numeric, NumpyNumeric].__args__) ): metrics.update({"iterations": int(iterations[0])}) + if hasattr(estimator_instance, "_n_inner_iter"): + inner_iters = estimator_instance._n_inner_iter + metrics.update({"inner_iters": int(inner_iters)}) if task == "classification": y_pred = convert_to_numpy(estimator_instance.predict(x)) metrics.update( @@ -142,7 +145,7 @@ def get_subset_metrics_of_estimator( "balanced accuracy": float(balanced_accuracy_score(y_compat, y_pred)), } ) - if hasattr(estimator_instance, "predict_proba") and not ( + '''if hasattr(estimator_instance, "predict_proba") and not ( hasattr(estimator_instance, "probability") and getattr(estimator_instance, "probability") == False ): @@ -162,7 +165,7 @@ def get_subset_metrics_of_estimator( ), "logloss": float(log_loss(y_compat, y_pred_proba)), } - ) + )''' elif task == "regression": y_pred = convert_to_numpy(estimator_instance.predict(x)) metrics.update( @@ -429,7 +432,6 @@ def measure_sklearn_estimator( estimator_instance.get_booster() ) method_instance = getattr(daal_model, method) - metrics[method] = dict() ( metrics[method]["time[ms]"], diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index d2e63e9e..1ac7d7bc 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -109,7 +109,8 @@ def split_and_transform_data(bench_case, data, data_description): y_train, y_test = None, None distributed_split = get_bench_case_value(bench_case, "data:distributed_split", None) - if distributed_split == "rank_based": + knn_split_train = "KNeighbors" in get_bench_case_value(bench_case, "algorithm:estimator", "") and int(get_bench_case_value(bench_case, "bench:mpi_params:n", 1)) > 1 + if distributed_split == "rank_based" or knn_split_train: from mpi4py import MPI comm = MPI.COMM_WORLD @@ -129,10 +130,12 @@ def split_and_transform_data(bench_case, data, data_description): x_train[train_start:train_end], y_train[train_start:train_end], ) - x_test, y_test = x_test[test_start:test_end], y_test[test_start:test_end] + if distributed_split == "rank_based": + x_test, y_test = x_test[test_start:test_end], y_test[test_start:test_end] else: x_train = x_train[train_start:train_end] - x_test = x_test[test_start:test_end] + if distributed_split == "rank_based": + x_test = x_test[test_start:test_end] device = 
get_bench_case_value(bench_case, "algorithm:device", None) common_data_format = get_bench_case_value(bench_case, "data:format", "pandas") diff --git a/sklbench/runner/commands_helper.py b/sklbench/runner/commands_helper.py index b66da011..a63686c6 100644 --- a/sklbench/runner/commands_helper.py +++ b/sklbench/runner/commands_helper.py @@ -45,6 +45,9 @@ def generate_benchmark_command( mpi_prefix = "mpirun" for mpi_param_name, mpi_param_value in mpi_params.items(): mpi_prefix += f" -{mpi_param_name} {mpi_param_value}" + if mpi_param_name == "-hostfile": + import os + mpi_prefix += os.environ.get("PBS_NODEFILE") command_prefix = f"{mpi_prefix} {command_prefix}" # 3. Intel(R) VTune* profiling command prefix vtune_profiling = get_bench_case_value(bench_case, "bench:vtune_profiling") From 4bd6c7f91e0dcb5dc0001efc5b7c180f50dd9adc Mon Sep 17 00:00:00 2001 From: ethanglaser Date: Wed, 18 Sep 2024 18:01:07 +0000 Subject: [PATCH 002/110] strong scaling, config updates, minor revisions --- configs/spmd/large_scale/basic_stats.json | 4 +-- .../spmd/large_scale/basic_stats_strong.json | 29 +++++++++++++++++ configs/spmd/large_scale/covariance.json | 2 +- .../spmd/large_scale/covariance_strong.json | 29 +++++++++++++++++ configs/spmd/large_scale/kmeans.json | 11 +++---- configs/spmd/large_scale/knn.json | 5 +-- configs/spmd/large_scale/large_scale.json | 24 ++++++++++++-- configs/spmd/large_scale/linear_model.json | 2 +- .../spmd/large_scale/linear_model_strong.json | 26 ++++++++++++++++ configs/spmd/large_scale/logreg_strong.json | 28 +++++++++++++++++ configs/spmd/large_scale/pca.json | 2 +- configs/spmd/large_scale/pca_strong.json | 29 +++++++++++++++++ sklbench/benchmarks/sklearn_estimator.py | 4 ++- sklbench/utils/measurement.py | 31 ++++++++++++++----- 14 files changed, 200 insertions(+), 26 deletions(-) create mode 100644 configs/spmd/large_scale/basic_stats_strong.json create mode 100644 configs/spmd/large_scale/covariance_strong.json create mode 100644 configs/spmd/large_scale/linear_model_strong.json create mode 100644 configs/spmd/large_scale/logreg_strong.json create mode 100644 configs/spmd/large_scale/pca_strong.json diff --git a/configs/spmd/large_scale/basic_stats.json b/configs/spmd/large_scale/basic_stats.json index a9542017..9ac4725f 100644 --- a/configs/spmd/large_scale/basic_stats.json +++ b/configs/spmd/large_scale/basic_stats.json @@ -4,7 +4,7 @@ "spmd basicstats parameters": { "algorithm": { "estimator": "BasicStatistics", - "estimator_methods": { "training": "compute" } + "estimator_methods": { "training": "fit" } }, "data": { "split_kwargs": { "test_size": 0.0001 } @@ -21,7 +21,7 @@ "basicstats": { "SETS": [ "sklearnex spmd implementation", - "large scale default parameters", + "large scale 2k parameters", "synthetic data", "spmd basicstats parameters" ] diff --git a/configs/spmd/large_scale/basic_stats_strong.json b/configs/spmd/large_scale/basic_stats_strong.json new file mode 100644 index 00000000..b7aa22cb --- /dev/null +++ b/configs/spmd/large_scale/basic_stats_strong.json @@ -0,0 +1,29 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd basicstats parameters": { + "algorithm": { + "estimator": "BasicStatistics", + "estimator_methods": { "training": "fit" } + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 25000000, "n_features": 100, "centers": 1 } } + ] + } + }, + 
"TEMPLATES": { + "basicstats": { + "SETS": [ + "sklearnex spmd implementation", + "large scale strong 2k parameters", + "synthetic data", + "spmd basicstats parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/covariance.json b/configs/spmd/large_scale/covariance.json index 3280bf77..260befd0 100644 --- a/configs/spmd/large_scale/covariance.json +++ b/configs/spmd/large_scale/covariance.json @@ -21,7 +21,7 @@ "covariance": { "SETS": [ "sklearnex spmd implementation", - "large scale default parameters", + "large scale 2k parameters", "synthetic data", "spmd basicstats parameters" ] diff --git a/configs/spmd/large_scale/covariance_strong.json b/configs/spmd/large_scale/covariance_strong.json new file mode 100644 index 00000000..568b4a8f --- /dev/null +++ b/configs/spmd/large_scale/covariance_strong.json @@ -0,0 +1,29 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd basicstats parameters": { + "algorithm": { + "estimator": "EmpiricalCovariance", + "estimator_methods": { "training": "fit" } + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 25000000, "n_features": 100, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "covariance": { + "SETS": [ + "sklearnex spmd implementation", + "large scale strong 2k parameters", + "synthetic data", + "spmd basicstats parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/kmeans.json b/configs/spmd/large_scale/kmeans.json index 3b490f14..89524965 100644 --- a/configs/spmd/large_scale/kmeans.json +++ b/configs/spmd/large_scale/kmeans.json @@ -7,15 +7,14 @@ "estimator_params": { "algorithm": "lloyd" }, - "estimator_methods": { "training": "fit" } - }, - "bench": { - "mpi_params": {"n": 48} + "estimator_methods": { "training": "fit", "inference": "predict" } } }, "synthetic data": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } } + { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 30000, "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 100, "centers": 100 }, "algorithm": { "n_clusters": 100, "max_iter": 100 } } ] } }, @@ -24,7 +23,7 @@ "SETS": [ "synthetic data", "sklearnex spmd implementation", - "large scale default parameters", + "large scale 2k parameters", "spmd kmeans parameters" ] } diff --git a/configs/spmd/large_scale/knn.json b/configs/spmd/large_scale/knn.json index 8b82094d..e979e2aa 100644 --- a/configs/spmd/large_scale/knn.json +++ b/configs/spmd/large_scale/knn.json @@ -15,9 +15,6 @@ "training": "fit", "inference": "predict" } - }, - "bench": { - "mpi_params": {} } }, "synthetic classification data": { @@ -35,7 +32,7 @@ "common knn parameters", "synthetic classification data", "sklearnex spmd implementation", - "large scale 2k parameters", + "large scale default parameters", "spmd knn cls parameters" ] } diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json index 4b39d5e2..72b808fe 100644 --- a/configs/spmd/large_scale/large_scale.json +++ 
b/configs/spmd/large_scale/large_scale.json @@ -6,7 +6,16 @@ "distributed_split": "None" }, "bench": { - "mpi_params": {"n": [1,2,3,4,5,6,7,8,9,10,11,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + "mpi_params": {"n": [1,2,6,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale strong parameters": { + "data": { + "dtype": "float64", + "distributed_split": "rank_based" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, "large scale 2k parameters": { @@ -15,7 +24,16 @@ "distributed_split": "None" }, "bench": { - "mpi_params": {"n": [192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale strong 2k parameters": { + "data": { + "dtype": "float64", + "distributed_split": "rank_based" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, "large scale impi parameters": { @@ -24,7 +42,7 @@ "distributed_split": "no" }, "bench": { - "mpi_params": {"n": [1,2,4,6,12,24], "ppn": 12} + "mpi_params": {"n": [1,2,4,6,9,12], "ppn": 12} } } } diff --git a/configs/spmd/large_scale/linear_model.json b/configs/spmd/large_scale/linear_model.json index 4c861caa..aeda4441 100644 --- a/configs/spmd/large_scale/linear_model.json +++ b/configs/spmd/large_scale/linear_model.json @@ -18,7 +18,7 @@ "linreg": { "SETS": [ "sklearnex spmd implementation", - "large scale default parameters", + "large scale 2k parameters", "synthetic data", "spmd linear parameters" ] diff --git a/configs/spmd/large_scale/linear_model_strong.json b/configs/spmd/large_scale/linear_model_strong.json new file mode 100644 index 00000000..77a9c79e --- /dev/null +++ b/configs/spmd/large_scale/linear_model_strong.json @@ -0,0 +1,26 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/linear_model.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd linear parameters": { + "algorithm": { + "estimator": "LinearRegression", + "estimator_methods": { "training": "fit" } + } + }, + "synthetic data": { + "data": [ + { "source": "make_regression", "generation_kwargs": { "n_samples": 25005000, "n_features": 100, "noise": 1.25 
}, "split_kwargs": { "train_size": 25000000, "test_size": 5000 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale strong 2k parameters", + "synthetic data", + "spmd linear parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/logreg_strong.json b/configs/spmd/large_scale/logreg_strong.json new file mode 100644 index 00000000..2bf1c0f9 --- /dev/null +++ b/configs/spmd/large_scale/logreg_strong.json @@ -0,0 +1,28 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/logreg.json", "../logreg.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd logreg2 parameters": { + "algorithm":{ + "estimator": "LogisticRegression", + "estimator_methods": { "inference": "predict" }, + "estimator_params": { "max_iter": 30 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_classification", "split_kwargs": { "train_size": 10000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 10001000, "n_features": 100, "n_classes": 2 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale strong 2k parameters", + "spmd logreg parameters", + "synthetic data", + "spmd logreg2 parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/pca.json b/configs/spmd/large_scale/pca.json index 35c1942a..9a6a6b02 100644 --- a/configs/spmd/large_scale/pca.json +++ b/configs/spmd/large_scale/pca.json @@ -21,7 +21,7 @@ "linreg": { "SETS": [ "sklearnex spmd implementation", - "large scale default parameters", + "large scale 2k parameters", "synthetic data", "spmd pca parameters" ] diff --git a/configs/spmd/large_scale/pca_strong.json b/configs/spmd/large_scale/pca_strong.json new file mode 100644 index 00000000..adee3c79 --- /dev/null +++ b/configs/spmd/large_scale/pca_strong.json @@ -0,0 +1,29 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/pca.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd pca parameters": { + "algorithm": { + "estimator": "PCA", + "estimator_methods": { "training": "fit", "inference": "" } + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 25000000, "n_features": 100, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale strong 2k parameters", + "synthetic data", + "spmd pca parameters" + ] + } + } +} diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index 42f8725b..a08a6e9c 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -436,7 +436,9 @@ def measure_sklearn_estimator( ( metrics[method]["time[ms]"], metrics[method]["time std[ms]"], - _, + metrics[method]["first iter[ms]"], + metrics[method]["box filter mean[ms]"], + metrics[method]["box filter std[ms]"] ) = measure_case(bench_case, method_instance, *data_args) if batch_size is not None: metrics[method]["throughput[samples/ms]"] = ( diff --git a/sklbench/utils/measurement.py b/sklbench/utils/measurement.py index 989daefd..df74e8da 100644 --- a/sklbench/utils/measurement.py +++ b/sklbench/utils/measurement.py @@ -40,6 +40,22 @@ def box_filter(timing, left=0.2, right=0.8): return np.mean(result) * 1000, np.std(result) * 1000 +def large_scale_measurements(timing): + first_iter = timing[0] * 1000 + mean = np.mean(timing[1:]) * 1000 + stdev = np.std(timing[1:]) * 1000 + timing_sorted = 
np.sort(timing) + Q1, Q3 = np.percentile(timing_sorted, [25, 75]) + IQ = Q3 - Q1 + lower, upper = Q1 - 1.5 * IQ, Q3 + 1.5 * IQ + + filtered_times = timing_sorted[(timing_sorted >= lower) & (timing_sorted <= upper)] + + box_filter_mean = np.mean(filtered_times) * 1000 if filtered_times.size > 0 else 0 + box_filter_stdev = np.std(filtered_times) * 1000 if filtered_times.size > 0 else 0 + return mean, stdev, first_iter, box_filter_mean, box_filter_stdev + + def measure_time( func, *args, @@ -72,13 +88,14 @@ def measure_time( f"exceeded time limit ({time_limit} seconds)" ) break - mean, std = box_filter(times) - if std / mean > std_mean_ratio: - logger.warning( - f'Measured "std / mean" time ratio of "{str(func)}" function is higher ' - f"than threshold ({round(std / mean, 3)} vs. {std_mean_ratio})" - ) - return mean, std, func_return_value + logger.debug(times) + #mean, std = box_filter(times) + #if std / mean > std_mean_ratio: + # logger.warning( + # f'Measured "std / mean" time ratio of "{str(func)}" function is higher ' + # f"than threshold ({round(std / mean, 3)} vs. {std_mean_ratio})" + # ) + return large_scale_measurements(times) # wrapper to get measurement params from benchmarking case From 3cd955c3eec84fe07654e71364d1dd4cc354cbdc Mon Sep 17 00:00:00 2001 From: ethanglaser Date: Sat, 21 Sep 2024 05:25:44 +0000 Subject: [PATCH 003/110] knn and forest config updates --- configs/spmd/large_scale/forest.json | 26 +++++++++++++++++++++ configs/spmd/large_scale/forest_strong.json | 25 ++++++++++++++++++++ configs/spmd/large_scale/knn.json | 4 ++-- sklbench/benchmarks/sklearn_estimator.py | 4 ++-- sklbench/datasets/transformer.py | 2 +- 5 files changed, 56 insertions(+), 5 deletions(-) create mode 100644 configs/spmd/large_scale/forest.json create mode 100644 configs/spmd/large_scale/forest_strong.json diff --git a/configs/spmd/large_scale/forest.json b/configs/spmd/large_scale/forest.json new file mode 100644 index 00000000..ee614ed3 --- /dev/null +++ b/configs/spmd/large_scale/forest.json @@ -0,0 +1,26 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd forest classification parameters": { + "algorithm": { + "estimator": "RandomForestClassifier" + } + }, + "synthetic data": { + "data": [ + { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 501000, "n_features": 10, "n_classes": 2 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } } }, + { "source": "make_classification", "split_kwargs": { "train_size": 10000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 11000, "n_features": 1000, "n_classes": 2 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } } } + ] + } + }, + "TEMPLATES": { + "basicstats": { + "SETS": [ + "sklearnex spmd implementation", + "large scale 2k parameters", + "synthetic data", + "spmd forest classification parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/forest_strong.json b/configs/spmd/large_scale/forest_strong.json new file mode 100644 index 00000000..121aa916 --- /dev/null +++ b/configs/spmd/large_scale/forest_strong.json @@ -0,0 +1,25 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd forest classification parameters": { + "algorithm": { + "estimator": "RandomForestClassifier" + } + }, + "synthetic data": { + "data": [ + { "source": 
"make_classification", "split_kwargs": { "train_size": 10000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 10001000, "n_features": 100, "n_classes": 2 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } } } + ] + } + }, + "TEMPLATES": { + "basicstats": { + "SETS": [ + "sklearnex spmd implementation", + "large scale strong 2k parameters", + "synthetic data", + "spmd forest classification parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/knn.json b/configs/spmd/large_scale/knn.json index e979e2aa..1ef849f1 100644 --- a/configs/spmd/large_scale/knn.json +++ b/configs/spmd/large_scale/knn.json @@ -22,7 +22,7 @@ { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, - { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 1000000, "n_features": 10, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + { "source": "make_classification", "split_kwargs": { "train_size": 200000, "test_size": 200000 }, "generation_kwargs": { "n_samples": 400000, "n_features": 10, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } ] } }, @@ -32,7 +32,7 @@ "common knn parameters", "synthetic classification data", "sklearnex spmd implementation", - "large scale default parameters", + "large scale 2k parameters", "spmd knn cls parameters" ] } diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index a08a6e9c..a1dc7a2f 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -525,8 +525,8 @@ def main(bench_case: BenchCase, filters: List[BenchCase]): result_template = enrich_result(result_template, bench_case) if "assume_finite" in context_params: result_template["assume_finite"] = context_params["assume_finite"] - if hasattr(estimator_instance, "get_params"): - estimator_params = estimator_instance.get_params() + #if hasattr(estimator_instance, "get_params"): + # estimator_params = estimator_instance.get_params() # note: "handle" is not JSON-serializable if "handle" in estimator_params: del estimator_params["handle"] diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 1ac7d7bc..55cfc245 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -181,7 +181,7 @@ def split_and_transform_data(bench_case, data, data_description): "format": data_format, "order": data_order, "dtype": data_dtype, - "samples": converted_data.shape[0], + "samples (per rank)": converted_data.shape[0], } if len(converted_data.shape) == 2 and converted_data.shape[1] > 1: data_description[subset_name]["features"] = converted_data.shape[1] From e39dc2bed8100aafbb128460154ce6000f630a2e Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 23 Sep 2024 04:53:12 -0700 Subject: [PATCH 004/110] lint --- sklbench/benchmarks/sklearn_estimator.py | 8 ++++---- 
sklbench/datasets/transformer.py | 5 ++++- sklbench/runner/commands_helper.py | 1 + sklbench/utils/measurement.py | 8 ++++---- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index a1dc7a2f..bbfd3e62 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -145,7 +145,7 @@ def get_subset_metrics_of_estimator( "balanced accuracy": float(balanced_accuracy_score(y_compat, y_pred)), } ) - '''if hasattr(estimator_instance, "predict_proba") and not ( + """if hasattr(estimator_instance, "predict_proba") and not ( hasattr(estimator_instance, "probability") and getattr(estimator_instance, "probability") == False ): @@ -165,7 +165,7 @@ def get_subset_metrics_of_estimator( ), "logloss": float(log_loss(y_compat, y_pred_proba)), } - )''' + )""" elif task == "regression": y_pred = convert_to_numpy(estimator_instance.predict(x)) metrics.update( @@ -438,7 +438,7 @@ def measure_sklearn_estimator( metrics[method]["time std[ms]"], metrics[method]["first iter[ms]"], metrics[method]["box filter mean[ms]"], - metrics[method]["box filter std[ms]"] + metrics[method]["box filter std[ms]"], ) = measure_case(bench_case, method_instance, *data_args) if batch_size is not None: metrics[method]["throughput[samples/ms]"] = ( @@ -525,7 +525,7 @@ def main(bench_case: BenchCase, filters: List[BenchCase]): result_template = enrich_result(result_template, bench_case) if "assume_finite" in context_params: result_template["assume_finite"] = context_params["assume_finite"] - #if hasattr(estimator_instance, "get_params"): + # if hasattr(estimator_instance, "get_params"): # estimator_params = estimator_instance.get_params() # note: "handle" is not JSON-serializable if "handle" in estimator_params: diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 55cfc245..86944ead 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -109,7 +109,10 @@ def split_and_transform_data(bench_case, data, data_description): y_train, y_test = None, None distributed_split = get_bench_case_value(bench_case, "data:distributed_split", None) - knn_split_train = "KNeighbors" in get_bench_case_value(bench_case, "algorithm:estimator", "") and int(get_bench_case_value(bench_case, "bench:mpi_params:n", 1)) > 1 + knn_split_train = ( + "KNeighbors" in get_bench_case_value(bench_case, "algorithm:estimator", "") + and int(get_bench_case_value(bench_case, "bench:mpi_params:n", 1)) > 1 + ) if distributed_split == "rank_based" or knn_split_train: from mpi4py import MPI diff --git a/sklbench/runner/commands_helper.py b/sklbench/runner/commands_helper.py index a63686c6..2441085a 100644 --- a/sklbench/runner/commands_helper.py +++ b/sklbench/runner/commands_helper.py @@ -47,6 +47,7 @@ def generate_benchmark_command( mpi_prefix += f" -{mpi_param_name} {mpi_param_value}" if mpi_param_name == "-hostfile": import os + mpi_prefix += os.environ.get("PBS_NODEFILE") command_prefix = f"{mpi_prefix} {command_prefix}" # 3. 
Intel(R) VTune* profiling command prefix diff --git a/sklbench/utils/measurement.py b/sklbench/utils/measurement.py index df74e8da..7495e258 100644 --- a/sklbench/utils/measurement.py +++ b/sklbench/utils/measurement.py @@ -48,9 +48,9 @@ def large_scale_measurements(timing): Q1, Q3 = np.percentile(timing_sorted, [25, 75]) IQ = Q3 - Q1 lower, upper = Q1 - 1.5 * IQ, Q3 + 1.5 * IQ - + filtered_times = timing_sorted[(timing_sorted >= lower) & (timing_sorted <= upper)] - + box_filter_mean = np.mean(filtered_times) * 1000 if filtered_times.size > 0 else 0 box_filter_stdev = np.std(filtered_times) * 1000 if filtered_times.size > 0 else 0 return mean, stdev, first_iter, box_filter_mean, box_filter_stdev @@ -89,8 +89,8 @@ def measure_time( ) break logger.debug(times) - #mean, std = box_filter(times) - #if std / mean > std_mean_ratio: + # mean, std = box_filter(times) + # if std / mean > std_mean_ratio: # logger.warning( # f'Measured "std / mean" time ratio of "{str(func)}" function is higher ' # f"than threshold ({round(std / mean, 3)} vs. {std_mean_ratio})" From 6e0fbf8a1947895169b48731be24b97a9c29db70 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 23 Sep 2024 06:28:03 -0700 Subject: [PATCH 005/110] just gpu for regular --- configs/common/sklearn.json | 5 +++++ configs/regular/dbscan.json | 2 +- configs/regular/ensemble.json | 4 ++-- configs/regular/kmeans.json | 2 +- configs/regular/knn.json | 20 ++------------------ configs/regular/linear_model.json | 24 +----------------------- configs/regular/logreg.json | 2 +- configs/regular/pca.json | 2 +- 8 files changed, 14 insertions(+), 47 deletions(-) diff --git a/configs/common/sklearn.json b/configs/common/sklearn.json index d7b13188..43051093 100644 --- a/configs/common/sklearn.json +++ b/configs/common/sklearn.json @@ -12,6 +12,11 @@ { "library": "sklearnex", "device": ["cpu", "gpu"] } ] }, + "sklearn-ex[gpu] implementations": { + "algorithm": [ + { "library": "sklearnex", "device": ["gpu"] } + ] + }, "sklearn-ex[preview] implementations": { "algorithm": [ { "library": "sklearn", "device": "cpu" }, diff --git a/configs/regular/dbscan.json b/configs/regular/dbscan.json index 71dcdc9b..1d0d732b 100644 --- a/configs/regular/dbscan.json +++ b/configs/regular/dbscan.json @@ -58,7 +58,7 @@ "TEMPLATES": { "sklearn dbscan": { "SETS": [ - "sklearn-ex[cpu,gpu] implementations", + "sklearn-ex[gpu] implementations", "common dbscan parameters", "sklearn dbscan parameters", "dbscan datasets" diff --git a/configs/regular/ensemble.json b/configs/regular/ensemble.json index 56e37e77..164cb236 100644 --- a/configs/regular/ensemble.json +++ b/configs/regular/ensemble.json @@ -90,7 +90,7 @@ "TEMPLATES": { "sklearn ensemble classification": { "SETS": [ - "sklearn-ex[cpu,gpu] implementations", + "sklearn-ex[gpu] implementations", "common ensemble params", "sklearn ensemble classifier params", "ensemble classification data" @@ -98,7 +98,7 @@ }, "sklearn ensemble regression": { "SETS": [ - "sklearn-ex[cpu,gpu] implementations", + "sklearn-ex[gpu] implementations", "common ensemble params", "sklearn ensemble regressor params", "ensemble regression data" diff --git a/configs/regular/kmeans.json b/configs/regular/kmeans.json index d4953615..8aba9055 100644 --- a/configs/regular/kmeans.json +++ b/configs/regular/kmeans.json @@ -70,7 +70,7 @@ "TEMPLATES": { "sklearn kmeans": { "SETS": [ - "sklearn-ex[preview] implementations", + "sklearn-ex[gpu] implementations", "common kmeans parameters", "sklearn kmeans parameters", "kmeans datasets" diff --git 
a/configs/regular/knn.json b/configs/regular/knn.json index e1cd8a75..bcbed117 100644 --- a/configs/regular/knn.json +++ b/configs/regular/knn.json @@ -74,36 +74,20 @@ "TEMPLATES": { "sklearn brute knn clsf": { "SETS": [ - "sklearn-ex[cpu,gpu] implementations", + "sklearn-ex[gpu] implementations", "common knn parameters", "sklearn knn parameters", "brute knn algorithm - classification data" ] }, - "sklearn kd_tree knn clsf": { - "SETS": [ - "sklearn-ex[cpu] implementations", - "common knn parameters", - "sklearn knn parameters", - "kd_tree knn algorithm - classification data" - ] - }, "sklearn brute knn regr": { "SETS": [ - "sklearn-ex[cpu,gpu] implementations", + "sklearn-ex[gpu] implementations", "common knn parameters", "sklearn knn parameters", "brute knn algorithm - regression data" ] }, - "sklearn kd_tree knn regr": { - "SETS": [ - "sklearn-ex[cpu] implementations", - "common knn parameters", - "sklearn knn parameters", - "kd_tree knn algorithm - regression data" - ] - }, "cuml brute knn clsf": { "SETS": [ "cuml implementation", diff --git a/configs/regular/linear_model.json b/configs/regular/linear_model.json index eb1b79ba..66667343 100644 --- a/configs/regular/linear_model.json +++ b/configs/regular/linear_model.json @@ -85,34 +85,12 @@ "TEMPLATES": { "sklearn linear": { "SETS": [ - "sklearn-ex[cpu,gpu] implementations", + "sklearn-ex[gpu] implementations", "common linear parameters", "sklearn linear parameters", "regression datasets" ] }, - "sklearn ridge": { - "SETS": [ - "sklearn-ex[cpu] implementations", - "common ridge parameters", - "sklearn ridge parameters", - "regression datasets" - ] - }, - "sklearn lasso": { - "SETS": [ - "sklearn-ex[cpu] implementations", - "common lasso parameters", - "regression datasets" - ] - }, - "sklearn elasticnet": { - "SETS": [ - "sklearn-ex[cpu] implementations", - "common elasticnet parameters", - "regression datasets" - ] - }, "cuml linear": { "SETS": [ "cuml implementation", diff --git a/configs/regular/logreg.json b/configs/regular/logreg.json index a94a7fcf..172ceb48 100644 --- a/configs/regular/logreg.json +++ b/configs/regular/logreg.json @@ -54,7 +54,7 @@ "TEMPLATES": { "sklearn logreg": { "SETS": [ - "sklearn-ex[cpu,gpu] implementations", + "sklearn-ex[gpu] implementations", "common logreg parameters", "sklearn logreg parameters", "logreg datasets" diff --git a/configs/regular/pca.json b/configs/regular/pca.json index 582acc9e..2300454d 100644 --- a/configs/regular/pca.json +++ b/configs/regular/pca.json @@ -46,7 +46,7 @@ "TEMPLATES": { "sklearn pca": { "SETS": [ - "sklearn-ex[cpu,gpu] implementations", + "sklearn-ex[gpu] implementations", "pca parameters", "pca datasets" ] From 7bb8fb486724192fc6410ccbd731ad16650563ad Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 23 Sep 2024 07:10:55 -0700 Subject: [PATCH 006/110] remove cuml --- configs/regular/dbscan.json | 8 -------- configs/regular/ensemble.json | 16 ---------------- configs/regular/kmeans.json | 8 -------- configs/regular/knn.json | 14 -------------- configs/regular/linear_model.json | 24 ------------------------ configs/regular/logreg.json | 8 -------- configs/regular/pca.json | 7 ------- 7 files changed, 85 deletions(-) diff --git a/configs/regular/dbscan.json b/configs/regular/dbscan.json index 1d0d732b..711c15cd 100644 --- a/configs/regular/dbscan.json +++ b/configs/regular/dbscan.json @@ -63,14 +63,6 @@ "sklearn dbscan parameters", "dbscan datasets" ] - }, - "cuml dbscan": { - "SETS": [ - "cuml implementation", - "common dbscan
parameters", - "dbscan datasets" - ] } } } diff --git a/configs/regular/ensemble.json b/configs/regular/ensemble.json index 164cb236..f01c1383 100644 --- a/configs/regular/ensemble.json +++ b/configs/regular/ensemble.json @@ -103,22 +103,6 @@ "sklearn ensemble regressor params", "ensemble regression data" ] - }, - "cuml ensemble classification": { - "SETS": [ - "cuml implementation", - "common ensemble params", - "cuml ensemble classifier params", - "ensemble classification data" - ] - }, - "cuml ensemble regression": { - "SETS": [ - "cuml implementation", - "common ensemble params", - "cuml ensemble regressor params", - "ensemble regression data" - ] } } } diff --git a/configs/regular/kmeans.json b/configs/regular/kmeans.json index 8aba9055..756e2bab 100644 --- a/configs/regular/kmeans.json +++ b/configs/regular/kmeans.json @@ -75,14 +75,6 @@ "sklearn kmeans parameters", "kmeans datasets" ] - }, - "cuml kmeans": { - "SETS": [ - "cuml implementation", - "common kmeans parameters", - "cuml kmeans parameters", - "kmeans datasets" - ] } } } diff --git a/configs/regular/knn.json b/configs/regular/knn.json index bcbed117..a69c6864 100644 --- a/configs/regular/knn.json +++ b/configs/regular/knn.json @@ -87,20 +87,6 @@ "sklearn knn parameters", "brute knn algorithm - regression data" ] - }, - "cuml brute knn clsf": { - "SETS": [ - "cuml implementation", - "common knn parameters", - "brute knn algorithm - classification data" - ] - }, - "cuml brute knn regr": { - "SETS": [ - "cuml implementation", - "common knn parameters", - "brute knn algorithm - regression data" - ] } } } diff --git a/configs/regular/linear_model.json b/configs/regular/linear_model.json index 66667343..3040c82d 100644 --- a/configs/regular/linear_model.json +++ b/configs/regular/linear_model.json @@ -98,30 +98,6 @@ "cuml L2 parameters", "regression datasets" ] - }, - "cuml ridge": { - "SETS": [ - "cuml implementation", - "common ridge parameters", - "cuml L2 parameters", - "regression datasets" - ] - }, - "cuml lasso": { - "SETS": [ - "cuml implementation", - "common lasso parameters", - "cuml L1 parameters", - "regression datasets" - ] - }, - "cuml elasticnet": { - "SETS": [ - "cuml implementation", - "common elasticnet parameters", - "cuml L1 parameters", - "regression datasets" - ] } } } diff --git a/configs/regular/logreg.json b/configs/regular/logreg.json index 172ceb48..a8323b02 100644 --- a/configs/regular/logreg.json +++ b/configs/regular/logreg.json @@ -59,14 +59,6 @@ "sklearn logreg parameters", "logreg datasets" ] - }, - "cuml logreg": { - "SETS": [ - "cuml implementation", - "common logreg parameters", - "cuml logreg parameters", - "logreg datasets" - ] } } } diff --git a/configs/regular/pca.json b/configs/regular/pca.json index 2300454d..e26d3f44 100644 --- a/configs/regular/pca.json +++ b/configs/regular/pca.json @@ -50,13 +50,6 @@ "pca parameters", "pca datasets" ] - }, - "cuml pca": { - "SETS": [ - "cuml implementation", - "pca parameters", - "pca datasets" - ] } } } From 535c1e49171eea712d04f28769c7ebf697e675f9 Mon Sep 17 00:00:00 2001 From: "Kruglov, Oleg" Date: Mon, 23 Sep 2024 10:57:42 -0700 Subject: [PATCH 007/110] Add incremental algorithms support --- configs/incremental.json | 99 ++++++++++++++++++++++++ sklbench/benchmarks/sklearn_estimator.py | 36 +++++++-- sklbench/report/implementation.py | 10 ++- test-configuration-linux.yml | 5 ++ test-configuration-win.yml | 4 + 5 files changed, 144 insertions(+), 10 deletions(-) create mode 100644 configs/incremental.json diff --git a/configs/incremental.json 
b/configs/incremental.json new file mode 100644 index 00000000..5f7a5477 --- /dev/null +++ b/configs/incremental.json @@ -0,0 +1,99 @@ +{ + "PARAMETERS_SETS": { + "common": {"bench": {"n_runs": 10, "time_limit": 60}}, + "covariance data": { + "data": [ + { + "source": "make_blobs", + "generation_kwargs": { + "centers": 1, + "n_samples": 1000, + "n_features": [16, 64] + }, + "split_kwargs": {"ignore": true} + } + ] + }, + "basic_statistics data": { + "data": { + "source": "make_blobs", + "generation_kwargs": { + "centers": 1, + "n_samples": 10000, + "n_features": [16, 64] + }, + "split_kwargs": {"ignore": true} + } + }, + "linear_regression data": { + "data": { + "source": "make_regression", + "split_kwargs": {"train_size": 0.2, "test_size": 0.8}, + "generation_kwargs": { + "n_samples": 5000, + "n_features": [40, 100], + "n_informative": 5, + "noise": 2.0 + } + } + }, + "pca data": { + "data": { + "source": "make_blobs", + "generation_kwargs": { + "centers": 1, + "n_samples": 1000, + "n_features": [16, 64] + }, + "split_kwargs": {"ignore": true} + } + }, + "covariance": { + "algorithm": [ + { + "estimator": "IncrementalEmpiricalCovariance", + "library": "sklearnex.covariance", + "estimator_methods": {"training": "partial_fit"}, + "num_batches": {"training": 2} + } + ] + }, + "basic_statistics": { + "algorithm": [ + { + "estimator": "IncrementalBasicStatistics", + "library": "sklearnex.basic_statistics", + "num_batches": {"training": 2} + } + ] + }, + "linear_regression": { + "algorithm": [ + { + "estimator": "IncrementalLinearRegression", + "library": "sklearnex.linear_model", + "num_batches": {"training": 2} + } + ] + }, + "pca": { + "algorithm": [ + { + "estimator": "IncrementalPCA", + "library": "sklearnex.preview.decomposition", + "num_batches": {"training": 2} + } + ] + } + }, + "TEMPLATES": { + "covariance": {"SETS": ["common", "covariance", "covariance data"]}, + "basic_statistics": { + "SETS": ["common", "basic_statistics", "basic_statistics data"] + }, + "linear_regression": { + "SETS": ["common", "linear_regression", "linear_regression data"] + }, + "pca": {"SETS": ["common", "pca", "pca data"]} + } +} diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index f9c0a75e..4cdde86d 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -74,7 +74,7 @@ def get_estimator(library_name: str, estimator_name: str): def get_estimator_methods(bench_case: BenchCase) -> Dict[str, List[str]]: # default estimator methods estimator_methods = { - "training": ["fit"], + "training": ["partial_fit", "fit"], "inference": ["predict", "predict_proba", "transform"], } for stage in estimator_methods.keys(): @@ -334,7 +334,9 @@ def verify_patching(stream: io.StringIO, function_name) -> bool: return acceleration_lines > 0 and fallback_lines == 0 -def create_online_function(method_instance, data_args, batch_size): +def create_online_function( + estimator_instance, method_instance, data_args, num_batches, batch_size +): n_batches = data_args[0].shape[0] // batch_size if "y" in list(inspect.signature(method_instance).parameters): @@ -345,6 +347,7 @@ def ndarray_function(x, y): x[i * batch_size : (i + 1) * batch_size], y[i * batch_size : (i + 1) * batch_size], ) + estimator_instance._onedal_finalize_fit() def dataframe_function(x, y): for i in range(n_batches): @@ -352,16 +355,19 @@ def dataframe_function(x, y): x.iloc[i * batch_size : (i + 1) * batch_size], y.iloc[i * batch_size : (i + 1) * batch_size], ) + 
estimator_instance._onedal_finalize_fit() else: def ndarray_function(x): for i in range(n_batches): method_instance(x[i * batch_size : (i + 1) * batch_size]) + estimator_instance._onedal_finalize_fit() def dataframe_function(x): for i in range(n_batches): method_instance(x.iloc[i * batch_size : (i + 1) * batch_size]) + estimator_instance._onedal_finalize_fit() if "ndarray" in str(type(data_args[0])): return ndarray_function @@ -414,12 +420,28 @@ def measure_sklearn_estimator( data_args = (x_train,) else: data_args = (x_test,) - batch_size = get_bench_case_value( - bench_case, f"algorithm:batch_size:{stage}" - ) - if batch_size is not None: + + if method == "partial_fit": + num_batches = get_bench_case_value(bench_case, "data:num_batches") + batch_size = get_bench_case_value(bench_case, "data:batch_size") + + if batch_size is None: + if num_batches is None: + num_batches = 5 + batch_size = ( + data_args[0].shape[0] + num_batches - 1 + ) // num_batches + if num_batches is None: + num_batches = ( + data_args[0].shape[0] + batch_size - 1 + ) // batch_size + method_instance = create_online_function( - method_instance, data_args, batch_size + estimator_instance, + method_instance, + data_args, + num_batches, + batch_size, ) # daal4py model builders enabling branch if enable_modelbuilders and stage == "inference": diff --git a/sklbench/report/implementation.py b/sklbench/report/implementation.py index b577ab55..df15b5eb 100644 --- a/sklbench/report/implementation.py +++ b/sklbench/report/implementation.py @@ -16,7 +16,7 @@ import argparse import json -from typing import Dict, List +from typing import Dict, Hashable, List import openpyxl as xl import pandas as pd @@ -239,6 +239,7 @@ def get_result_tables_as_df( bench_cases = pd.DataFrame( [flatten_dict(bench_case) for bench_case in results["bench_cases"]] ) + bench_cases = bench_cases.map(lambda x: str(x) if not isinstance(x, Hashable) else x) if compatibility_mode: bench_cases = transform_results_to_compatible(bench_cases) @@ -248,7 +249,7 @@ def get_result_tables_as_df( bench_cases.drop(columns=[column], inplace=True) diffby_columns.remove(column) - return split_df_by_columns(bench_cases, splitby_columns) + return split_df_by_columns(bench_cases, splitby_columns, False) def get_summary_from_df(df: pd.DataFrame, df_name: str) -> pd.DataFrame: @@ -258,7 +259,10 @@ def get_summary_from_df(df: pd.DataFrame, df_name: str) -> pd.DataFrame: # only relative improvements are included in summary currently if len(column) > 1 and column[1] == f"{metric_name} relative improvement": metric_columns.append(column) - summary = df[metric_columns].aggregate(geomean_wrapper, axis=0).to_frame().T + if metric_columns: + summary = df[metric_columns].aggregate(geomean_wrapper, axis=0).to_frame().T + else: + summary = pd.DataFrame() summary.index = pd.Index([df_name]) return summary diff --git a/test-configuration-linux.yml b/test-configuration-linux.yml index a37769ce..722d1008 100644 --- a/test-configuration-linux.yml +++ b/test-configuration-linux.yml @@ -45,6 +45,11 @@ steps: conda activate bench-env python -m sklbench --report -l DEBUG --report -c configs/sklearn_example.json displayName: Sklearn example run + - script: | + source /usr/share/miniconda/etc/profile.d/conda.sh + conda activate bench-env + python -m sklbench --report -l DEBUG --report -c configs/incremental.json + displayName: Incremental algorithms example run - script: | source /usr/share/miniconda/etc/profile.d/conda.sh conda activate bench-env diff --git a/test-configuration-win.yml 
b/test-configuration-win.yml index a1eddaeb..82c3152a 100644 --- a/test-configuration-win.yml +++ b/test-configuration-win.yml @@ -43,6 +43,10 @@ steps: call activate bench-env python -m sklbench --report -l DEBUG --report -c configs/sklearn_example.json displayName: Sklearn example run + - script: | + call activate bench-env + python -m sklbench --report -l DEBUG --report -c configs/incremental.json + displayName: Incremental algorithms example run - script: | call activate bench-env python -m sklbench --report -l DEBUG --report -c configs/xgboost_example.json From d6952ac74715dcb0910626f9e5dce1c2eb1a3827 Mon Sep 17 00:00:00 2001 From: "Kruglov, Oleg" Date: Mon, 23 Sep 2024 11:49:37 -0700 Subject: [PATCH 008/110] Fix win yml --- test-configuration-win.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test-configuration-win.yml b/test-configuration-win.yml index 82c3152a..f3ac1595 100644 --- a/test-configuration-win.yml +++ b/test-configuration-win.yml @@ -43,7 +43,7 @@ steps: call activate bench-env python -m sklbench --report -l DEBUG --report -c configs/sklearn_example.json displayName: Sklearn example run - - script: | + - script: | call activate bench-env python -m sklbench --report -l DEBUG --report -c configs/incremental.json displayName: Incremental algorithms example run From 9cf382ee7b58bd68d6d826f17d3cf8adb7e493eb Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 23 Sep 2024 11:52:03 -0700 Subject: [PATCH 009/110] refactor and kmeans strong --- configs/spmd/large_scale/basic_stats.json | 2 +- .../spmd/large_scale/basic_stats_strong.json | 2 +- configs/spmd/large_scale/covariance.json | 2 +- .../spmd/large_scale/covariance_strong.json | 2 +- configs/spmd/large_scale/dbscan.json | 2 +- configs/spmd/large_scale/forest.json | 2 +- configs/spmd/large_scale/forest_strong.json | 2 +- configs/spmd/large_scale/kmeans_strong.json | 31 +++++++++++++++++++ configs/spmd/large_scale/knn.json | 2 +- configs/spmd/large_scale/linear_model.json | 2 +- .../spmd/large_scale/linear_model_strong.json | 2 +- configs/spmd/large_scale/logreg.json | 4 +-- configs/spmd/large_scale/logreg_strong.json | 4 +-- configs/spmd/large_scale/pca.json | 4 +-- configs/spmd/large_scale/pca_strong.json | 4 +-- 15 files changed, 49 insertions(+), 18 deletions(-) create mode 100644 configs/spmd/large_scale/kmeans_strong.json diff --git a/configs/spmd/large_scale/basic_stats.json b/configs/spmd/large_scale/basic_stats.json index 9ac4725f..b484b647 100644 --- a/configs/spmd/large_scale/basic_stats.json +++ b/configs/spmd/large_scale/basic_stats.json @@ -22,7 +22,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale 2k parameters", - "synthetic data", + "synthetic data", "spmd basicstats parameters" ] } diff --git a/configs/spmd/large_scale/basic_stats_strong.json b/configs/spmd/large_scale/basic_stats_strong.json index b7aa22cb..6527d8e5 100644 --- a/configs/spmd/large_scale/basic_stats_strong.json +++ b/configs/spmd/large_scale/basic_stats_strong.json @@ -21,7 +21,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale strong 2k parameters", - "synthetic data", + "synthetic data", "spmd basicstats parameters" ] } diff --git a/configs/spmd/large_scale/covariance.json b/configs/spmd/large_scale/covariance.json index 260befd0..e4d0477a 100644 --- a/configs/spmd/large_scale/covariance.json +++ b/configs/spmd/large_scale/covariance.json @@ -22,7 +22,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale 2k parameters", - "synthetic data", + "synthetic data", "spmd basicstats 
parameters" ] } diff --git a/configs/spmd/large_scale/covariance_strong.json b/configs/spmd/large_scale/covariance_strong.json index 568b4a8f..2b9c5dd0 100644 --- a/configs/spmd/large_scale/covariance_strong.json +++ b/configs/spmd/large_scale/covariance_strong.json @@ -21,7 +21,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale strong 2k parameters", - "synthetic data", + "synthetic data", "spmd basicstats parameters" ] } diff --git a/configs/spmd/large_scale/dbscan.json b/configs/spmd/large_scale/dbscan.json index c46287d8..b17e2cd8 100644 --- a/configs/spmd/large_scale/dbscan.json +++ b/configs/spmd/large_scale/dbscan.json @@ -24,7 +24,7 @@ "common dbscan parameters", "synthetic dataset", "sklearnex spmd implementation", - "large scale default parameters", + "large scale default parameters", "spmd dbscan parameters" ] } diff --git a/configs/spmd/large_scale/forest.json b/configs/spmd/large_scale/forest.json index ee614ed3..ea6f3ef7 100644 --- a/configs/spmd/large_scale/forest.json +++ b/configs/spmd/large_scale/forest.json @@ -18,7 +18,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale 2k parameters", - "synthetic data", + "synthetic data", "spmd forest classification parameters" ] } diff --git a/configs/spmd/large_scale/forest_strong.json b/configs/spmd/large_scale/forest_strong.json index 121aa916..0f1ef40e 100644 --- a/configs/spmd/large_scale/forest_strong.json +++ b/configs/spmd/large_scale/forest_strong.json @@ -17,7 +17,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale strong 2k parameters", - "synthetic data", + "synthetic data", "spmd forest classification parameters" ] } diff --git a/configs/spmd/large_scale/kmeans_strong.json b/configs/spmd/large_scale/kmeans_strong.json new file mode 100644 index 00000000..29cfc2e7 --- /dev/null +++ b/configs/spmd/large_scale/kmeans_strong.json @@ -0,0 +1,31 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/kmeans.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd kmeans parameters": { + "algorithm": { + "estimator": "KMeans", + "estimator_params": { + "algorithm": "lloyd" + }, + "estimator_methods": { "training": "fit", "inference": "predict" } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 30000, "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 100, "centers": 100 }, "algorithm": { "n_clusters": 100, "max_iter": 100 } } + ] + } + }, + "TEMPLATES": { + "kmeans": { + "SETS": [ + "synthetic data", + "sklearnex spmd implementation", + "large scale strong 2k parameters", + "spmd kmeans parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/knn.json b/configs/spmd/large_scale/knn.json index 1ef849f1..8dd39f61 100644 --- a/configs/spmd/large_scale/knn.json +++ b/configs/spmd/large_scale/knn.json @@ -32,7 +32,7 @@ "common knn parameters", "synthetic classification data", "sklearnex spmd implementation", - "large scale 2k parameters", + "large scale 2k parameters", "spmd knn cls parameters" ] } diff --git a/configs/spmd/large_scale/linear_model.json b/configs/spmd/large_scale/linear_model.json index aeda4441..e4bb14a1 100644 --- a/configs/spmd/large_scale/linear_model.json +++ b/configs/spmd/large_scale/linear_model.json @@ -19,7 +19,7 @@ 
"SETS": [ "sklearnex spmd implementation", "large scale 2k parameters", - "synthetic data", + "synthetic data", "spmd linear parameters" ] } diff --git a/configs/spmd/large_scale/linear_model_strong.json b/configs/spmd/large_scale/linear_model_strong.json index 77a9c79e..9d8c3533 100644 --- a/configs/spmd/large_scale/linear_model_strong.json +++ b/configs/spmd/large_scale/linear_model_strong.json @@ -18,7 +18,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale strong 2k parameters", - "synthetic data", + "synthetic data", "spmd linear parameters" ] } diff --git a/configs/spmd/large_scale/logreg.json b/configs/spmd/large_scale/logreg.json index c5ef6203..ccef906b 100644 --- a/configs/spmd/large_scale/logreg.json +++ b/configs/spmd/large_scale/logreg.json @@ -21,8 +21,8 @@ "sklearnex spmd implementation", "large scale 2k parameters", "spmd logreg parameters", - "synthetic data", - "spmd logreg2 parameters" + "synthetic data", + "spmd logreg2 parameters" ] } } diff --git a/configs/spmd/large_scale/logreg_strong.json b/configs/spmd/large_scale/logreg_strong.json index 2bf1c0f9..a6efd969 100644 --- a/configs/spmd/large_scale/logreg_strong.json +++ b/configs/spmd/large_scale/logreg_strong.json @@ -20,8 +20,8 @@ "sklearnex spmd implementation", "large scale strong 2k parameters", "spmd logreg parameters", - "synthetic data", - "spmd logreg2 parameters" + "synthetic data", + "spmd logreg2 parameters" ] } } diff --git a/configs/spmd/large_scale/pca.json b/configs/spmd/large_scale/pca.json index 9a6a6b02..3b9da126 100644 --- a/configs/spmd/large_scale/pca.json +++ b/configs/spmd/large_scale/pca.json @@ -20,10 +20,10 @@ "TEMPLATES": { "linreg": { "SETS": [ - "sklearnex spmd implementation", + "sklearnex spmd implementation", "large scale 2k parameters", "synthetic data", - "spmd pca parameters" + "spmd pca parameters" ] } } diff --git a/configs/spmd/large_scale/pca_strong.json b/configs/spmd/large_scale/pca_strong.json index adee3c79..2d302340 100644 --- a/configs/spmd/large_scale/pca_strong.json +++ b/configs/spmd/large_scale/pca_strong.json @@ -19,10 +19,10 @@ "TEMPLATES": { "linreg": { "SETS": [ - "sklearnex spmd implementation", + "sklearnex spmd implementation", "large scale strong 2k parameters", "synthetic data", - "spmd pca parameters" + "spmd pca parameters" ] } } From 6c8f529fd74f6f0bad1c36d6d7a8878e168624cb Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 23 Sep 2024 12:07:49 -0700 Subject: [PATCH 010/110] refactor and add config --- configs/spmd/large_scale/kmeans_strong_2.json | 31 ++++++++++++++++ configs/spmd/large_scale/large_scale.json | 36 ++++++++++++++----- configs/spmd/large_scale/logreg_2.json | 29 +++++++++++++++ configs/spmd/large_scale/logreg_strong_2.json | 28 +++++++++++++++ 4 files changed, 115 insertions(+), 9 deletions(-) create mode 100644 configs/spmd/large_scale/kmeans_strong_2.json create mode 100644 configs/spmd/large_scale/logreg_2.json create mode 100644 configs/spmd/large_scale/logreg_strong_2.json diff --git a/configs/spmd/large_scale/kmeans_strong_2.json b/configs/spmd/large_scale/kmeans_strong_2.json new file mode 100644 index 00000000..03f2bc59 --- /dev/null +++ b/configs/spmd/large_scale/kmeans_strong_2.json @@ -0,0 +1,31 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/kmeans.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd kmeans parameters": { + "algorithm": { + "estimator": "KMeans", + "estimator_params": { + "algorithm": "lloyd" + }, + "estimator_methods": { "training": "fit", "inference": "predict" } + } + }, 
+ "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 30000, "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 100, "centers": 100 }, "algorithm": { "n_clusters": 100, "max_iter": 100 } } + ] + } + }, + "TEMPLATES": { + "kmeans": { + "SETS": [ + "synthetic data", + "sklearnex spmd implementation", + "large scale strong two nodes parameters", + "spmd kmeans parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json index 72b808fe..1cde18f6 100644 --- a/configs/spmd/large_scale/large_scale.json +++ b/configs/spmd/large_scale/large_scale.json @@ -3,13 +3,13 @@ "large scale default parameters": { "data": { "dtype": "float64", - "distributed_split": "None" + "distributed_split": "None" }, "bench": { "mpi_params": {"n": [1,2,6,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, - "large scale strong parameters": { + "large scale strong parameters": { "data": { "dtype": "float64", "distributed_split": "rank_based" @@ -18,7 +18,7 @@ "mpi_params": {"n": [1,2,6,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, - "large scale 2k parameters": { + "large scale 2k parameters": { "data": { "dtype": "float64", "distributed_split": "None" @@ -27,6 +27,15 @@ "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, + "large scale two nodes parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [24], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, "large scale strong 2k parameters": { "data": { "dtype": "float64", @@ -36,14 +45,23 @@ "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, - "large scale impi parameters": { - "data": { - "dtype": "float64", + "large scale strong two nodes parameters": { + "data": { + "dtype": "float64", + "distributed_split": "rank_based" + }, + "bench": { + "mpi_params": {"n": [24], "ppn": 12, "-hostfile": "", 
"-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale impi parameters": { + "data": { + "dtype": "float64", "distributed_split": "no" }, "bench": { - "mpi_params": {"n": [1,2,4,6,9,12], "ppn": 12} - } - } + "mpi_params": {"n": [1,2,4,6,9,12], "ppn": 12} + } + } } } diff --git a/configs/spmd/large_scale/logreg_2.json b/configs/spmd/large_scale/logreg_2.json new file mode 100644 index 00000000..d18b2293 --- /dev/null +++ b/configs/spmd/large_scale/logreg_2.json @@ -0,0 +1,29 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/logreg.json", "../logreg.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd logreg2 parameters": { + "algorithm":{ + "estimator": "LogisticRegression", + "estimator_methods": { "inference": "predict" }, + "estimator_params": { "max_iter": 20 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 10, "n_classes": 2 } }, + { "source": "make_classification", "split_kwargs": { "train_size": 100000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 101000, "n_features": 1000, "n_classes": 2 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale two nodes parameters", + "spmd logreg parameters", + "synthetic data", + "spmd logreg2 parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/logreg_strong_2.json b/configs/spmd/large_scale/logreg_strong_2.json new file mode 100644 index 00000000..1a940d90 --- /dev/null +++ b/configs/spmd/large_scale/logreg_strong_2.json @@ -0,0 +1,28 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/logreg.json", "../logreg.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd logreg2 parameters": { + "algorithm":{ + "estimator": "LogisticRegression", + "estimator_methods": { "inference": "predict" }, + "estimator_params": { "max_iter": 30 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_classification", "split_kwargs": { "train_size": 10000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 10001000, "n_features": 100, "n_classes": 2 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale strong two nodes parameters", + "spmd logreg parameters", + "synthetic data", + "spmd logreg2 parameters" + ] + } + } +} From 3867a8607e33ed6378055860bb986a09b123638f Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 23 Sep 2024 12:48:53 -0700 Subject: [PATCH 011/110] strong reduce nodes --- configs/spmd/large_scale/large_scale.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json index 1cde18f6..bf99dd5c 100644 --- a/configs/spmd/large_scale/large_scale.json +++ b/configs/spmd/large_scale/large_scale.json @@ -42,7 +42,7 @@ "distributed_split": "rank_based" }, "bench": { - "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + "mpi_params": {"n": 
[1,2,6,12,24,48,96,192,384,768], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, "large scale strong two nodes parameters": { From ed875b4f68207e0c745751ef75853a4e1b6a60bf Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 23 Sep 2024 14:00:02 -0700 Subject: [PATCH 012/110] forest reg config --- configs/spmd/large_scale/forest.json | 2 +- configs/spmd/large_scale/forest_strong.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/spmd/large_scale/forest.json b/configs/spmd/large_scale/forest.json index ea6f3ef7..5aa3d36f 100644 --- a/configs/spmd/large_scale/forest.json +++ b/configs/spmd/large_scale/forest.json @@ -14,7 +14,7 @@ } }, "TEMPLATES": { - "basicstats": { + "forestCls": { "SETS": [ "sklearnex spmd implementation", "large scale 2k parameters", diff --git a/configs/spmd/large_scale/forest_strong.json b/configs/spmd/large_scale/forest_strong.json index 0f1ef40e..14690846 100644 --- a/configs/spmd/large_scale/forest_strong.json +++ b/configs/spmd/large_scale/forest_strong.json @@ -13,7 +13,7 @@ } }, "TEMPLATES": { - "basicstats": { + "forestCls": { "SETS": [ "sklearnex spmd implementation", "large scale strong 2k parameters", From c596a56cc5ba48557368610e74481bf1b8e00b96 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 23 Sep 2024 14:00:12 -0700 Subject: [PATCH 013/110] forest reg config --- configs/spmd/large_scale/forest_reg.json | 25 +++++++++++++++++++ .../spmd/large_scale/forest_strong_reg.json | 25 +++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 configs/spmd/large_scale/forest_reg.json create mode 100644 configs/spmd/large_scale/forest_strong_reg.json diff --git a/configs/spmd/large_scale/forest_reg.json b/configs/spmd/large_scale/forest_reg.json new file mode 100644 index 00000000..ab2a6920 --- /dev/null +++ b/configs/spmd/large_scale/forest_reg.json @@ -0,0 +1,25 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd forest regression parameters": { + "algorithm": { + "estimator": "RandomForestRegressor" + } + }, + "synthetic data": { + "data": [ + { "source": "make_regression", "generation_kwargs": { "n_samples": 10000000, "n_features": 100, "noise": 1.25 }, "split_kwargs": { "train_size": 10000000, "test_size": 5000 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } }} + ] + } + }, + "TEMPLATES": { + "forestReg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale 2k parameters", + "synthetic data", + "spmd forest regression parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/forest_strong_reg.json b/configs/spmd/large_scale/forest_strong_reg.json new file mode 100644 index 00000000..71afeee6 --- /dev/null +++ b/configs/spmd/large_scale/forest_strong_reg.json @@ -0,0 +1,25 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd forest regression parameters": { + "algorithm": { + "estimator": "RandomForestRegressor" + } + }, + "synthetic data": { + "data": [ + { "source": "make_regression", "generation_kwargs": { "n_samples": 10000000, "n_features": 100, "noise": 1.25 }, "split_kwargs": { "train_size": 10000000, "test_size": 5000 }, "algorithm": { "estimator_params": { "n_estimators": 20, 
"max_depth": 4 } }} + ] + } + }, + "TEMPLATES": { + "forestReg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale strong 2k parameters", + "synthetic data", + "spmd forest regression parameters" + ] + } + } +} From 4fee9911538d6a7794d89ece46bc0bea6b5bdf44 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 23 Sep 2024 14:11:50 -0700 Subject: [PATCH 014/110] KNN weak --- configs/spmd/large_scale/knn.json | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/configs/spmd/large_scale/knn.json b/configs/spmd/large_scale/knn.json index 8dd39f61..a7672ef5 100644 --- a/configs/spmd/large_scale/knn.json +++ b/configs/spmd/large_scale/knn.json @@ -19,10 +19,8 @@ }, "synthetic classification data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, - { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, - { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, - { "source": "make_classification", "split_kwargs": { "train_size": 200000, "test_size": 200000 }, "generation_kwargs": { "n_samples": 400000, "n_features": 10, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, + { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } ] } }, From fce0651d81adab57fe2f1b95a85fb0f9e37d246d Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 23 Sep 2024 14:15:00 -0700 Subject: [PATCH 015/110] KNN strong --- configs/spmd/large_scale/knn_strong.json | 38 ++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 configs/spmd/large_scale/knn_strong.json diff --git a/configs/spmd/large_scale/knn_strong.json b/configs/spmd/large_scale/knn_strong.json new file mode 100644 index 00000000..15cc0226 --- /dev/null +++ b/configs/spmd/large_scale/knn_strong.json @@ -0,0 +1,38 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/knn.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd knn cls parameters": { + "algorithm": { + "estimator": "KNeighborsClassifier", + "estimator_params": { + "algorithm": "brute", + "metric": "minkowski", + "p": 2, + "weights": "uniform", + "n_neighbors": 5 + }, + "estimator_methods": { + "training": "fit", + "inference": "predict" + } + } + }, + "synthetic classification data": { + "data": [ + { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, + { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + ] + } + 
}, + "TEMPLATES": { + "knn classifier": { + "SETS": [ + "common knn parameters", + "synthetic classification data", + "sklearnex spmd implementation", + "large scale strong 2k parameters", + "spmd knn cls parameters" + ] + } + } +} From e1ff9a0ab0e86ed500059e37c7aca6e6ed2d2b29 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 23 Sep 2024 23:07:52 -0700 Subject: [PATCH 016/110] experiment with ppn --- configs/spmd/large_scale/large_scale.json | 18 ++++++++++++++ configs/spmd/large_scale/pca_single.json | 30 +++++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 configs/spmd/large_scale/pca_single.json diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json index bf99dd5c..fcddc722 100644 --- a/configs/spmd/large_scale/large_scale.json +++ b/configs/spmd/large_scale/large_scale.json @@ -18,6 +18,24 @@ "mpi_params": {"n": [1,2,6,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, + "large scale one node parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12], "ppn": [1,2,6,12], "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale strong one node parameters": { + "data": { + "dtype": "float64", + "distributed_split": "rank_based" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12], "ppn": [1,2,6,12], "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, "large scale 2k parameters": { "data": { "dtype": "float64", diff --git a/configs/spmd/large_scale/pca_single.json b/configs/spmd/large_scale/pca_single.json new file mode 100644 index 00000000..61b2cf15 --- /dev/null +++ b/configs/spmd/large_scale/pca_single.json @@ -0,0 +1,30 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/pca.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd pca parameters": { + "algorithm": { + "estimator": "PCA", + "estimator_methods": { "training": "fit", "inference": "" } + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale one node parameters", + "synthetic data", + "spmd pca parameters" + ] + } + } +} From e3d9a35a79869f6f6efa074c0a16f8b0948e85cc Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 23 Sep 2024 23:20:26 -0700 Subject: [PATCH 017/110] experiment with ppn --- configs/spmd/large_scale/large_scale.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json index fcddc722..cf81cbf0 100644 --- a/configs/spmd/large_scale/large_scale.json +++ b/configs/spmd/large_scale/large_scale.json @@ 
-24,7 +24,7 @@ "distributed_split": "None" }, "bench": { - "mpi_params": {"n": [1,2,6,12], "ppn": [1,2,6,12], "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + "mpi_params": {"n": [1,2,3,4,5,6,7,8,9,10,11,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, "large scale strong one node parameters": { @@ -33,7 +33,7 @@ "distributed_split": "rank_based" }, "bench": { - "mpi_params": {"n": [1,2,6,12], "ppn": [1,2,6,12], "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + "mpi_params": {"n": [1,2,3,4,5,6,7,8,9,10,11,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, "large scale 2k parameters": { From 817710b83d09d1c44b523e10e4365144cbf14113 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 24 Sep 2024 00:05:42 -0700 Subject: [PATCH 018/110] bf16 --- configs/regular/bf16/dbscan.json | 41 ++++++++++++++++++++ configs/regular/bf16/ensemble.json | 45 ++++++++++++++++++++++ configs/regular/bf16/kmeans.json | 40 +++++++++++++++++++ configs/regular/bf16/knn.json | 34 ++++++++++++++++ configs/regular/bf16/linear_model.json | 29 ++++++++++++++ configs/regular/bf16/logreg.json | 42 ++++++++++++++++++++ configs/regular/bf16/pca.json | 33 ++++++++++++++++ configs/spmd/large_scale/linear_model.json | 2 +- configs/spmd/large_scale/logreg.json | 2 +- 9 files changed, 266 insertions(+), 2 deletions(-) create mode 100644 configs/regular/bf16/dbscan.json create mode 100644 configs/regular/bf16/ensemble.json create mode 100644 configs/regular/bf16/kmeans.json create mode 100644 configs/regular/bf16/knn.json create mode 100644 configs/regular/bf16/linear_model.json create mode 100644 configs/regular/bf16/logreg.json create mode 100644 configs/regular/bf16/pca.json diff --git a/configs/regular/bf16/dbscan.json b/configs/regular/bf16/dbscan.json new file mode 100644 index 00000000..26e87ad6 --- /dev/null +++ b/configs/regular/bf16/dbscan.json @@ -0,0 +1,41 @@ +{ + "INCLUDE": ["../common/sklearn.json"], + "PARAMETERS_SETS": { + "common dbscan parameters": { + "algorithm": { + "estimator": "DBSCAN", + "estimator_params": { + "eps": "[SPECIAL_VALUE]distances_quantile:0.01", + "min_samples": 5, + "metric": "euclidean" + } + }, + "data": { + "dtype": ["float32"] + } + }, + "sklearn dbscan parameters": { + "algorithm": { + "estimator_params": { + "algorithm": "brute", + "n_jobs": "[SPECIAL_VALUE]physical_cpus" + } + } + }, + "synthetic dataset": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 10, "centers": 10 }, "algorithm": { "eps": 5, "min_samples": 5 } } + ] + } + }, + "TEMPLATES": { + "sklearn dbscan": { + "SETS": [ + "sklearn-ex[gpu] implementations", + "common dbscan parameters", + "sklearn dbscan parameters", + "synthetic dataset" + ] + } + } +} diff --git a/configs/regular/bf16/ensemble.json 
b/configs/regular/bf16/ensemble.json new file mode 100644 index 00000000..f883a7af --- /dev/null +++ b/configs/regular/bf16/ensemble.json @@ -0,0 +1,45 @@ +{ + "INCLUDE": ["../common/sklearn.json"], + "PARAMETERS_SETS": { + "common ensemble params": { + "algorithm": { + "estimator_params": { + "n_estimators": 200, + "max_depth": 16, + "max_samples": 1.0, + "min_samples_split": 5, + "min_samples_leaf": 2, + "min_impurity_decrease": 0.0, + "bootstrap": true, + "random_state": 42 + } + } + }, + "sklearn ensemble classifier params": { + "algorithm": { + "estimator": ["RandomForestClassifier", "ExtraTreesClassifier"], + "estimator_params": { + "criterion": "gini", + "max_features": "sqrt", + "max_leaf_nodes": null, + "n_jobs": "[SPECIAL_VALUE]physical_cpus" + } + } + }, + "synthetic data": { + "data": [ + { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 501000, "n_features": 10, "n_classes": 2 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } } } + ] + } + }, + "TEMPLATES": { + "sklearn ensemble classification": { + "SETS": [ + "sklearn-ex[gpu] implementations", + "common ensemble params", + "sklearn ensemble classifier params", + "synthetic data" + ] + } + } +} diff --git a/configs/regular/bf16/kmeans.json b/configs/regular/bf16/kmeans.json new file mode 100644 index 00000000..1141e641 --- /dev/null +++ b/configs/regular/bf16/kmeans.json @@ -0,0 +1,40 @@ +{ + "INCLUDE": ["../common/sklearn.json"], + "PARAMETERS_SETS": { + "common kmeans parameters": { + "algorithm": { + "estimator": "KMeans", + "estimator_params": { + "n_clusters": "[SPECIAL_VALUE]auto", + "n_init": 1, + "max_iter": 30, + "tol": 1e-3, + "random_state": 42 + }, + "estimator_methods": { "inference": "predict" } + }, + "data": { + "dtype": ["float32", "float64"], + "preprocessing_kwargs": { "normalize": true } + } + }, + "sklearn kmeans parameters": { + "algorithm": { "estimator_params": { "init": "k-means++", "algorithm": "lloyd" } } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 100, "centers": 100 }, "algorithm": { "n_clusters": 100, "max_iter": 100 } } + ] + } + }, + "TEMPLATES": { + "sklearn kmeans": { + "SETS": [ + "sklearn-ex[gpu] implementations", + "common kmeans parameters", + "sklearn kmeans parameters", + "synthetic data" + ] + } + } +} diff --git a/configs/regular/bf16/knn.json b/configs/regular/bf16/knn.json new file mode 100644 index 00000000..e6bdcf4e --- /dev/null +++ b/configs/regular/bf16/knn.json @@ -0,0 +1,34 @@ +{ + "INCLUDE": ["../common/sklearn.json"], + "PARAMETERS_SETS": { + "common knn parameters": { + "algorithm": { + "estimator_params": { + "n_neighbors": [10, 100], + "weights": "uniform" + } + }, + "data": { + "preprocessing_kwargs": { "normalize": true } + } + }, + "sklearn knn parameters": { + "algorithm": { "estimator_params": { "n_jobs": "[SPECIAL_VALUE]physical_cpus" } } + }, + "synthetic classification data": { + "data": [ + { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + ] + } + }, + "TEMPLATES": { + "sklearn brute knn clsf": { + "SETS": [ + "sklearn-ex[gpu] implementations", + "common knn parameters", + "sklearn knn parameters", + "synthetic classification data" + ] + } + } +} diff --git a/configs/regular/bf16/linear_model.json 
b/configs/regular/bf16/linear_model.json new file mode 100644 index 00000000..528f8cca --- /dev/null +++ b/configs/regular/bf16/linear_model.json @@ -0,0 +1,29 @@ +{ + "INCLUDE": ["../common/sklearn.json"], + "PARAMETERS_SETS": { + "synthetic data": { + "data": [ + { "source": "make_regression", "generation_kwargs": { "n_samples": 3005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 3000000, "test_size": 5000 } } + ] + }, + "common linear parameters": { + "algorithm": { + "estimator": "LinearRegression", + "estimator_params": { "fit_intercept": true, "copy_X": true } + } + }, + "sklearn linear parameters": { + "estimator_params": { "n_jobs": "[SPECIAL_VALUE]physical_cpus" } + } + }, + "TEMPLATES": { + "sklearn linear": { + "SETS": [ + "sklearn-ex[gpu] implementations", + "common linear parameters", + "sklearn linear parameters", + "synthetic data" + ] + } + } +} diff --git a/configs/regular/bf16/logreg.json b/configs/regular/bf16/logreg.json new file mode 100644 index 00000000..0dd26e40 --- /dev/null +++ b/configs/regular/bf16/logreg.json @@ -0,0 +1,42 @@ +{ + "INCLUDE": ["../common/sklearn.json"], + "PARAMETERS_SETS": { + "common logreg parameters": { + "algorithm": { + "estimator": "LogisticRegression", + "estimator_methods": { "inference": "predict" }, + "estimator_params": { + "penalty": "l2", + "tol": 1e-4, + "C": 1.0, + "l1_ratio": null, + "max_iter": 200 + } + } + }, + "sklearn logreg parameters": { + "algorithm": { + "estimator_params": { + "solver": "lbfgs", + "n_jobs": "[SPECIAL_VALUE]physical_cpus", + "random_state": 42 + } + } + }, + "synthetic data": { + "data": [ + { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 10, "n_classes": 2 } } + ] + } + }, + "TEMPLATES": { + "sklearn logreg": { + "SETS": [ + "sklearn-ex[gpu] implementations", + "common logreg parameters", + "sklearn logreg parameters", + "synthetic data" + ] + } + } +} diff --git a/configs/regular/bf16/pca.json b/configs/regular/bf16/pca.json new file mode 100644 index 00000000..9295aea5 --- /dev/null +++ b/configs/regular/bf16/pca.json @@ -0,0 +1,33 @@ +{ + "INCLUDE": ["../common/sklearn.json"], + "PARAMETERS_SETS": { + "pca parameters": { + "algorithm": { + "estimator": "PCA", + "estimator_params": { + "n_components": 3, + "copy": true, + "whiten": false, + "svd_solver": "covariance_eigh", + "tol": 0.0, + "iterated_power": 15, + "random_state": 42 + } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "sklearn pca": { + "SETS": [ + "sklearn-ex[gpu] implementations", + "pca parameters", + "synthetic data" + ] + } + } +} diff --git a/configs/spmd/large_scale/linear_model.json b/configs/spmd/large_scale/linear_model.json index e4bb14a1..e208da7d 100644 --- a/configs/spmd/large_scale/linear_model.json +++ b/configs/spmd/large_scale/linear_model.json @@ -10,7 +10,7 @@ "synthetic data": { "data": [ { "source": "make_regression", "generation_kwargs": { "n_samples": 30005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 30000000, "test_size": 5000 } }, - { "source": "make_regression", "generation_kwargs": { "n_samples": 305000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 300000, "test_size": 5000 } } + { "source": "make_regression", "generation_kwargs": { "n_samples": 305000, "n_features": 1000, "noise": 1.25 }, 
"split_kwargs": { "train_size": 300000, "test_size": 5000 } } ] } }, diff --git a/configs/spmd/large_scale/logreg.json b/configs/spmd/large_scale/logreg.json index ccef906b..bbd18f3b 100644 --- a/configs/spmd/large_scale/logreg.json +++ b/configs/spmd/large_scale/logreg.json @@ -10,7 +10,7 @@ }, "synthetic data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 10, "n_classes": 2 } }, + { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 10, "n_classes": 2 } }, { "source": "make_classification", "split_kwargs": { "train_size": 100000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 101000, "n_features": 1000, "n_classes": 2 } } ] } From aaa00392bfb3e54b7d4fa4f14a12bf8ef8f5fd66 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 24 Sep 2024 00:21:53 -0700 Subject: [PATCH 019/110] bf16 --- configs/regular/bf16/dbscan.json | 2 +- configs/regular/bf16/ensemble.json | 2 +- configs/regular/bf16/kmeans.json | 2 +- configs/regular/bf16/knn.json | 2 +- configs/regular/bf16/linear_model.json | 2 +- configs/regular/bf16/logreg.json | 2 +- configs/regular/bf16/pca.json | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/configs/regular/bf16/dbscan.json b/configs/regular/bf16/dbscan.json index 26e87ad6..b91120e8 100644 --- a/configs/regular/bf16/dbscan.json +++ b/configs/regular/bf16/dbscan.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../common/sklearn.json"], + "INCLUDE": ["../../common/sklearn.json"], "PARAMETERS_SETS": { "common dbscan parameters": { "algorithm": { diff --git a/configs/regular/bf16/ensemble.json b/configs/regular/bf16/ensemble.json index f883a7af..d383bcac 100644 --- a/configs/regular/bf16/ensemble.json +++ b/configs/regular/bf16/ensemble.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../common/sklearn.json"], + "INCLUDE": ["../../common/sklearn.json"], "PARAMETERS_SETS": { "common ensemble params": { "algorithm": { diff --git a/configs/regular/bf16/kmeans.json b/configs/regular/bf16/kmeans.json index 1141e641..084ae8f4 100644 --- a/configs/regular/bf16/kmeans.json +++ b/configs/regular/bf16/kmeans.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../common/sklearn.json"], + "INCLUDE": ["../../common/sklearn.json"], "PARAMETERS_SETS": { "common kmeans parameters": { "algorithm": { diff --git a/configs/regular/bf16/knn.json b/configs/regular/bf16/knn.json index e6bdcf4e..1a62ef89 100644 --- a/configs/regular/bf16/knn.json +++ b/configs/regular/bf16/knn.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../common/sklearn.json"], + "INCLUDE": ["../../common/sklearn.json"], "PARAMETERS_SETS": { "common knn parameters": { "algorithm": { diff --git a/configs/regular/bf16/linear_model.json b/configs/regular/bf16/linear_model.json index 528f8cca..7149e490 100644 --- a/configs/regular/bf16/linear_model.json +++ b/configs/regular/bf16/linear_model.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../common/sklearn.json"], + "INCLUDE": ["../../common/sklearn.json"], "PARAMETERS_SETS": { "synthetic data": { "data": [ diff --git a/configs/regular/bf16/logreg.json b/configs/regular/bf16/logreg.json index 0dd26e40..cde74c25 100644 --- a/configs/regular/bf16/logreg.json +++ b/configs/regular/bf16/logreg.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../common/sklearn.json"], + "INCLUDE": ["../../common/sklearn.json"], "PARAMETERS_SETS": { "common logreg parameters": { "algorithm": { diff --git 
a/configs/regular/bf16/pca.json b/configs/regular/bf16/pca.json index 9295aea5..945c2939 100644 --- a/configs/regular/bf16/pca.json +++ b/configs/regular/bf16/pca.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../common/sklearn.json"], + "INCLUDE": ["../../common/sklearn.json"], "PARAMETERS_SETS": { "pca parameters": { "algorithm": { From 03a152a13c62eef3fa66b61109b76874d4e9b2b1 Mon Sep 17 00:00:00 2001 From: "Kruglov, Oleg" Date: Tue, 24 Sep 2024 02:46:36 -0700 Subject: [PATCH 020/110] Remove samples/ms info --- sklbench/benchmarks/sklearn_estimator.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index 4cdde86d..7e616273 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -458,10 +458,6 @@ def measure_sklearn_estimator( metrics[method]["time std[ms]"], _, ) = measure_case(bench_case, method_instance, *data_args) - if batch_size is not None: - metrics[method]["throughput[samples/ms]"] = ( - (data_args[0].shape[0] // batch_size) * batch_size - ) / metrics[method]["time[ms]"] if ensure_sklearnex_patching: full_method_name = f"{estimator_class.__name__}.{method}" sklearnex_logging_stream.seek(0) From b7d962e0c51d815a5f93da4c792ec8a8f1f9e4ce Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 24 Sep 2024 06:32:47 -0700 Subject: [PATCH 021/110] knn --- configs/spmd/large_scale/knn.json | 4 ++-- configs/spmd/large_scale/knn_strong.json | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/spmd/large_scale/knn.json b/configs/spmd/large_scale/knn.json index a7672ef5..cfd096cf 100644 --- a/configs/spmd/large_scale/knn.json +++ b/configs/spmd/large_scale/knn.json @@ -19,8 +19,8 @@ }, "synthetic classification data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, - { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 5000 }, "generation_kwargs": { "n_samples": 55000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, + { "source": "make_classification", "split_kwargs": { "train_size": 5000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 55000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } ] } }, diff --git a/configs/spmd/large_scale/knn_strong.json b/configs/spmd/large_scale/knn_strong.json index 15cc0226..7682dc5e 100644 --- a/configs/spmd/large_scale/knn_strong.json +++ b/configs/spmd/large_scale/knn_strong.json @@ -19,8 +19,8 @@ }, "synthetic classification data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, - { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 5000 }, "generation_kwargs": { 
"n_samples": 505000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, + { "source": "make_classification", "split_kwargs": { "train_size": 5000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 500500, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } ] } }, From 3ac5c236eb6255892e607a6122d4d2187e4c5451 Mon Sep 17 00:00:00 2001 From: "Kruglov, Oleg" Date: Tue, 24 Sep 2024 06:45:42 -0700 Subject: [PATCH 022/110] Remove BS from config (need to add after pip version update) --- configs/incremental.json | 3 --- 1 file changed, 3 deletions(-) diff --git a/configs/incremental.json b/configs/incremental.json index 5f7a5477..c9ffb19c 100644 --- a/configs/incremental.json +++ b/configs/incremental.json @@ -88,9 +88,6 @@ }, "TEMPLATES": { "covariance": {"SETS": ["common", "covariance", "covariance data"]}, - "basic_statistics": { - "SETS": ["common", "basic_statistics", "basic_statistics data"] - }, "linear_regression": { "SETS": ["common", "linear_regression", "linear_regression data"] }, From 87b6fa6674f4b2e222c765e77a9f74b2bc786959 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 24 Sep 2024 22:32:38 -0700 Subject: [PATCH 023/110] basic stat single --- .../spmd/large_scale/basic_stats_single.json | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 configs/spmd/large_scale/basic_stats_single.json diff --git a/configs/spmd/large_scale/basic_stats_single.json b/configs/spmd/large_scale/basic_stats_single.json new file mode 100644 index 00000000..e106b2a9 --- /dev/null +++ b/configs/spmd/large_scale/basic_stats_single.json @@ -0,0 +1,30 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd basicstats parameters": { + "algorithm": { + "estimator": "BasicStatistics", + "estimator_methods": { "training": "fit" } + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "basicstats": { + "SETS": [ + "sklearnex spmd implementation", + "large scale one node parameters", + "synthetic data", + "spmd basicstats parameters" + ] + } + } +} From 9461fad69a00ecbf69a3e5fcef662fb1bafd4253 Mon Sep 17 00:00:00 2001 From: "Kruglov, Oleg" Date: Wed, 25 Sep 2024 02:00:29 -0700 Subject: [PATCH 024/110] Add condition for finalize --- sklbench/benchmarks/sklearn_estimator.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index 7e616273..52f5bf4e 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -347,7 +347,8 @@ def ndarray_function(x, y): x[i * batch_size : (i + 1) * batch_size], y[i * batch_size : (i + 1) * batch_size], ) - estimator_instance._onedal_finalize_fit() + if hasattr(estimator_instance, "_onedal_finalize_fit"): + estimator_instance._onedal_finalize_fit() def dataframe_function(x, y): for i in range(n_batches): @@ -355,19 +356,22 @@ def dataframe_function(x, y): x.iloc[i * batch_size : (i + 1) * batch_size], y.iloc[i * batch_size : (i + 1) * batch_size], ) - estimator_instance._onedal_finalize_fit() + if hasattr(estimator_instance, "_onedal_finalize_fit"): + 
estimator_instance._onedal_finalize_fit() else: def ndarray_function(x): for i in range(n_batches): method_instance(x[i * batch_size : (i + 1) * batch_size]) - estimator_instance._onedal_finalize_fit() + if hasattr(estimator_instance, "_onedal_finalize_fit"): + estimator_instance._onedal_finalize_fit() def dataframe_function(x): for i in range(n_batches): method_instance(x.iloc[i * batch_size : (i + 1) * batch_size]) - estimator_instance._onedal_finalize_fit() + if hasattr(estimator_instance, "_onedal_finalize_fit"): + estimator_instance._onedal_finalize_fit() if "ndarray" in str(type(data_args[0])): return ndarray_function From b82d772f26c1af7d261b78bf94ae97280c23c9e2 Mon Sep 17 00:00:00 2001 From: "Kruglov, Oleg" Date: Wed, 25 Sep 2024 09:51:39 -0700 Subject: [PATCH 025/110] Fix num_batches usage --- sklbench/benchmarks/sklearn_estimator.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index 52f5bf4e..3f8b1641 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -337,12 +337,11 @@ def verify_patching(stream: io.StringIO, function_name) -> bool: def create_online_function( estimator_instance, method_instance, data_args, num_batches, batch_size ): - n_batches = data_args[0].shape[0] // batch_size if "y" in list(inspect.signature(method_instance).parameters): def ndarray_function(x, y): - for i in range(n_batches): + for i in range(num_batches): method_instance( x[i * batch_size : (i + 1) * batch_size], y[i * batch_size : (i + 1) * batch_size], @@ -351,7 +350,7 @@ def ndarray_function(x, y): estimator_instance._onedal_finalize_fit() def dataframe_function(x, y): - for i in range(n_batches): + for i in range(num_batches): method_instance( x.iloc[i * batch_size : (i + 1) * batch_size], y.iloc[i * batch_size : (i + 1) * batch_size], @@ -362,13 +361,13 @@ def dataframe_function(x, y): else: def ndarray_function(x): - for i in range(n_batches): + for i in range(num_batches): method_instance(x[i * batch_size : (i + 1) * batch_size]) if hasattr(estimator_instance, "_onedal_finalize_fit"): estimator_instance._onedal_finalize_fit() def dataframe_function(x): - for i in range(n_batches): + for i in range(num_batches): method_instance(x.iloc[i * batch_size : (i + 1) * batch_size]) if hasattr(estimator_instance, "_onedal_finalize_fit"): estimator_instance._onedal_finalize_fit() From c70e1222a94d25c51e2239dff8430545383b7f56 Mon Sep 17 00:00:00 2001 From: ethanglaser Date: Fri, 30 Aug 2024 23:27:21 +0000 Subject: [PATCH 026/110] Creating branch for large scale measurements --- configs/spmd/large_scale/basic_stats.json | 30 +++++++++++++++ configs/spmd/large_scale/covariance.json | 30 +++++++++++++++ configs/spmd/large_scale/dbscan.json | 32 ++++++++++++++++ configs/spmd/large_scale/kmeans.json | 32 ++++++++++++++++ configs/spmd/large_scale/knn.json | 43 ++++++++++++++++++++++ configs/spmd/large_scale/large_scale.json | 31 ++++++++++++++++ configs/spmd/large_scale/linear_model.json | 27 ++++++++++++++ configs/spmd/large_scale/logreg.json | 29 +++++++++++++++ configs/spmd/large_scale/pca.json | 30 +++++++++++++++ sklbench/benchmarks/sklearn_estimator.py | 8 ++-- sklbench/datasets/transformer.py | 9 +++-- sklbench/runner/commands_helper.py | 3 ++ 12 files changed, 298 insertions(+), 6 deletions(-) create mode 100644 configs/spmd/large_scale/basic_stats.json create mode 100644 configs/spmd/large_scale/covariance.json create mode 100644 
configs/spmd/large_scale/dbscan.json create mode 100644 configs/spmd/large_scale/kmeans.json create mode 100644 configs/spmd/large_scale/knn.json create mode 100644 configs/spmd/large_scale/large_scale.json create mode 100644 configs/spmd/large_scale/linear_model.json create mode 100644 configs/spmd/large_scale/logreg.json create mode 100644 configs/spmd/large_scale/pca.json diff --git a/configs/spmd/large_scale/basic_stats.json b/configs/spmd/large_scale/basic_stats.json new file mode 100644 index 00000000..a9542017 --- /dev/null +++ b/configs/spmd/large_scale/basic_stats.json @@ -0,0 +1,30 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd basicstats parameters": { + "algorithm": { + "estimator": "BasicStatistics", + "estimator_methods": { "training": "compute" } + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "basicstats": { + "SETS": [ + "sklearnex spmd implementation", + "large scale default parameters", + "synthetic data", + "spmd basicstats parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/covariance.json b/configs/spmd/large_scale/covariance.json new file mode 100644 index 00000000..3280bf77 --- /dev/null +++ b/configs/spmd/large_scale/covariance.json @@ -0,0 +1,30 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd basicstats parameters": { + "algorithm": { + "estimator": "EmpiricalCovariance", + "estimator_methods": { "training": "fit" } + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "covariance": { + "SETS": [ + "sklearnex spmd implementation", + "large scale default parameters", + "synthetic data", + "spmd basicstats parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/dbscan.json b/configs/spmd/large_scale/dbscan.json new file mode 100644 index 00000000..c46287d8 --- /dev/null +++ b/configs/spmd/large_scale/dbscan.json @@ -0,0 +1,32 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/dbscan.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd dbscan parameters": { + "algorithm": { + "estimator": "DBSCAN", + "estimator_methods": { + "training": "fit" + } + }, + "data": { + "dtype": "float64" + } + }, + "synthetic dataset": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 10, "centers": 10 }, "algorithm": { "eps": 5, "min_samples": 5 } } + ] + } + }, + "TEMPLATES": { + "dbscan": { + "SETS": [ + "common dbscan parameters", + "synthetic dataset", + "sklearnex spmd implementation", + "large scale default parameters", + "spmd dbscan parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/kmeans.json b/configs/spmd/large_scale/kmeans.json new file mode 100644 index 00000000..3b490f14 --- /dev/null +++ b/configs/spmd/large_scale/kmeans.json @@ -0,0 +1,32 @@ +{ + "INCLUDE": ["../../common/sklearn.json", 
"../../regular/kmeans.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd kmeans parameters": { + "algorithm": { + "estimator": "KMeans", + "estimator_params": { + "algorithm": "lloyd" + }, + "estimator_methods": { "training": "fit" } + }, + "bench": { + "mpi_params": {"n": 48} + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } } + ] + } + }, + "TEMPLATES": { + "kmeans": { + "SETS": [ + "synthetic data", + "sklearnex spmd implementation", + "large scale default parameters", + "spmd kmeans parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/knn.json b/configs/spmd/large_scale/knn.json new file mode 100644 index 00000000..8b82094d --- /dev/null +++ b/configs/spmd/large_scale/knn.json @@ -0,0 +1,43 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/knn.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd knn cls parameters": { + "algorithm": { + "estimator": "KNeighborsClassifier", + "estimator_params": { + "algorithm": "brute", + "metric": "minkowski", + "p": 2, + "weights": "uniform", + "n_neighbors": 5 + }, + "estimator_methods": { + "training": "fit", + "inference": "predict" + } + }, + "bench": { + "mpi_params": {} + } + }, + "synthetic classification data": { + "data": [ + { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, + { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, + { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, + { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 1000000, "n_features": 10, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + ] + } + }, + "TEMPLATES": { + "knn classifier": { + "SETS": [ + "common knn parameters", + "synthetic classification data", + "sklearnex spmd implementation", + "large scale 2k parameters", + "spmd knn cls parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json new file mode 100644 index 00000000..4b39d5e2 --- /dev/null +++ b/configs/spmd/large_scale/large_scale.json @@ -0,0 +1,31 @@ +{ + "PARAMETERS_SETS": { + "large scale default parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [1,2,3,4,5,6,7,8,9,10,11,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale 2k parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", 
"-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale impi parameters": { + "data": { + "dtype": "float64", + "distributed_split": "no" + }, + "bench": { + "mpi_params": {"n": [1,2,4,6,12,24], "ppn": 12} + } + } + } +} diff --git a/configs/spmd/large_scale/linear_model.json b/configs/spmd/large_scale/linear_model.json new file mode 100644 index 00000000..4c861caa --- /dev/null +++ b/configs/spmd/large_scale/linear_model.json @@ -0,0 +1,27 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/linear_model.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd linear parameters": { + "algorithm": { + "estimator": "LinearRegression", + "estimator_methods": { "training": "fit" } + } + }, + "synthetic data": { + "data": [ + { "source": "make_regression", "generation_kwargs": { "n_samples": 30005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 30000000, "test_size": 5000 } }, + { "source": "make_regression", "generation_kwargs": { "n_samples": 305000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 300000, "test_size": 5000 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale default parameters", + "synthetic data", + "spmd linear parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/logreg.json b/configs/spmd/large_scale/logreg.json new file mode 100644 index 00000000..c5ef6203 --- /dev/null +++ b/configs/spmd/large_scale/logreg.json @@ -0,0 +1,29 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/logreg.json", "../logreg.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd logreg2 parameters": { + "algorithm":{ + "estimator": "LogisticRegression", + "estimator_methods": { "inference": "predict" }, + "estimator_params": { "max_iter": 20 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 10, "n_classes": 2 } }, + { "source": "make_classification", "split_kwargs": { "train_size": 100000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 101000, "n_features": 1000, "n_classes": 2 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale 2k parameters", + "spmd logreg parameters", + "synthetic data", + "spmd logreg2 parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/pca.json b/configs/spmd/large_scale/pca.json new file mode 100644 index 00000000..35c1942a --- /dev/null +++ b/configs/spmd/large_scale/pca.json @@ -0,0 +1,30 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/pca.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd pca parameters": { + "algorithm": { + "estimator": "PCA", + "estimator_methods": { "training": "fit", "inference": "" } + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale default parameters", + "synthetic data", + "spmd pca parameters" + ] + 
} + } +} diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index 3f8b1641..cf977ad8 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -134,6 +134,9 @@ def get_subset_metrics_of_estimator( and isinstance(iterations[0], Union[Numeric, NumpyNumeric].__args__) ): metrics.update({"iterations": int(iterations[0])}) + if hasattr(estimator_instance, "_n_inner_iter"): + inner_iters = estimator_instance._n_inner_iter + metrics.update({"inner_iters": int(inner_iters)}) if task == "classification": y_pred = convert_to_numpy(estimator_instance.predict(x)) metrics.update( @@ -142,7 +145,7 @@ def get_subset_metrics_of_estimator( "balanced accuracy": float(balanced_accuracy_score(y_compat, y_pred)), } ) - if hasattr(estimator_instance, "predict_proba") and not ( + '''if hasattr(estimator_instance, "predict_proba") and not ( hasattr(estimator_instance, "probability") and getattr(estimator_instance, "probability") == False ): @@ -162,7 +165,7 @@ def get_subset_metrics_of_estimator( ), "logloss": float(log_loss(y_compat, y_pred_proba)), } - ) + )''' elif task == "regression": y_pred = convert_to_numpy(estimator_instance.predict(x)) metrics.update( @@ -454,7 +457,6 @@ def measure_sklearn_estimator( estimator_instance.get_booster() ) method_instance = getattr(daal_model, method) - metrics[method] = dict() ( metrics[method]["time[ms]"], diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index d2e63e9e..1ac7d7bc 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -109,7 +109,8 @@ def split_and_transform_data(bench_case, data, data_description): y_train, y_test = None, None distributed_split = get_bench_case_value(bench_case, "data:distributed_split", None) - if distributed_split == "rank_based": + knn_split_train = "KNeighbors" in get_bench_case_value(bench_case, "algorithm:estimator", "") and int(get_bench_case_value(bench_case, "bench:mpi_params:n", 1)) > 1 + if distributed_split == "rank_based" or knn_split_train: from mpi4py import MPI comm = MPI.COMM_WORLD @@ -129,10 +130,12 @@ def split_and_transform_data(bench_case, data, data_description): x_train[train_start:train_end], y_train[train_start:train_end], ) - x_test, y_test = x_test[test_start:test_end], y_test[test_start:test_end] + if distributed_split == "rank_based": + x_test, y_test = x_test[test_start:test_end], y_test[test_start:test_end] else: x_train = x_train[train_start:train_end] - x_test = x_test[test_start:test_end] + if distributed_split == "rank_based": + x_test = x_test[test_start:test_end] device = get_bench_case_value(bench_case, "algorithm:device", None) common_data_format = get_bench_case_value(bench_case, "data:format", "pandas") diff --git a/sklbench/runner/commands_helper.py b/sklbench/runner/commands_helper.py index b66da011..a63686c6 100644 --- a/sklbench/runner/commands_helper.py +++ b/sklbench/runner/commands_helper.py @@ -45,6 +45,9 @@ def generate_benchmark_command( mpi_prefix = "mpirun" for mpi_param_name, mpi_param_value in mpi_params.items(): mpi_prefix += f" -{mpi_param_name} {mpi_param_value}" + if mpi_param_name == "-hostfile": + import os + mpi_prefix += os.environ.get("PBS_NODEFILE") command_prefix = f"{mpi_prefix} {command_prefix}" # 3. 
Intel(R) VTune* profiling command prefix vtune_profiling = get_bench_case_value(bench_case, "bench:vtune_profiling") From 8d74f6d3b6f4514e1a3ecc165d900b4c7928f70e Mon Sep 17 00:00:00 2001 From: ethanglaser Date: Wed, 18 Sep 2024 18:01:07 +0000 Subject: [PATCH 027/110] strong scaling, config updates, minor revisions --- configs/spmd/large_scale/basic_stats.json | 4 +-- .../spmd/large_scale/basic_stats_strong.json | 29 +++++++++++++++++ configs/spmd/large_scale/covariance.json | 2 +- .../spmd/large_scale/covariance_strong.json | 29 +++++++++++++++++ configs/spmd/large_scale/kmeans.json | 11 +++---- configs/spmd/large_scale/knn.json | 5 +-- configs/spmd/large_scale/large_scale.json | 24 ++++++++++++-- configs/spmd/large_scale/linear_model.json | 2 +- .../spmd/large_scale/linear_model_strong.json | 26 ++++++++++++++++ configs/spmd/large_scale/logreg_strong.json | 28 +++++++++++++++++ configs/spmd/large_scale/pca.json | 2 +- configs/spmd/large_scale/pca_strong.json | 29 +++++++++++++++++ sklbench/benchmarks/sklearn_estimator.py | 4 ++- sklbench/utils/measurement.py | 31 ++++++++++++++----- 14 files changed, 200 insertions(+), 26 deletions(-) create mode 100644 configs/spmd/large_scale/basic_stats_strong.json create mode 100644 configs/spmd/large_scale/covariance_strong.json create mode 100644 configs/spmd/large_scale/linear_model_strong.json create mode 100644 configs/spmd/large_scale/logreg_strong.json create mode 100644 configs/spmd/large_scale/pca_strong.json diff --git a/configs/spmd/large_scale/basic_stats.json b/configs/spmd/large_scale/basic_stats.json index a9542017..9ac4725f 100644 --- a/configs/spmd/large_scale/basic_stats.json +++ b/configs/spmd/large_scale/basic_stats.json @@ -4,7 +4,7 @@ "spmd basicstats parameters": { "algorithm": { "estimator": "BasicStatistics", - "estimator_methods": { "training": "compute" } + "estimator_methods": { "training": "fit" } }, "data": { "split_kwargs": { "test_size": 0.0001 } @@ -21,7 +21,7 @@ "basicstats": { "SETS": [ "sklearnex spmd implementation", - "large scale default parameters", + "large scale 2k parameters", "synthetic data", "spmd basicstats parameters" ] diff --git a/configs/spmd/large_scale/basic_stats_strong.json b/configs/spmd/large_scale/basic_stats_strong.json new file mode 100644 index 00000000..b7aa22cb --- /dev/null +++ b/configs/spmd/large_scale/basic_stats_strong.json @@ -0,0 +1,29 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd basicstats parameters": { + "algorithm": { + "estimator": "BasicStatistics", + "estimator_methods": { "training": "fit" } + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 25000000, "n_features": 100, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "basicstats": { + "SETS": [ + "sklearnex spmd implementation", + "large scale strong 2k parameters", + "synthetic data", + "spmd basicstats parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/covariance.json b/configs/spmd/large_scale/covariance.json index 3280bf77..260befd0 100644 --- a/configs/spmd/large_scale/covariance.json +++ b/configs/spmd/large_scale/covariance.json @@ -21,7 +21,7 @@ "covariance": { "SETS": [ "sklearnex spmd implementation", - "large scale default parameters", + "large scale 2k parameters", "synthetic data", "spmd basicstats parameters" ] diff --git a/configs/spmd/large_scale/covariance_strong.json 
b/configs/spmd/large_scale/covariance_strong.json new file mode 100644 index 00000000..568b4a8f --- /dev/null +++ b/configs/spmd/large_scale/covariance_strong.json @@ -0,0 +1,29 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd basicstats parameters": { + "algorithm": { + "estimator": "EmpiricalCovariance", + "estimator_methods": { "training": "fit" } + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 25000000, "n_features": 100, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "covariance": { + "SETS": [ + "sklearnex spmd implementation", + "large scale strong 2k parameters", + "synthetic data", + "spmd basicstats parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/kmeans.json b/configs/spmd/large_scale/kmeans.json index 3b490f14..89524965 100644 --- a/configs/spmd/large_scale/kmeans.json +++ b/configs/spmd/large_scale/kmeans.json @@ -7,15 +7,14 @@ "estimator_params": { "algorithm": "lloyd" }, - "estimator_methods": { "training": "fit" } - }, - "bench": { - "mpi_params": {"n": 48} + "estimator_methods": { "training": "fit", "inference": "predict" } } }, "synthetic data": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } } + { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 30000, "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 100, "centers": 100 }, "algorithm": { "n_clusters": 100, "max_iter": 100 } } ] } }, @@ -24,7 +23,7 @@ "SETS": [ "synthetic data", "sklearnex spmd implementation", - "large scale default parameters", + "large scale 2k parameters", "spmd kmeans parameters" ] } diff --git a/configs/spmd/large_scale/knn.json b/configs/spmd/large_scale/knn.json index 8b82094d..e979e2aa 100644 --- a/configs/spmd/large_scale/knn.json +++ b/configs/spmd/large_scale/knn.json @@ -15,9 +15,6 @@ "training": "fit", "inference": "predict" } - }, - "bench": { - "mpi_params": {} } }, "synthetic classification data": { @@ -35,7 +32,7 @@ "common knn parameters", "synthetic classification data", "sklearnex spmd implementation", - "large scale 2k parameters", + "large scale default parameters", "spmd knn cls parameters" ] } diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json index 4b39d5e2..72b808fe 100644 --- a/configs/spmd/large_scale/large_scale.json +++ b/configs/spmd/large_scale/large_scale.json @@ -6,7 +6,16 @@ "distributed_split": "None" }, "bench": { - "mpi_params": {"n": [1,2,3,4,5,6,7,8,9,10,11,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + "mpi_params": {"n": [1,2,6,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + 
"large scale strong parameters": { + "data": { + "dtype": "float64", + "distributed_split": "rank_based" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, "large scale 2k parameters": { @@ -15,7 +24,16 @@ "distributed_split": "None" }, "bench": { - "mpi_params": {"n": [192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale strong 2k parameters": { + "data": { + "dtype": "float64", + "distributed_split": "rank_based" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, "large scale impi parameters": { @@ -24,7 +42,7 @@ "distributed_split": "no" }, "bench": { - "mpi_params": {"n": [1,2,4,6,12,24], "ppn": 12} + "mpi_params": {"n": [1,2,4,6,9,12], "ppn": 12} } } } diff --git a/configs/spmd/large_scale/linear_model.json b/configs/spmd/large_scale/linear_model.json index 4c861caa..aeda4441 100644 --- a/configs/spmd/large_scale/linear_model.json +++ b/configs/spmd/large_scale/linear_model.json @@ -18,7 +18,7 @@ "linreg": { "SETS": [ "sklearnex spmd implementation", - "large scale default parameters", + "large scale 2k parameters", "synthetic data", "spmd linear parameters" ] diff --git a/configs/spmd/large_scale/linear_model_strong.json b/configs/spmd/large_scale/linear_model_strong.json new file mode 100644 index 00000000..77a9c79e --- /dev/null +++ b/configs/spmd/large_scale/linear_model_strong.json @@ -0,0 +1,26 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/linear_model.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd linear parameters": { + "algorithm": { + "estimator": "LinearRegression", + "estimator_methods": { "training": "fit" } + } + }, + "synthetic data": { + "data": [ + { "source": "make_regression", "generation_kwargs": { "n_samples": 25005000, "n_features": 100, "noise": 1.25 }, "split_kwargs": { "train_size": 25000000, "test_size": 5000 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale strong 2k parameters", + "synthetic data", + "spmd linear parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/logreg_strong.json b/configs/spmd/large_scale/logreg_strong.json new file mode 100644 index 00000000..2bf1c0f9 --- /dev/null +++ b/configs/spmd/large_scale/logreg_strong.json @@ -0,0 +1,28 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/logreg.json", "../logreg.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd logreg2 parameters": { + "algorithm":{ + "estimator": 
"LogisticRegression", + "estimator_methods": { "inference": "predict" }, + "estimator_params": { "max_iter": 30 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_classification", "split_kwargs": { "train_size": 10000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 10001000, "n_features": 100, "n_classes": 2 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale strong 2k parameters", + "spmd logreg parameters", + "synthetic data", + "spmd logreg2 parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/pca.json b/configs/spmd/large_scale/pca.json index 35c1942a..9a6a6b02 100644 --- a/configs/spmd/large_scale/pca.json +++ b/configs/spmd/large_scale/pca.json @@ -21,7 +21,7 @@ "linreg": { "SETS": [ "sklearnex spmd implementation", - "large scale default parameters", + "large scale 2k parameters", "synthetic data", "spmd pca parameters" ] diff --git a/configs/spmd/large_scale/pca_strong.json b/configs/spmd/large_scale/pca_strong.json new file mode 100644 index 00000000..adee3c79 --- /dev/null +++ b/configs/spmd/large_scale/pca_strong.json @@ -0,0 +1,29 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/pca.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd pca parameters": { + "algorithm": { + "estimator": "PCA", + "estimator_methods": { "training": "fit", "inference": "" } + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 25000000, "n_features": 100, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale strong 2k parameters", + "synthetic data", + "spmd pca parameters" + ] + } + } +} diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index cf977ad8..0fc4874e 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -461,7 +461,9 @@ def measure_sklearn_estimator( ( metrics[method]["time[ms]"], metrics[method]["time std[ms]"], - _, + metrics[method]["first iter[ms]"], + metrics[method]["box filter mean[ms]"], + metrics[method]["box filter std[ms]"] ) = measure_case(bench_case, method_instance, *data_args) if ensure_sklearnex_patching: full_method_name = f"{estimator_class.__name__}.{method}" diff --git a/sklbench/utils/measurement.py b/sklbench/utils/measurement.py index 989daefd..df74e8da 100644 --- a/sklbench/utils/measurement.py +++ b/sklbench/utils/measurement.py @@ -40,6 +40,22 @@ def box_filter(timing, left=0.2, right=0.8): return np.mean(result) * 1000, np.std(result) * 1000 +def large_scale_measurements(timing): + first_iter = timing[0] * 1000 + mean = np.mean(timing[1:]) * 1000 + stdev = np.std(timing[1:]) * 1000 + timing_sorted = np.sort(timing) + Q1, Q3 = np.percentile(timing_sorted, [25, 75]) + IQ = Q3 - Q1 + lower, upper = Q1 - 1.5 * IQ, Q3 + 1.5 * IQ + + filtered_times = timing_sorted[(timing_sorted >= lower) & (timing_sorted <= upper)] + + box_filter_mean = np.mean(filtered_times) * 1000 if filtered_times.size > 0 else 0 + box_filter_stdev = np.std(filtered_times) * 1000 if filtered_times.size > 0 else 0 + return mean, stdev, first_iter, box_filter_mean, box_filter_stdev + + def measure_time( func, *args, @@ -72,13 +88,14 @@ def measure_time( f"exceeded time limit ({time_limit} seconds)" ) break - mean, std = box_filter(times) - if std / mean > std_mean_ratio: - logger.warning( - f'Measured "std / 
mean" time ratio of "{str(func)}" function is higher ' - f"than threshold ({round(std / mean, 3)} vs. {std_mean_ratio})" - ) - return mean, std, func_return_value + logger.debug(times) + #mean, std = box_filter(times) + #if std / mean > std_mean_ratio: + # logger.warning( + # f'Measured "std / mean" time ratio of "{str(func)}" function is higher ' + # f"than threshold ({round(std / mean, 3)} vs. {std_mean_ratio})" + # ) + return large_scale_measurements(times) # wrapper to get measurement params from benchmarking case From 192744f91ee7199d98986f8ec953def96c0153eb Mon Sep 17 00:00:00 2001 From: ethanglaser Date: Sat, 21 Sep 2024 05:25:44 +0000 Subject: [PATCH 028/110] knn and forest config updates --- configs/spmd/large_scale/forest.json | 26 +++++++++++++++++++++ configs/spmd/large_scale/forest_strong.json | 25 ++++++++++++++++++++ configs/spmd/large_scale/knn.json | 4 ++-- sklbench/benchmarks/sklearn_estimator.py | 4 ++-- sklbench/datasets/transformer.py | 2 +- 5 files changed, 56 insertions(+), 5 deletions(-) create mode 100644 configs/spmd/large_scale/forest.json create mode 100644 configs/spmd/large_scale/forest_strong.json diff --git a/configs/spmd/large_scale/forest.json b/configs/spmd/large_scale/forest.json new file mode 100644 index 00000000..ee614ed3 --- /dev/null +++ b/configs/spmd/large_scale/forest.json @@ -0,0 +1,26 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd forest classification parameters": { + "algorithm": { + "estimator": "RandomForestClassifier" + } + }, + "synthetic data": { + "data": [ + { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 501000, "n_features": 10, "n_classes": 2 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } } }, + { "source": "make_classification", "split_kwargs": { "train_size": 10000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 11000, "n_features": 1000, "n_classes": 2 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } } } + ] + } + }, + "TEMPLATES": { + "basicstats": { + "SETS": [ + "sklearnex spmd implementation", + "large scale 2k parameters", + "synthetic data", + "spmd forest classification parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/forest_strong.json b/configs/spmd/large_scale/forest_strong.json new file mode 100644 index 00000000..121aa916 --- /dev/null +++ b/configs/spmd/large_scale/forest_strong.json @@ -0,0 +1,25 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd forest classification parameters": { + "algorithm": { + "estimator": "RandomForestClassifier" + } + }, + "synthetic data": { + "data": [ + { "source": "make_classification", "split_kwargs": { "train_size": 10000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 10001000, "n_features": 100, "n_classes": 2 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } } } + ] + } + }, + "TEMPLATES": { + "basicstats": { + "SETS": [ + "sklearnex spmd implementation", + "large scale strong 2k parameters", + "synthetic data", + "spmd forest classification parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/knn.json b/configs/spmd/large_scale/knn.json index e979e2aa..1ef849f1 100644 --- a/configs/spmd/large_scale/knn.json +++ b/configs/spmd/large_scale/knn.json @@ -22,7 +22,7 @@ { "source": 
"make_classification", "split_kwargs": { "train_size": 500000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, - { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 1000000, "n_features": 10, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + { "source": "make_classification", "split_kwargs": { "train_size": 200000, "test_size": 200000 }, "generation_kwargs": { "n_samples": 400000, "n_features": 10, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } ] } }, @@ -32,7 +32,7 @@ "common knn parameters", "synthetic classification data", "sklearnex spmd implementation", - "large scale default parameters", + "large scale 2k parameters", "spmd knn cls parameters" ] } diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index 0fc4874e..296a5e17 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -546,8 +546,8 @@ def main(bench_case: BenchCase, filters: List[BenchCase]): result_template = enrich_result(result_template, bench_case) if "assume_finite" in context_params: result_template["assume_finite"] = context_params["assume_finite"] - if hasattr(estimator_instance, "get_params"): - estimator_params = estimator_instance.get_params() + #if hasattr(estimator_instance, "get_params"): + # estimator_params = estimator_instance.get_params() # note: "handle" is not JSON-serializable if "handle" in estimator_params: del estimator_params["handle"] diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 1ac7d7bc..55cfc245 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -181,7 +181,7 @@ def split_and_transform_data(bench_case, data, data_description): "format": data_format, "order": data_order, "dtype": data_dtype, - "samples": converted_data.shape[0], + "samples (per rank)": converted_data.shape[0], } if len(converted_data.shape) == 2 and converted_data.shape[1] > 1: data_description[subset_name]["features"] = converted_data.shape[1] From b1f2c1589cf8cae48b416ab70268643d9f0a2d6c Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 23 Sep 2024 04:53:12 -0700 Subject: [PATCH 029/110] lint --- sklbench/benchmarks/sklearn_estimator.py | 8 ++++---- sklbench/datasets/transformer.py | 5 ++++- sklbench/runner/commands_helper.py | 1 + sklbench/utils/measurement.py | 8 ++++---- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index 296a5e17..b4d4f3ee 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -145,7 +145,7 @@ def get_subset_metrics_of_estimator( "balanced accuracy": float(balanced_accuracy_score(y_compat, y_pred)), } ) - '''if hasattr(estimator_instance, "predict_proba") and not ( + """if hasattr(estimator_instance, "predict_proba") and not ( hasattr(estimator_instance, 
"probability") and getattr(estimator_instance, "probability") == False ): @@ -165,7 +165,7 @@ def get_subset_metrics_of_estimator( ), "logloss": float(log_loss(y_compat, y_pred_proba)), } - )''' + )""" elif task == "regression": y_pred = convert_to_numpy(estimator_instance.predict(x)) metrics.update( @@ -463,7 +463,7 @@ def measure_sklearn_estimator( metrics[method]["time std[ms]"], metrics[method]["first iter[ms]"], metrics[method]["box filter mean[ms]"], - metrics[method]["box filter std[ms]"] + metrics[method]["box filter std[ms]"], ) = measure_case(bench_case, method_instance, *data_args) if ensure_sklearnex_patching: full_method_name = f"{estimator_class.__name__}.{method}" @@ -546,7 +546,7 @@ def main(bench_case: BenchCase, filters: List[BenchCase]): result_template = enrich_result(result_template, bench_case) if "assume_finite" in context_params: result_template["assume_finite"] = context_params["assume_finite"] - #if hasattr(estimator_instance, "get_params"): + # if hasattr(estimator_instance, "get_params"): # estimator_params = estimator_instance.get_params() # note: "handle" is not JSON-serializable if "handle" in estimator_params: diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 55cfc245..86944ead 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -109,7 +109,10 @@ def split_and_transform_data(bench_case, data, data_description): y_train, y_test = None, None distributed_split = get_bench_case_value(bench_case, "data:distributed_split", None) - knn_split_train = "KNeighbors" in get_bench_case_value(bench_case, "algorithm:estimator", "") and int(get_bench_case_value(bench_case, "bench:mpi_params:n", 1)) > 1 + knn_split_train = ( + "KNeighbors" in get_bench_case_value(bench_case, "algorithm:estimator", "") + and int(get_bench_case_value(bench_case, "bench:mpi_params:n", 1)) > 1 + ) if distributed_split == "rank_based" or knn_split_train: from mpi4py import MPI diff --git a/sklbench/runner/commands_helper.py b/sklbench/runner/commands_helper.py index a63686c6..2441085a 100644 --- a/sklbench/runner/commands_helper.py +++ b/sklbench/runner/commands_helper.py @@ -47,6 +47,7 @@ def generate_benchmark_command( mpi_prefix += f" -{mpi_param_name} {mpi_param_value}" if mpi_param_name == "-hostfile": import os + mpi_prefix += os.environ.get("PBS_NODEFILE") command_prefix = f"{mpi_prefix} {command_prefix}" # 3. Intel(R) VTune* profiling command prefix diff --git a/sklbench/utils/measurement.py b/sklbench/utils/measurement.py index df74e8da..7495e258 100644 --- a/sklbench/utils/measurement.py +++ b/sklbench/utils/measurement.py @@ -48,9 +48,9 @@ def large_scale_measurements(timing): Q1, Q3 = np.percentile(timing_sorted, [25, 75]) IQ = Q3 - Q1 lower, upper = Q1 - 1.5 * IQ, Q3 + 1.5 * IQ - + filtered_times = timing_sorted[(timing_sorted >= lower) & (timing_sorted <= upper)] - + box_filter_mean = np.mean(filtered_times) * 1000 if filtered_times.size > 0 else 0 box_filter_stdev = np.std(filtered_times) * 1000 if filtered_times.size > 0 else 0 return mean, stdev, first_iter, box_filter_mean, box_filter_stdev @@ -89,8 +89,8 @@ def measure_time( ) break logger.debug(times) - #mean, std = box_filter(times) - #if std / mean > std_mean_ratio: + # mean, std = box_filter(times) + # if std / mean > std_mean_ratio: # logger.warning( # f'Measured "std / mean" time ratio of "{str(func)}" function is higher ' # f"than threshold ({round(std / mean, 3)} vs. 
{std_mean_ratio})" From f3be7377caa262f6240a6414cff73de1d3a94c18 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 23 Sep 2024 06:28:03 -0700 Subject: [PATCH 030/110] just gpu for regular --- configs/common/sklearn.json | 5 +++++ configs/regular/dbscan.json | 2 +- configs/regular/ensemble.json | 4 ++-- configs/regular/kmeans.json | 2 +- configs/regular/knn.json | 20 ++------------------ configs/regular/linear_model.json | 24 +----------------------- configs/regular/logreg.json | 2 +- configs/regular/pca.json | 2 +- 8 files changed, 14 insertions(+), 47 deletions(-) diff --git a/configs/common/sklearn.json b/configs/common/sklearn.json index d7b13188..43051093 100644 --- a/configs/common/sklearn.json +++ b/configs/common/sklearn.json @@ -12,6 +12,11 @@ { "library": "sklearnex", "device": ["cpu", "gpu"] } ] }, + "sklearn-ex[gpu] implementations": { + "algorithm": [ + { "library": "sklearnex", "device": ["gpu"] } + ] + }, "sklearn-ex[preview] implementations": { "algorithm": [ { "library": "sklearn", "device": "cpu" }, diff --git a/configs/regular/dbscan.json b/configs/regular/dbscan.json index 71dcdc9b..1d0d732b 100644 --- a/configs/regular/dbscan.json +++ b/configs/regular/dbscan.json @@ -58,7 +58,7 @@ "TEMPLATES": { "sklearn dbscan": { "SETS": [ - "sklearn-ex[cpu,gpu] implementations", + "sklearn-ex[gpu] implementations", "common dbscan parameters", "sklearn dbscan parameters", "dbscan datasets" diff --git a/configs/regular/ensemble.json b/configs/regular/ensemble.json index 56e37e77..164cb236 100644 --- a/configs/regular/ensemble.json +++ b/configs/regular/ensemble.json @@ -90,7 +90,7 @@ "TEMPLATES": { "sklearn ensemble classification": { "SETS": [ - "sklearn-ex[cpu,gpu] implementations", + "sklearn-ex[gpu] implementations", "common ensemble params", "sklearn ensemble classifier params", "ensemble classification data" @@ -98,7 +98,7 @@ }, "sklearn ensemble regression": { "SETS": [ - "sklearn-ex[cpu,gpu] implementations", + "sklearn-ex[gpu] implementations", "common ensemble params", "sklearn ensemble regressor params", "ensemble regression data" diff --git a/configs/regular/kmeans.json b/configs/regular/kmeans.json index d4953615..8aba9055 100644 --- a/configs/regular/kmeans.json +++ b/configs/regular/kmeans.json @@ -70,7 +70,7 @@ "TEMPLATES": { "sklearn kmeans": { "SETS": [ - "sklearn-ex[preview] implementations", + "sklearn-ex[gpu] implementations", "common kmeans parameters", "sklearn kmeans parameters", "kmeans datasets" diff --git a/configs/regular/knn.json b/configs/regular/knn.json index e1cd8a75..bcbed117 100644 --- a/configs/regular/knn.json +++ b/configs/regular/knn.json @@ -74,36 +74,20 @@ "TEMPLATES": { "sklearn brute knn clsf": { "SETS": [ - "sklearn-ex[cpu,gpu] implementations", + "sklearn-ex[gpu] implementations", "common knn parameters", "sklearn knn parameters", "brute knn algorithm - classification data" ] }, - "sklearn kd_tree knn clsf": { - "SETS": [ - "sklearn-ex[cpu] implementations", - "common knn parameters", - "sklearn knn parameters", - "kd_tree knn algorithm - classification data" - ] - }, "sklearn brute knn regr": { "SETS": [ - "sklearn-ex[cpu,gpu] implementations", + "sklearn-ex[gpu] implementations", "common knn parameters", "sklearn knn parameters", "brute knn algorithm - regression data" ] }, - "sklearn kd_tree knn regr": { - "SETS": [ - "sklearn-ex[cpu] implementations", - "common knn parameters", - "sklearn knn parameters", - "kd_tree knn algorithm - regression data" - ] - }, "cuml brute knn clsf": { "SETS": [ "cuml implementation", diff --git 
a/configs/regular/linear_model.json b/configs/regular/linear_model.json index eb1b79ba..66667343 100644 --- a/configs/regular/linear_model.json +++ b/configs/regular/linear_model.json @@ -85,34 +85,12 @@ "TEMPLATES": { "sklearn linear": { "SETS": [ - "sklearn-ex[cpu,gpu] implementations", + "sklearn-ex[gpu] implementations", "common linear parameters", "sklearn linear parameters", "regression datasets" ] }, - "sklearn ridge": { - "SETS": [ - "sklearn-ex[cpu] implementations", - "common ridge parameters", - "sklearn ridge parameters", - "regression datasets" - ] - }, - "sklearn lasso": { - "SETS": [ - "sklearn-ex[cpu] implementations", - "common lasso parameters", - "regression datasets" - ] - }, - "sklearn elasticnet": { - "SETS": [ - "sklearn-ex[cpu] implementations", - "common elasticnet parameters", - "regression datasets" - ] - }, "cuml linear": { "SETS": [ "cuml implementation", diff --git a/configs/regular/logreg.json b/configs/regular/logreg.json index a94a7fcf..172ceb48 100644 --- a/configs/regular/logreg.json +++ b/configs/regular/logreg.json @@ -54,7 +54,7 @@ "TEMPLATES": { "sklearn logreg": { "SETS": [ - "sklearn-ex[cpu,gpu] implementations", + "sklearn-ex[gpu] implementations", "common logreg parameters", "sklearn logreg parameters", "logreg datasets" diff --git a/configs/regular/pca.json b/configs/regular/pca.json index 582acc9e..2300454d 100644 --- a/configs/regular/pca.json +++ b/configs/regular/pca.json @@ -46,7 +46,7 @@ "TEMPLATES": { "sklearn pca": { "SETS": [ - "sklearn-ex[cpu,gpu] implementations", + "sklearn-ex[gpu] implementations", "pca parameters", "pca datasets" ] From ee8c74b5ef53b450876ae6b00d926f8377692038 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 23 Sep 2024 07:10:55 -0700 Subject: [PATCH 031/110] juremove cuml --- configs/regular/dbscan.json | 8 -------- configs/regular/ensemble.json | 16 ---------------- configs/regular/kmeans.json | 8 -------- configs/regular/knn.json | 14 -------------- configs/regular/linear_model.json | 24 ------------------------ configs/regular/logreg.json | 8 -------- configs/regular/pca.json | 7 ------- 7 files changed, 85 deletions(-) diff --git a/configs/regular/dbscan.json b/configs/regular/dbscan.json index 1d0d732b..711c15cd 100644 --- a/configs/regular/dbscan.json +++ b/configs/regular/dbscan.json @@ -63,14 +63,6 @@ "sklearn dbscan parameters", "dbscan datasets" ] - }, - "cuml dbscan": { - "SETS": [ - "cuml implementation", - "common dbscan parameters", - "cuml dbscan parameters", - "dbscan datasets" - ] } } } diff --git a/configs/regular/ensemble.json b/configs/regular/ensemble.json index 164cb236..f01c1383 100644 --- a/configs/regular/ensemble.json +++ b/configs/regular/ensemble.json @@ -103,22 +103,6 @@ "sklearn ensemble regressor params", "ensemble regression data" ] - }, - "cuml ensemble classification": { - "SETS": [ - "cuml implementation", - "common ensemble params", - "cuml ensemble classifier params", - "ensemble classification data" - ] - }, - "cuml ensemble regression": { - "SETS": [ - "cuml implementation", - "common ensemble params", - "cuml ensemble regressor params", - "ensemble regression data" - ] } } } diff --git a/configs/regular/kmeans.json b/configs/regular/kmeans.json index 8aba9055..756e2bab 100644 --- a/configs/regular/kmeans.json +++ b/configs/regular/kmeans.json @@ -75,14 +75,6 @@ "sklearn kmeans parameters", "kmeans datasets" ] - }, - "cuml kmeans": { - "SETS": [ - "cuml implementation", - "common kmeans parameters", - "cuml kmeans parameters", - "kmeans datasets" - ] } } } diff --git 
a/configs/regular/knn.json b/configs/regular/knn.json index bcbed117..a69c6864 100644 --- a/configs/regular/knn.json +++ b/configs/regular/knn.json @@ -87,20 +87,6 @@ "sklearn knn parameters", "brute knn algorithm - regression data" ] - }, - "cuml brute knn clsf": { - "SETS": [ - "cuml implementation", - "common knn parameters", - "brute knn algorithm - classification data" - ] - }, - "cuml brute knn regr": { - "SETS": [ - "cuml implementation", - "common knn parameters", - "brute knn algorithm - regression data" - ] } } } diff --git a/configs/regular/linear_model.json b/configs/regular/linear_model.json index 66667343..3040c82d 100644 --- a/configs/regular/linear_model.json +++ b/configs/regular/linear_model.json @@ -98,30 +98,6 @@ "cuml L2 parameters", "regression datasets" ] - }, - "cuml ridge": { - "SETS": [ - "cuml implementation", - "common ridge parameters", - "cuml L2 parameters", - "regression datasets" - ] - }, - "cuml lasso": { - "SETS": [ - "cuml implementation", - "common lasso parameters", - "cuml L1 parameters", - "regression datasets" - ] - }, - "cuml elasticnet": { - "SETS": [ - "cuml implementation", - "common elasticnet parameters", - "cuml L1 parameters", - "regression datasets" - ] } } } diff --git a/configs/regular/logreg.json b/configs/regular/logreg.json index 172ceb48..a8323b02 100644 --- a/configs/regular/logreg.json +++ b/configs/regular/logreg.json @@ -59,14 +59,6 @@ "sklearn logreg parameters", "logreg datasets" ] - }, - "cuml logreg": { - "SETS": [ - "cuml implementation", - "common logreg parameters", - "cuml logreg parameters", - "logreg datasets" - ] } } } diff --git a/configs/regular/pca.json b/configs/regular/pca.json index 2300454d..e26d3f44 100644 --- a/configs/regular/pca.json +++ b/configs/regular/pca.json @@ -50,13 +50,6 @@ "pca parameters", "pca datasets" ] - }, - "cuml pca": { - "SETS": [ - "cuml implementation", - "pca parameters", - "pca datasets" - ] } } } From 93eae2f09c46ce0afa9a1aa62bc9169d8518dacb Mon Sep 17 00:00:00 2001 From: "Kruglov, Oleg" Date: Tue, 24 Sep 2024 05:34:56 -0700 Subject: [PATCH 032/110] Add metrics to list for proper report generation --- sklbench/report/implementation.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklbench/report/implementation.py b/sklbench/report/implementation.py index df15b5eb..f1bda219 100644 --- a/sklbench/report/implementation.py +++ b/sklbench/report/implementation.py @@ -32,6 +32,9 @@ METRICS = { "lower is better": [ "time[ms]", + "first iter[ms]", + "box filter mean[ms]", + "box filter std[ms]", "iterations", # classification "logloss", From 574ff2a35509643d6bae5a457f71e59676c8ef2d Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Wed, 25 Sep 2024 23:54:03 -0700 Subject: [PATCH 033/110] batch for online --- configs/regular/batch_for_online.json | 97 +++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 configs/regular/batch_for_online.json diff --git a/configs/regular/batch_for_online.json b/configs/regular/batch_for_online.json new file mode 100644 index 00000000..8acd604d --- /dev/null +++ b/configs/regular/batch_for_online.json @@ -0,0 +1,97 @@ +{ + "INCLUDE": ["./common/sklearn.json"], + "PARAMETERS_SETS": { + "common": {"bench": {"n_runs": 10, "time_limit": 60}}, + "covariance data": { + "data": [ + { + "source": "make_blobs", + "generation_kwargs": { + "centers": 1, + "n_samples": 1200000, + "n_features": [10, 100] + }, + "split_kwargs": {"ignore": true} + } + ] + }, + "basic_statistics data": { + "data": { + "source": "make_blobs", + 
"generation_kwargs": { + "centers": 1, + "n_samples": 1200000, + "n_features": [10, 100] + }, + "split_kwargs": {"ignore": true} + } + }, + "linear_regression data": { + "data": { + "source": "make_regression", + "split_kwargs": {"train_size": 0.2, "test_size": 0.8}, + "generation_kwargs": { + "n_samples": 1200000, + "n_features": [10, 100], + "n_informative": 5, + "noise": 2.0 + } + } + }, + "pca data": { + "data": { + "source": "make_blobs", + "generation_kwargs": { + "centers": 1, + "n_samples": 1200000, + "n_features": [10, 100] + }, + "split_kwargs": {"ignore": true} + } + }, + "covariance": { + "algorithm": [ + { + "estimator": "EmpiricalCovariance", + "library": "sklearnex.covariance", + "estimator_methods": {"training": "fit"} + } + ] + }, + "basic_statistics": { + "algorithm": [ + { + "estimator": "BasicStatistics", + "library": "sklearnex.basic_statistics", + "estimator_methods": {"training": "fit"} + } + ] + }, + "linear_regression": { + "algorithm": [ + { + "estimator": "LinearRegression", + "library": "sklearnex.linear_model", + "estimator_methods": {"training": "fit"} + } + ] + }, + "pca": { + "algorithm": [ + { + "estimator": "IncrementalPCA", + "library": "sklearnex.decomposition", + "estimator_methods": {"training": "fit"} + } + ] + } + }, + "TEMPLATES": { + "basic_statistics": {"SETS": ["common", "basic_statistics", "basic_statistics data", "sklearn-ex[gpu] implementations"]}, + "covariance": {"SETS": ["common", "covariance", "covariance data", "sklearn-ex[gpu] implementations"]}, + "linear_regression": { + "SETS": ["common", "linear_regression", "linear_regression data", "sklearn-ex[gpu] implementations"] + }, + "pca": {"SETS": ["common", "pca", "pca data", "sklearn-ex[gpu] implementations"]} + } +} \ No newline at end of file From da7f425920cba0701e5bc7b22e4262f6c5da6aac Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 26 Sep 2024 00:11:03 -0700 Subject: [PATCH 034/110] online vs spmd --- configs/spmd/large_scale/large_scale.json | 18 ++++ configs/spmd/large_scale/spmd_for_online.json | 96 +++++++++++++++++++ .../large_scale/spmd_for_online_strong.json | 96 +++++++++++++++++++ 3 files changed, 210 insertions(+) create mode 100644 configs/spmd/large_scale/spmd_for_online.json create mode 100644 configs/spmd/large_scale/spmd_for_online_strong.json diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json index cf81cbf0..6469b8aa 100644 --- a/configs/spmd/large_scale/large_scale.json +++ b/configs/spmd/large_scale/large_scale.json @@ -36,6 +36,24 @@ "mpi_params": {"n": [1,2,3,4,5,6,7,8,9,10,11,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, + "large scale full one node parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale strong full one node parameters": { + "data": { + "dtype": "float64", + "distributed_split": "rank_based" + }, + "bench": { + "mpi_params": {"n": [12], "ppn": 12, "-hostfile": "", 
"-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, "large scale 2k parameters": { "data": { "dtype": "float64", diff --git a/configs/spmd/large_scale/spmd_for_online.json b/configs/spmd/large_scale/spmd_for_online.json new file mode 100644 index 00000000..8e3af579 --- /dev/null +++ b/configs/spmd/large_scale/spmd_for_online.json @@ -0,0 +1,96 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], + "PARAMETERS_SETS": { + "covariance data": { + "data": [ + { + "source": "make_blobs", + "generation_kwargs": { + "centers": 1, + "n_samples": 100000, + "n_features": [10, 100] + }, + "split_kwargs": {"ignore": true} + } + ] + }, + "basic_statistics data": { + "data": { + "source": "make_blobs", + "generation_kwargs": { + "centers": 1, + "n_samples": 100000, + "n_features": [10, 100] + }, + "split_kwargs": {"ignore": true} + } + }, + "linear_regression data": { + "data": { + "source": "make_regression", + "split_kwargs": {"train_size": 0.2, "test_size": 0.8}, + "generation_kwargs": { + "n_samples": 100000, + "n_features": [10, 100], + "n_informative": 5, + "noise": 2.0 + } + } + }, + "pca data": { + "data": { + "source": "make_blobs", + "generation_kwargs": { + "centers": 1, + "n_samples": 100000, + "n_features": [10, 100] + }, + "split_kwargs": {"ignore": true} + } + }, + "covariance": { + "algorithm": [ + { + "estimator": "EmpiricalCovariance", + "library": "sklearnex.covariance", + "estimator_methods": {"training": "fit"} + } + ] + }, + "basic_statistics": { + "algorithm": [ + { + "estimator": "BasicStatistics", + "library": "sklearnex.basic_statistics", + "estimator_methods": {"training": "fit"} + } + ] + }, + "linear_regression": { + "algorithm": [ + { + "estimator": "LinearRegression", + "library": "sklearnex.linear_model", + "estimator_methods": {"training": "fit"} + } + ] + }, + "pca": { + "algorithm": [ + { + "estimator": "IncrementalPCA", + "library": "sklearnex.decomposition", + "estimator_methods": {"training": "fit"} + } + ] + } + }, + "TEMPLATES": { + "basic_statistics": {"SETS": ["basic_statistics", "basic_statistics data", "large scale strong full one node parameters", "sklearnex spmd implementation"]}, + "covariance": {"SETS": ["covariance", "covariance data", "large scale strong full one node parameters", "sklearnex spmd implementation"]}, + "linear_regression": { + "SETS": ["linear_regression", "linear_regression data", "large scale strong full one node parameters", "sklearnex spmd implementation"] + }, + "pca": {"SETS": ["pca", "pca data", "large scale strong full one node parameters", "sklearnex spmd implementation"]} + } +} \ No newline at end of file diff --git a/configs/spmd/large_scale/spmd_for_online_strong.json b/configs/spmd/large_scale/spmd_for_online_strong.json new file mode 100644 index 00000000..abcff3ad --- /dev/null +++ b/configs/spmd/large_scale/spmd_for_online_strong.json @@ -0,0 +1,96 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], + "PARAMETERS_SETS": { + "covariance data": { + "data": [ + { + "source": "make_blobs", + "generation_kwargs": { + "centers": 1, + "n_samples": 1200000, + "n_features": [10, 100] + }, + "split_kwargs": {"ignore": true} + } + ] + }, + "basic_statistics data": { + "data": { + "source": "make_blobs", + "generation_kwargs": { + "centers": 1, + "n_samples": 1200000, + "n_features": [10, 100] + }, + "split_kwargs": {"ignore": 
true} + } + }, + "linear_regression data": { + "data": { + "source": "make_regression", + "split_kwargs": {"train_size": 0.2, "test_size": 0.8}, + "generation_kwargs": { + "n_samples": 1200000, + "n_features": [10, 100], + "n_informative": 5, + "noise": 2.0 + } + } + }, + "pca data": { + "data": { + "source": "make_blobs", + "generation_kwargs": { + "centers": 1, + "n_samples": 1200000, + "n_features": [10, 100] + }, + "split_kwargs": {"ignore": true} + } + }, + "covariance": { + "algorithm": [ + { + "estimator": "EmpiricalCovariance", + "library": "sklearnex.covariance", + "estimator_methods": {"training": "fit"} + } + ] + }, + "basic_statistics": { + "algorithm": [ + { + "estimator": "BasicStatistics", + "library": "sklearnex.basic_statistics", + "estimator_methods": {"training": "fit"} + } + ] + }, + "linear_regression": { + "algorithm": [ + { + "estimator": "LinearRegression", + "library": "sklearnex.linear_model", + "estimator_methods": {"training": "fit"} + } + ] + }, + "pca": { + "algorithm": [ + { + "estimator": "IncrementalPCA", + "library": "sklearnex.decomposition", + "estimator_methods": {"training": "fit"} + } + ] + } + }, + "TEMPLATES": { + "basic_statistics": {"SETS": ["basic_statistics", "basic_statistics data", "large scale strong full one node parameters", "sklearnex spmd implementation"]}, + "covariance": {"SETS": ["covariance", "covariance data", "large scale strong full one node parameters", "sklearnex spmd implementation"]}, + "linear_regression": { + "SETS": ["linear_regression", "linear_regression data", "large scale strong full one node parameters", "sklearnex spmd implementation"] + }, + "pca": {"SETS": ["pca", "pca data", "large scale strong full one node parameters", "sklearnex spmd implementation"]} + } +} \ No newline at end of file From 2377a9e9f803e304cb05696278311bf583d04fcc Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 26 Sep 2024 00:35:30 -0700 Subject: [PATCH 035/110] spmd vs online fix --- configs/spmd/large_scale/spmd_for_online.json | 46 ++----------------- .../large_scale/spmd_for_online_strong.json | 46 ++----------------- 2 files changed, 10 insertions(+), 82 deletions(-) diff --git a/configs/spmd/large_scale/spmd_for_online.json b/configs/spmd/large_scale/spmd_for_online.json index 8e3af579..0a4bc9da 100644 --- a/configs/spmd/large_scale/spmd_for_online.json +++ b/configs/spmd/large_scale/spmd_for_online.json @@ -47,50 +47,14 @@ }, "split_kwargs": {"ignore": true} } - }, - "covariance": { - "algorithm": [ - { - "estimator": "EmpiricalCovariance", - "library": "sklearnex.covariance", - "estimator_methods": {"training": "fit"} - } - ] - }, - "basic_statistics": { - "algorithm": [ - { - "estimator": "BasicStatistics", - "library": "sklearnex.basic_statistics", - "estimator_methods": {"training": "fit"} - } - ] - }, - "linear_regression": { - "algorithm": [ - { - "estimator": "LinearRegression", - "library": "sklearnex.linear_model", - "estimator_methods": {"training": "fit"} - } - ] - }, - "pca": { - "algorithm": [ - { - "estimator": "IncrementalPCA", - "library": "sklearnex.decomposition", - "estimator_methods": {"training": "fit"} - } - ] } }, "TEMPLATES": { - "basic_statistics": {"SETS": ["basic_statistics", "basic_statistics data", "large scale strong full one node parameters", "sklearnex spmd implementation"]}, - "covariance": {"SETS": ["covariance", "covariance data", "large scale strong full one node parameters", "sklearnex spmd implementation"]}, + "basic_statistics": {"SETS": ["basic_statistics data", "spmd default parameters", 
"sklearnex spmd implementation", "large scale full one node parameters"]}, + "covariance": {"SETS": ["covariance data", "spmd default parameters","sklearnex spmd implementation", "large scale full one node parameters"]}, "linear_regression": { - "SETS": ["linear_regression", "linear_regression data", "large scale strong full one node parameters", "sklearnex spmd implementation"] + "SETS": ["linear_regression data", "spmd default parameters", "sklearnex spmd implementation", "large scale full one node parameters"] }, - "pca": {"SETS": ["pca", "pca data", "large scale strong full one node parameters", "sklearnex spmd implementation"]} + "pca": {"SETS": ["pca data", "spmd default parameters", "sklearnex spmd implementation", "large scale full one node parameters"]} } -} \ No newline at end of file +} diff --git a/configs/spmd/large_scale/spmd_for_online_strong.json b/configs/spmd/large_scale/spmd_for_online_strong.json index abcff3ad..152e94f3 100644 --- a/configs/spmd/large_scale/spmd_for_online_strong.json +++ b/configs/spmd/large_scale/spmd_for_online_strong.json @@ -47,50 +47,14 @@ }, "split_kwargs": {"ignore": true} } - }, - "covariance": { - "algorithm": [ - { - "estimator": "EmpiricalCovariance", - "library": "sklearnex.covariance", - "estimator_methods": {"training": "fit"} - } - ] - }, - "basic_statistics": { - "algorithm": [ - { - "estimator": "BasicStatistics", - "library": "sklearnex.basic_statistics", - "estimator_methods": {"training": "fit"} - } - ] - }, - "linear_regression": { - "algorithm": [ - { - "estimator": "LinearRegression", - "library": "sklearnex.linear_model", - "estimator_methods": {"training": "fit"} - } - ] - }, - "pca": { - "algorithm": [ - { - "estimator": "IncrementalPCA", - "library": "sklearnex.decomposition", - "estimator_methods": {"training": "fit"} - } - ] } }, "TEMPLATES": { - "basic_statistics": {"SETS": ["basic_statistics", "basic_statistics data", "large scale strong full one node parameters", "sklearnex spmd implementation"]}, - "covariance": {"SETS": ["covariance", "covariance data", "large scale strong full one node parameters", "sklearnex spmd implementation"]}, + "basic_statistics": {"SETS": ["basic_statistics data", "spmd default parameters", "sklearnex spmd implementation", "large scale strong full one node parameters"]}, + "covariance": {"SETS": ["covariance data", "spmd default parameters","sklearnex spmd implementation", "large scale strong full one node parameters"]}, "linear_regression": { - "SETS": ["linear_regression", "linear_regression data", "large scale strong full one node parameters", "sklearnex spmd implementation"] + "SETS": ["linear_regression data", "spmd default parameters", "sklearnex spmd implementation", "large scale strong full one node parameters"] }, - "pca": {"SETS": ["pca", "pca data", "large scale strong full one node parameters", "sklearnex spmd implementation"]} + "pca": {"SETS": ["pca data", "spmd default parameters", "sklearnex spmd implementation", "large scale strong full one node parameters"]} } -} \ No newline at end of file +} From 3e4333e7271fe0658b91d35913393f6ea9589bd4 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 26 Sep 2024 00:40:07 -0700 Subject: [PATCH 036/110] batch vs online fix --- configs/regular/batch_for_online.json | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/configs/regular/batch_for_online.json b/configs/regular/batch_for_online.json index 8acd604d..5dd0c131 100644 --- a/configs/regular/batch_for_online.json +++ 
b/configs/regular/batch_for_online.json @@ -2,19 +2,6 @@ "INCLUDE": ["./common/sklearn.json"], "PARAMETERS_SETS": { "common": {"bench": {"n_runs": 10, "time_limit": 60}}, - "covariance data": { - "data": [ - { - "source": "make_blobs", - "generation_kwargs": { - "centers": 1, - "n_samples": 1200000, - "n_features": [10, 100] - }, - "split_kwargs": {"ignore": true} - } - ] - }, "basic_statistics data": { "data": { "source": "make_blobs", @@ -49,15 +36,6 @@ "split_kwargs": {"ignore": true} } }, - "covariance": { - "algorithm": [ - { - "estimator": "EmpiricalCovariance", - "library": "sklearnex.covariance", - "estimator_methods": {"training": "fit"} - } - ] - }, "basic_statistics": { "algorithm": [ { @@ -79,7 +57,7 @@ "pca": { "algorithm": [ { - "estimator": "IncrementalPCA", + "estimator": "PCA", "library": "sklearnex.decomposition", "estimator_methods": {"training": "fit"} } @@ -88,7 +66,6 @@ }, "TEMPLATES": { "basic_statistics": {"SETS": ["common", "basic_statistics", "basic_statistics data", "sklearn-ex[gpu] implementations"]}, - "covariance": {"SETS": ["common", "covariance", "covariance data", "sklearn-ex[gpu] implementations"]}, "linear_regression": { "SETS": ["common", "linear_regression", "linear_regression data", "sklearn-ex[gpu] implementations"] }, From 40ad9d51434d1a079ffd1ae6883fe4a4437afdb5 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 26 Sep 2024 01:04:45 -0700 Subject: [PATCH 037/110] increase online data size --- configs/regular/batch_for_online.json | 6 +++--- configs/spmd/large_scale/spmd_for_online.json | 8 ++++---- configs/spmd/large_scale/spmd_for_online_strong.json | 8 ++++---- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/configs/regular/batch_for_online.json b/configs/regular/batch_for_online.json index 5dd0c131..cbcdaa5c 100644 --- a/configs/regular/batch_for_online.json +++ b/configs/regular/batch_for_online.json @@ -7,7 +7,7 @@ "source": "make_blobs", "generation_kwargs": { "centers": 1, - "n_samples": 1200000, + "n_samples": 12000000, "n_features": [10, 100] }, "split_kwargs": {"ignore": true} @@ -18,7 +18,7 @@ "source": "make_regression", "split_kwargs": {"train_size": 0.2, "test_size": 0.8}, "generation_kwargs": { - "n_samples": 1200000, + "n_samples": 12000000, "n_features": [10, 100], "n_informative": 5, "noise": 2.0 @@ -30,7 +30,7 @@ "source": "make_blobs", "generation_kwargs": { "centers": 1, - "n_samples": 1200000, + "n_samples": 12000000, "n_features": [10, 100] }, "split_kwargs": {"ignore": true} diff --git a/configs/spmd/large_scale/spmd_for_online.json b/configs/spmd/large_scale/spmd_for_online.json index 0a4bc9da..7f258e9b 100644 --- a/configs/spmd/large_scale/spmd_for_online.json +++ b/configs/spmd/large_scale/spmd_for_online.json @@ -7,7 +7,7 @@ "source": "make_blobs", "generation_kwargs": { "centers": 1, - "n_samples": 100000, + "n_samples": 1000000, "n_features": [10, 100] }, "split_kwargs": {"ignore": true} @@ -19,7 +19,7 @@ "source": "make_blobs", "generation_kwargs": { "centers": 1, - "n_samples": 100000, + "n_samples": 1000000, "n_features": [10, 100] }, "split_kwargs": {"ignore": true} @@ -30,7 +30,7 @@ "source": "make_regression", "split_kwargs": {"train_size": 0.2, "test_size": 0.8}, "generation_kwargs": { - "n_samples": 100000, + "n_samples": 1000000, "n_features": [10, 100], "n_informative": 5, "noise": 2.0 @@ -42,7 +42,7 @@ "source": "make_blobs", "generation_kwargs": { "centers": 1, - "n_samples": 100000, + "n_samples": 1000000, "n_features": [10, 100] }, "split_kwargs": {"ignore": true} diff --git 
a/configs/spmd/large_scale/spmd_for_online_strong.json b/configs/spmd/large_scale/spmd_for_online_strong.json index 152e94f3..77a25075 100644 --- a/configs/spmd/large_scale/spmd_for_online_strong.json +++ b/configs/spmd/large_scale/spmd_for_online_strong.json @@ -7,7 +7,7 @@ "source": "make_blobs", "generation_kwargs": { "centers": 1, - "n_samples": 1200000, + "n_samples": 12000000, "n_features": [10, 100] }, "split_kwargs": {"ignore": true} @@ -19,7 +19,7 @@ "source": "make_blobs", "generation_kwargs": { "centers": 1, - "n_samples": 1200000, + "n_samples": 12000000, "n_features": [10, 100] }, "split_kwargs": {"ignore": true} @@ -30,7 +30,7 @@ "source": "make_regression", "split_kwargs": {"train_size": 0.2, "test_size": 0.8}, "generation_kwargs": { - "n_samples": 1200000, + "n_samples": 12000000, "n_features": [10, 100], "n_informative": 5, "noise": 2.0 @@ -42,7 +42,7 @@ "source": "make_blobs", "generation_kwargs": { "centers": 1, - "n_samples": 1200000, + "n_samples": 12000000, "n_features": [10, 100] }, "split_kwargs": {"ignore": true} From 894ed1d5678cc1d116b8c74446494af3d9b54550 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 26 Sep 2024 01:24:27 -0700 Subject: [PATCH 038/110] batch vs online fix --- configs/regular/batch_for_online.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/regular/batch_for_online.json b/configs/regular/batch_for_online.json index cbcdaa5c..d4239c65 100644 --- a/configs/regular/batch_for_online.json +++ b/configs/regular/batch_for_online.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["./common/sklearn.json"], + "INCLUDE": ["../common/sklearn.json"], "PARAMETERS_SETS": { "common": {"bench": {"n_runs": 10, "time_limit": 60}}, "basic_statistics data": { From 36c57c3734f7cf1ac2c44d7ba9e4c7bd47210725 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 26 Sep 2024 01:41:52 -0700 Subject: [PATCH 039/110] separate nodes --- configs/spmd/large_scale/large_scale.json | 80 +++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json index 6469b8aa..74388728 100644 --- a/configs/spmd/large_scale/large_scale.json +++ b/configs/spmd/large_scale/large_scale.json @@ -63,6 +63,75 @@ "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, + "large scale 64 parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale 128 to 2k parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + + "large scale 128 parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + 
"bench": { + "mpi_params": {"n": [1536], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + + "large scale 256 parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [3072], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + + "large scale 512 parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [6144], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + + "large scale 1024 parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [12288], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + + "large scale 2048 parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale two nodes parameters": { "data": { "dtype": "float64", @@ -73,6 +142,16 @@ } }, "large scale strong 2k parameters": { + "data": { + "dtype": "float64", + "distributed_split": "rank_based" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + + "large scale strong 64 parameters": { "data": { "dtype": "float64", "distributed_split": "rank_based" @@ -81,6 +160,7 @@ "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, + "large scale strong two nodes parameters": { "data": { "dtype": "float64", From 08f0aa89d030b644b238a185be5226d884997d89 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 26 Sep 2024 02:11:35 -0700 Subject: [PATCH 040/110] minor --- configs/spmd/large_scale/knn_strong.json | 2 +- configs/spmd/large_scale/large_scale.json | 4 ++-- configs/spmd/large_scale/spmd_for_online.json | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/spmd/large_scale/knn_strong.json b/configs/spmd/large_scale/knn_strong.json index 7682dc5e..a3236c74 100644 --- a/configs/spmd/large_scale/knn_strong.json +++ b/configs/spmd/large_scale/knn_strong.json @@ 
-30,7 +30,7 @@ "common knn parameters", "synthetic classification data", "sklearnex spmd implementation", - "large scale strong 2k parameters", + "large scale strong 32 parameters", "spmd knn cls parameters" ] } diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json index 74388728..55a70fbf 100644 --- a/configs/spmd/large_scale/large_scale.json +++ b/configs/spmd/large_scale/large_scale.json @@ -151,13 +151,13 @@ } }, - "large scale strong 64 parameters": { + "large scale strong 32 parameters": { "data": { "dtype": "float64", "distributed_split": "rank_based" }, "bench": { - "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, diff --git a/configs/spmd/large_scale/spmd_for_online.json b/configs/spmd/large_scale/spmd_for_online.json index 7f258e9b..ec42a050 100644 --- a/configs/spmd/large_scale/spmd_for_online.json +++ b/configs/spmd/large_scale/spmd_for_online.json @@ -51,7 +51,7 @@ }, "TEMPLATES": { "basic_statistics": {"SETS": ["basic_statistics data", "spmd default parameters", "sklearnex spmd implementation", "large scale full one node parameters"]}, - "covariance": {"SETS": ["covariance data", "spmd default parameters","sklearnex spmd implementation", "large scale full one node parameters"]}, + "covariance": {"SETS": ["covariance data", "spmd default parameters", "sklearnex spmd implementation", "large scale full one node parameters"]}, "linear_regression": { "SETS": ["linear_regression data", "spmd default parameters", "sklearnex spmd implementation", "large scale full one node parameters"] }, From 33022127814a52e26fea361fc24c4f304e46f587 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 26 Sep 2024 02:27:15 -0700 Subject: [PATCH 041/110] dbscan --- configs/regular/batch_for_online.json | 2 +- configs/spmd/large_scale/dbscan_strong.json | 31 +++++++++++++++++++++ configs/spmd/large_scale/kmeans_strong.json | 2 +- 3 files changed, 33 insertions(+), 2 deletions(-) create mode 100644 configs/spmd/large_scale/dbscan_strong.json diff --git a/configs/regular/batch_for_online.json b/configs/regular/batch_for_online.json index d4239c65..9e53081b 100644 --- a/configs/regular/batch_for_online.json +++ b/configs/regular/batch_for_online.json @@ -1,7 +1,7 @@ { "INCLUDE": ["../common/sklearn.json"], "PARAMETERS_SETS": { - "common": {"bench": {"n_runs": 10, "time_limit": 60}}, + "common": {"bench": {"n_runs": 10}}, "basic_statistics data": { "data": { "source": "make_blobs", diff --git a/configs/spmd/large_scale/dbscan_strong.json b/configs/spmd/large_scale/dbscan_strong.json new file mode 100644 index 00000000..4e96eafa --- /dev/null +++ b/configs/spmd/large_scale/dbscan_strong.json @@ -0,0 +1,31 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/dbscan.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd dbscan parameters": { + "algorithm": { + "estimator": "DBSCAN", + "estimator_methods": { + "training": "fit" + } + }, + "data": { + "dtype": "float64" + } + }, + "synthetic dataset": { + 
"data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 10, "centers": 10 }, "algorithm": { "eps": 5, "min_samples": 5 } } + ] + } + }, + "TEMPLATES": { + "dbscan": { + "SETS": [ + "synthetic dataset", + "sklearnex spmd implementation", + "large scale strong 32 parameters", + "spmd dbscan parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/kmeans_strong.json b/configs/spmd/large_scale/kmeans_strong.json index 29cfc2e7..6f095af0 100644 --- a/configs/spmd/large_scale/kmeans_strong.json +++ b/configs/spmd/large_scale/kmeans_strong.json @@ -23,7 +23,7 @@ "SETS": [ "synthetic data", "sklearnex spmd implementation", - "large scale strong 2k parameters", + "large scale strong 32 parameters", "spmd kmeans parameters" ] } From 1779a9f7b8b57f79f7031894da657fe9e53072b6 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 26 Sep 2024 21:48:26 -0700 Subject: [PATCH 042/110] config fixes --- configs/spmd/large_scale/knn_strong.json | 2 +- configs/spmd/large_scale/spmd_for_online.json | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/configs/spmd/large_scale/knn_strong.json b/configs/spmd/large_scale/knn_strong.json index a3236c74..4afe8684 100644 --- a/configs/spmd/large_scale/knn_strong.json +++ b/configs/spmd/large_scale/knn_strong.json @@ -20,7 +20,7 @@ "synthetic classification data": { "data": [ { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 5000 }, "generation_kwargs": { "n_samples": 505000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, - { "source": "make_classification", "split_kwargs": { "train_size": 5000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 500500, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + { "source": "make_classification", "split_kwargs": { "train_size": 5000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 505000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } ] } }, diff --git a/configs/spmd/large_scale/spmd_for_online.json b/configs/spmd/large_scale/spmd_for_online.json index ec42a050..11b0b159 100644 --- a/configs/spmd/large_scale/spmd_for_online.json +++ b/configs/spmd/large_scale/spmd_for_online.json @@ -50,11 +50,11 @@ } }, "TEMPLATES": { - "basic_statistics": {"SETS": ["basic_statistics data", "spmd default parameters", "sklearnex spmd implementation", "large scale full one node parameters"]}, - "covariance": {"SETS": ["covariance data", "spmd default parameters", "sklearnex spmd implementation", "large scale full one node parameters"]}, + "basic_statistics": {"SETS": ["basic_statistics data", "sklearnex spmd implementation", "large scale full one node parameters"]}, + "covariance": {"SETS": ["covariance data", "sklearnex spmd implementation", "large scale full one node parameters"]}, "linear_regression": { - "SETS": ["linear_regression data", "spmd default parameters", "sklearnex spmd implementation", "large scale full one node parameters"] + "SETS": ["linear_regression data", "sklearnex spmd implementation", "large scale full one node parameters"] }, - "pca": {"SETS": ["pca data", "spmd default parameters", "sklearnex spmd implementation", "large scale full one node parameters"]} + "pca": {"SETS": ["pca data", "sklearnex spmd implementation", "large scale full one node parameters"]} } } From 4ac119e3e86d6e534ed96f57ab5f371264d8a6d4 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 26 Sep 2024 22:08:09 -0700 Subject: 
[PATCH 043/110] config fix --- configs/spmd/large_scale/spmd_for_online.json | 44 +++++++++++++++++-- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/configs/spmd/large_scale/spmd_for_online.json b/configs/spmd/large_scale/spmd_for_online.json index 11b0b159..53ac660e 100644 --- a/configs/spmd/large_scale/spmd_for_online.json +++ b/configs/spmd/large_scale/spmd_for_online.json @@ -47,14 +47,50 @@ }, "split_kwargs": {"ignore": true} } + }, + "basic_statistics": { + "algorithm": [ + { + "estimator": "BasicStatistics", + "library": "sklearnex.spmd", + "estimator_methods": {"training": "fit"} + } + ] + }, + "covariance": { + "algorithm": [ + { + "estimator": "EmpiricalCovariance", + "library": "sklearnex.spmd.covariance", + "estimator_methods": {"training": "fit"} + } + ] + }, + "linear_regression": { + "algorithm": [ + { + "estimator": "LinearRegression", + "library": "sklearnex.spmd.linear_model", + "estimator_methods": {"training": "fit"} + } + ] + }, + "pca": { + "algorithm": [ + { + "estimator": "PCA", + "library": "sklearnex.spmd.decomposition", + "estimator_methods": {"training": "fit"} + } + ] } }, "TEMPLATES": { - "basic_statistics": {"SETS": ["basic_statistics data", "sklearnex spmd implementation", "large scale full one node parameters"]}, - "covariance": {"SETS": ["covariance data", "sklearnex spmd implementation", "large scale full one node parameters"]}, + "basic_statistics": {"SETS": ["basic_statistics", "basic_statistics data", "sklearnex spmd implementation", "large scale full one node parameters"]}, + "covariance": {"SETS": ["covariance", "covariance data", "sklearnex spmd implementation", "large scale full one node parameters"]}, "linear_regression": { - "SETS": ["linear_regression data", "sklearnex spmd implementation", "large scale full one node parameters"] + "SETS": ["linear_regression", "linear_regression data", "sklearnex spmd implementation", "large scale full one node parameters"] }, - "pca": {"SETS": ["pca data", "sklearnex spmd implementation", "large scale full one node parameters"]} + "pca": {"SETS": ["pca", "pca data", "sklearnex spmd implementation", "large scale full one node parameters"]} } } From 902f0ec0df358d91fa200c42771c9ad96a169ecb Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Fri, 27 Sep 2024 06:25:12 -0700 Subject: [PATCH 044/110] forest regression --- configs/spmd/large_scale/forest_strong_reg.json | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/configs/spmd/large_scale/forest_strong_reg.json b/configs/spmd/large_scale/forest_strong_reg.json index 71afeee6..a5a0c253 100644 --- a/configs/spmd/large_scale/forest_strong_reg.json +++ b/configs/spmd/large_scale/forest_strong_reg.json @@ -8,7 +8,9 @@ }, "synthetic data": { "data": [ - { "source": "make_regression", "generation_kwargs": { "n_samples": 10000000, "n_features": 100, "noise": 1.25 }, "split_kwargs": { "train_size": 10000000, "test_size": 5000 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } }} + { "source": "make_regression", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 9090000, "test_size": 100000 }, "algorithm": { "estimator_params": { "n_estimators": 5, "max_depth": 4 } }}, + { "source": "make_regression", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 90000, "test_size": 10000 }, "algorithm": { "estimator_params": { "n_estimators": 10, "max_depth": 4 } }} + ] } }, @@ -16,7 +18,7 @@ 
"forestReg": { "SETS": [ "sklearnex spmd implementation", - "large scale strong 2k parameters", + "large scale strong 32 parameters", "synthetic data", "spmd forest regression parameters" ] From d40389eec8c72207174e8f9fdf9e67dedf0c518b Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Fri, 27 Sep 2024 06:35:13 -0700 Subject: [PATCH 045/110] forest regression --- configs/spmd/large_scale/forest_reg.json | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/configs/spmd/large_scale/forest_reg.json b/configs/spmd/large_scale/forest_reg.json index ab2a6920..3191eb28 100644 --- a/configs/spmd/large_scale/forest_reg.json +++ b/configs/spmd/large_scale/forest_reg.json @@ -8,7 +8,9 @@ }, "synthetic data": { "data": [ - { "source": "make_regression", "generation_kwargs": { "n_samples": 10000000, "n_features": 100, "noise": 1.25 }, "split_kwargs": { "train_size": 10000000, "test_size": 5000 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } }} + { "source": "make_regression", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 9090000, "test_size": 100000 }, "algorithm": { "estimator_params": { "n_estimators": 5, "max_depth": 4 } }}, + { "source": "make_regression", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 90000, "test_size": 10000 }, "algorithm": { "estimator_params": { "n_estimators": 10, "max_depth": 4 } }} + ] } }, From 906de020380c5b23336374ceaf33a54eaf47e294 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Fri, 27 Sep 2024 07:06:51 -0700 Subject: [PATCH 046/110] forest regression --- configs/spmd/large_scale/forest_reg.json | 4 ++-- configs/spmd/large_scale/forest_strong_reg.json | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/spmd/large_scale/forest_reg.json b/configs/spmd/large_scale/forest_reg.json index 3191eb28..daab32c4 100644 --- a/configs/spmd/large_scale/forest_reg.json +++ b/configs/spmd/large_scale/forest_reg.json @@ -8,8 +8,8 @@ }, "synthetic data": { "data": [ - { "source": "make_regression", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 9090000, "test_size": 100000 }, "algorithm": { "estimator_params": { "n_estimators": 5, "max_depth": 4 } }}, - { "source": "make_regression", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 90000, "test_size": 10000 }, "algorithm": { "estimator_params": { "n_estimators": 10, "max_depth": 4 } }} + { "source": "make_regression", "generation_kwargs": { "n_samples": 1000000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 900000, "test_size": 100000 }, "algorithm": { "estimator_params": { "n_estimators": 5, "max_depth": 4 } }}, + { "source": "make_regression", "generation_kwargs": { "n_samples": 100000, "n_features": 100, "noise": 1.25 }, "split_kwargs": { "train_size": 90000, "test_size": 10000 }, "algorithm": { "estimator_params": { "n_estimators": 10, "max_depth": 4 } }} ] } diff --git a/configs/spmd/large_scale/forest_strong_reg.json b/configs/spmd/large_scale/forest_strong_reg.json index a5a0c253..8d738389 100644 --- a/configs/spmd/large_scale/forest_strong_reg.json +++ b/configs/spmd/large_scale/forest_strong_reg.json @@ -8,8 +8,8 @@ }, "synthetic data": { "data": [ - { "source": "make_regression", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "noise": 1.25 }, 
"split_kwargs": { "train_size": 9090000, "test_size": 100000 }, "algorithm": { "estimator_params": { "n_estimators": 5, "max_depth": 4 } }}, - { "source": "make_regression", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 90000, "test_size": 10000 }, "algorithm": { "estimator_params": { "n_estimators": 10, "max_depth": 4 } }} + { "source": "make_regression", "generation_kwargs": { "n_samples": 1000000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 900000, "test_size": 10000 }, "algorithm": { "estimator_params": { "n_estimators": 5, "max_depth": 4 } }}, + { "source": "make_regression", "generation_kwargs": { "n_samples": 100000, "n_features": 100, "noise": 1.25 }, "split_kwargs": { "train_size": 90000, "test_size": 10000 }, "algorithm": { "estimator_params": { "n_estimators": 10, "max_depth": 4 } }} ] } From 7348b42870900b75cc99842564516f8bf082fe22 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 1 Oct 2024 16:56:52 -0700 Subject: [PATCH 047/110] kmeans and logreg update --- configs/spmd/large_scale/kmeans.json | 5 ++--- configs/spmd/large_scale/logreg.json | 3 ++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/spmd/large_scale/kmeans.json b/configs/spmd/large_scale/kmeans.json index 89524965..caba8a79 100644 --- a/configs/spmd/large_scale/kmeans.json +++ b/configs/spmd/large_scale/kmeans.json @@ -12,9 +12,8 @@ }, "synthetic data": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, - { "source": "make_blobs", "generation_kwargs": { "n_samples": 30000, "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, - { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 100, "centers": 100 }, "algorithm": { "n_clusters": 100, "max_iter": 100 } } + { "source": "make_blobs", "generation_kwargs": { "n_samples": 3750000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 18750, "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } } ] } }, diff --git a/configs/spmd/large_scale/logreg.json b/configs/spmd/large_scale/logreg.json index bbd18f3b..a23a745e 100644 --- a/configs/spmd/large_scale/logreg.json +++ b/configs/spmd/large_scale/logreg.json @@ -11,7 +11,8 @@ "synthetic data": { "data": [ { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 10, "n_classes": 2 } }, - { "source": "make_classification", "split_kwargs": { "train_size": 100000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 101000, "n_features": 1000, "n_classes": 2 } } + { "source": "make_classification", "split_kwargs": { "train_size": 100000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 101000, "n_features": 1000, "n_classes": 2 } }, + { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 501000, "n_features": 200, "n_classes":2, "n_clusters_per_class": 3, "flip_y":0.05 } } ] } }, From 270c8417a73392e364f5ade6f4f4f83320190286 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 1 Oct 2024 17:24:27 -0700 Subject: [PATCH 048/110] forest reg data same as cls --- configs/spmd/large_scale/forest_reg.json | 4 
++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/spmd/large_scale/forest_reg.json b/configs/spmd/large_scale/forest_reg.json index daab32c4..58cb3962 100644 --- a/configs/spmd/large_scale/forest_reg.json +++ b/configs/spmd/large_scale/forest_reg.json @@ -8,8 +8,8 @@ }, "synthetic data": { "data": [ - { "source": "make_regression", "generation_kwargs": { "n_samples": 1000000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 900000, "test_size": 100000 }, "algorithm": { "estimator_params": { "n_estimators": 5, "max_depth": 4 } }}, - { "source": "make_regression", "generation_kwargs": { "n_samples": 100000, "n_features": 100, "noise": 1.25 }, "split_kwargs": { "train_size": 90000, "test_size": 10000 }, "algorithm": { "estimator_params": { "n_estimators": 10, "max_depth": 4 } }} + { "source": "make_regression", "generation_kwargs": { "n_samples": 501000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } }}, + { "source": "make_regression", "generation_kwargs": { "n_samples": 11000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 10000, "test_size": 1000 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } }} ] } From d172d2a2dc93f7acf31d33918809124510a6709a Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 1 Oct 2024 18:50:19 -0700 Subject: [PATCH 049/110] knn bf16 --- configs/regular/bf16/knn.json | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/configs/regular/bf16/knn.json b/configs/regular/bf16/knn.json index 1a62ef89..049ed492 100644 --- a/configs/regular/bf16/knn.json +++ b/configs/regular/bf16/knn.json @@ -17,9 +17,14 @@ }, "synthetic classification data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 51000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } ] - } + }, + "synthetic regression data": { + "data": [ + { "source": "make_regression", "split_kwargs": { "train_size": 50000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 51000, "n_features": 100, "noise":1.5 } } + ] + } }, "TEMPLATES": { "sklearn brute knn clsf": { @@ -27,7 +32,8 @@ "sklearn-ex[gpu] implementations", "common knn parameters", "sklearn knn parameters", - "synthetic classification data" + "synthetic classification data", + "synthetic regression data" ] } } From 29ea28838d4458aa866e3658f81c0c011c744b3d Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Wed, 2 Oct 2024 11:38:06 -0700 Subject: [PATCH 050/110] cov regular prev --- configs/regular/batch_for_online.json | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/configs/regular/batch_for_online.json b/configs/regular/batch_for_online.json index 9e53081b..2bcdc47f 100644 --- a/configs/regular/batch_for_online.json +++ b/configs/regular/batch_for_online.json @@ -45,6 +45,15 @@ } ] }, + "covariance": { + "algorithm": [ + { + "estimator": "EmpiricalCovariance", + "library": "sklearnex.preview.covariance", + "estimator_methods": {"training": "fit"} + } + ] + }, "linear_regression": { "algorithm": [ { @@ -66,6 +75,7 @@ }, "TEMPLATES": { "basic_statistics": {"SETS": 
["common", "basic_statistics", "basic_statistics data", "sklearn-ex[gpu] implementations"]}, + "covariance": {"SETS": ["common", "basic_statistics data", "sklearn-ex[gpu] implementations", "covariance"]}, "linear_regression": { "SETS": ["common", "linear_regression", "linear_regression data", "sklearn-ex[gpu] implementations"] }, From 13c0514e64227134bfae874f58f557121073b450 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Mon, 7 Oct 2024 22:44:01 +0200 Subject: [PATCH 051/110] Update logreg.json --- configs/regular/bf16/logreg.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/regular/bf16/logreg.json b/configs/regular/bf16/logreg.json index cde74c25..e2babdd1 100644 --- a/configs/regular/bf16/logreg.json +++ b/configs/regular/bf16/logreg.json @@ -10,14 +10,14 @@ "tol": 1e-4, "C": 1.0, "l1_ratio": null, - "max_iter": 200 + "max_iter": 20 } } }, "sklearn logreg parameters": { "algorithm": { "estimator_params": { - "solver": "lbfgs", + "solver": "newton-cg", "n_jobs": "[SPECIAL_VALUE]physical_cpus", "random_state": 42 } From 8532908888208b263918bfecec0ed758bfd8433b Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Mon, 7 Oct 2024 23:17:11 +0200 Subject: [PATCH 052/110] Update ensemble.json --- configs/regular/bf16/ensemble.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/configs/regular/bf16/ensemble.json b/configs/regular/bf16/ensemble.json index d383bcac..556da67a 100644 --- a/configs/regular/bf16/ensemble.json +++ b/configs/regular/bf16/ensemble.json @@ -13,6 +13,9 @@ "bootstrap": true, "random_state": 42 } + }, + "data": { + "dtype": ["float32"] } }, "sklearn ensemble classifier params": { From c3ac4bb08b12d429779f3a5b93f3d591489f2fed Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Mon, 7 Oct 2024 23:17:33 +0200 Subject: [PATCH 053/110] Update kmeans.json --- configs/regular/bf16/kmeans.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/regular/bf16/kmeans.json b/configs/regular/bf16/kmeans.json index 084ae8f4..8a5323c5 100644 --- a/configs/regular/bf16/kmeans.json +++ b/configs/regular/bf16/kmeans.json @@ -14,7 +14,7 @@ "estimator_methods": { "inference": "predict" } }, "data": { - "dtype": ["float32", "float64"], + "dtype": ["float32"], "preprocessing_kwargs": { "normalize": true } } }, From a8d898b1016d5e14df189ee80f896cee205eaf8c Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Mon, 7 Oct 2024 23:18:37 +0200 Subject: [PATCH 054/110] Update knn.json --- configs/regular/bf16/knn.json | 1 + 1 file changed, 1 insertion(+) diff --git a/configs/regular/bf16/knn.json b/configs/regular/bf16/knn.json index 049ed492..527dcbe4 100644 --- a/configs/regular/bf16/knn.json +++ b/configs/regular/bf16/knn.json @@ -9,6 +9,7 @@ } }, "data": { + "dtype": ["float32"], "preprocessing_kwargs": { "normalize": true } } }, From fe90de288ab5439d3a02f77136f252e7c38d0cff Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Mon, 7 Oct 2024 23:19:32 +0200 Subject: [PATCH 055/110] Update logreg.json --- configs/regular/bf16/logreg.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/configs/regular/bf16/logreg.json b/configs/regular/bf16/logreg.json index e2babdd1..863d67f9 100644 --- a/configs/regular/bf16/logreg.json +++ b/configs/regular/bf16/logreg.json @@ -12,6 +12,9 @@ "l1_ratio": null, "max_iter": 20 } + }, + "data": { + "dtype": ["float32"] } }, "sklearn logreg parameters": { From 7ab1cc3a482daefe33dd82c631345f897c904cd7 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Mon, 7 Oct 2024 23:21:22 +0200 Subject: [PATCH 056/110] Update pca.json --- 
configs/regular/bf16/pca.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/configs/regular/bf16/pca.json b/configs/regular/bf16/pca.json index 945c2939..e5113261 100644 --- a/configs/regular/bf16/pca.json +++ b/configs/regular/bf16/pca.json @@ -13,6 +13,9 @@ "iterated_power": 15, "random_state": 42 } + }, + "data": { + "dtype": ["float32"] } }, "synthetic data": { From 595a7ee974b6789a5fdddf89bf3e24adab11cb13 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Mon, 7 Oct 2024 23:22:22 +0200 Subject: [PATCH 057/110] Update linear_model.json --- configs/regular/bf16/linear_model.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/configs/regular/bf16/linear_model.json b/configs/regular/bf16/linear_model.json index 7149e490..393b2c64 100644 --- a/configs/regular/bf16/linear_model.json +++ b/configs/regular/bf16/linear_model.json @@ -10,6 +10,9 @@ "algorithm": { "estimator": "LinearRegression", "estimator_params": { "fit_intercept": true, "copy_X": true } + }, + "data": { + "dtype": ["float32"] } }, "sklearn linear parameters": { From 80257199e245c0685664f541219ee97533f6a1cc Mon Sep 17 00:00:00 2001 From: ethanglaser Date: Mon, 7 Oct 2024 21:45:55 +0000 Subject: [PATCH 058/110] dbscan large scale support and logreg details --- configs/spmd/large_scale/dbscan_strong.json | 32 +++++++++++++++++++++ configs/spmd/large_scale/large_scale.json | 27 +++++++++++++++++ configs/spmd/large_scale/logreg_strong.json | 2 +- sklbench/utils/measurement.py | 7 +++++ 4 files changed, 67 insertions(+), 1 deletion(-) create mode 100644 configs/spmd/large_scale/dbscan_strong.json diff --git a/configs/spmd/large_scale/dbscan_strong.json b/configs/spmd/large_scale/dbscan_strong.json new file mode 100644 index 00000000..1843cd8c --- /dev/null +++ b/configs/spmd/large_scale/dbscan_strong.json @@ -0,0 +1,32 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/dbscan.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd dbscan parameters": { + "algorithm": { + "estimator": "DBSCAN", + "estimator_methods": { + "training": "fit" + } + }, + "data": { + "dtype": "float64" + } + }, + "synthetic dataset": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 400000, "n_features": 100, "centers": 10 }, "algorithm": { "eps": 5, "min_samples": 5 } } + ] + } + }, + "TEMPLATES": { + "dbscan": { + "SETS": [ + "common dbscan parameters", + "synthetic dataset", + "sklearnex spmd implementation", + "large scale strong parameters", + "spmd dbscan parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json index 72b808fe..06a8db16 100644 --- a/configs/spmd/large_scale/large_scale.json +++ b/configs/spmd/large_scale/large_scale.json @@ -27,6 +27,24 @@ "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, + "large scale <64 parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale >64 parameters": { + 
"data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, "large scale strong 2k parameters": { "data": { "dtype": "float64", @@ -36,6 +54,15 @@ "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, + "large scale strong <64 parameters": { + "data": { + "dtype": "float64", + "distributed_split": "rank_based" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, "large scale impi parameters": { "data": { "dtype": "float64", diff --git a/configs/spmd/large_scale/logreg_strong.json b/configs/spmd/large_scale/logreg_strong.json index 2bf1c0f9..8787f6b6 100644 --- a/configs/spmd/large_scale/logreg_strong.json +++ b/configs/spmd/large_scale/logreg_strong.json @@ -5,7 +5,7 @@ "algorithm":{ "estimator": "LogisticRegression", "estimator_methods": { "inference": "predict" }, - "estimator_params": { "max_iter": 30 } + "estimator_params": { "max_iter": 16 } } }, "synthetic data": { diff --git a/sklbench/utils/measurement.py b/sklbench/utils/measurement.py index df74e8da..ea86d29f 100644 --- a/sklbench/utils/measurement.py +++ b/sklbench/utils/measurement.py @@ -72,12 +72,16 @@ def measure_time( ) times = [] func_return_value = None + inners, iters = [], [] while len(times) < n_runs: if enable_itt and itt_is_available: itt.resume() t0 = timeit.default_timer() func_return_value = func(*args, **kwargs) t1 = timeit.default_timer() + if hasattr(func.__self__, "_n_inner_iter"): + inners.append(func.__self__._n_inner_iter) + iters.append(func.__self__.n_iter_) if enable_itt and itt_is_available: itt.pause() times.append(t1 - t0) @@ -88,6 +92,9 @@ def measure_time( f"exceeded time limit ({time_limit} seconds)" ) break + from mpi4py import MPI + if MPI.COMM_WORLD.Get_rank() == 0: + logger.debug("iters across n runs: " + str(iters) + ", inner iters across n runs: " + str(inners)) logger.debug(times) #mean, std = box_filter(times) #if std / mean > std_mean_ratio: From fcaa9077f9f987450642a7ef4d42924b5551780e Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 8 Oct 2024 00:51:01 -0400 Subject: [PATCH 059/110] reformat --- configs/regular/batch_for_online.json | 3 ++- configs/spmd/large_scale/basic_stats.json | 2 +- configs/spmd/large_scale/basic_stats_single.json | 2 +- configs/spmd/large_scale/basic_stats_strong.json | 2 +- configs/spmd/large_scale/covariance.json | 2 +- configs/spmd/large_scale/covariance_strong.json | 2 +- configs/spmd/large_scale/dbscan.json | 2 +- configs/spmd/large_scale/dbscan_strong.json | 2 +- configs/spmd/large_scale/forest.json | 2 +- configs/spmd/large_scale/forest_reg.json | 2 +- configs/spmd/large_scale/forest_strong.json | 2 +- configs/spmd/large_scale/forest_strong_reg.json | 2 +- 
configs/spmd/large_scale/knn.json | 2 +- configs/spmd/large_scale/knn_strong.json | 2 +- configs/spmd/large_scale/linear_model.json | 2 +- configs/spmd/large_scale/linear_model_strong.json | 2 +- configs/spmd/large_scale/logreg.json | 11 ----------- configs/spmd/large_scale/logreg_2.json | 4 ++-- configs/spmd/large_scale/logreg_strong.json | 4 ++-- configs/spmd/large_scale/logreg_strong_2.json | 4 ++-- configs/spmd/large_scale/pca.json | 4 ++-- configs/spmd/large_scale/pca_single.json | 4 ++-- configs/spmd/large_scale/pca_strong.json | 4 ++-- 23 files changed, 29 insertions(+), 39 deletions(-) diff --git a/configs/regular/batch_for_online.json b/configs/regular/batch_for_online.json index 2bcdc47f..973c4ed4 100644 --- a/configs/regular/batch_for_online.json +++ b/configs/regular/batch_for_online.json @@ -81,4 +81,5 @@ }, "pca": {"SETS": ["common", "pca", "pca data", "sklearn-ex[gpu] implementations"]} } -} \ No newline at end of file +} + diff --git a/configs/spmd/large_scale/basic_stats.json b/configs/spmd/large_scale/basic_stats.json index b484b647..9ac4725f 100644 --- a/configs/spmd/large_scale/basic_stats.json +++ b/configs/spmd/large_scale/basic_stats.json @@ -22,7 +22,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale 2k parameters", - "synthetic data", + "synthetic data", "spmd basicstats parameters" ] } diff --git a/configs/spmd/large_scale/basic_stats_single.json b/configs/spmd/large_scale/basic_stats_single.json index e106b2a9..832bd3b2 100644 --- a/configs/spmd/large_scale/basic_stats_single.json +++ b/configs/spmd/large_scale/basic_stats_single.json @@ -22,7 +22,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale one node parameters", - "synthetic data", + "synthetic data", "spmd basicstats parameters" ] } diff --git a/configs/spmd/large_scale/basic_stats_strong.json b/configs/spmd/large_scale/basic_stats_strong.json index 6527d8e5..b7aa22cb 100644 --- a/configs/spmd/large_scale/basic_stats_strong.json +++ b/configs/spmd/large_scale/basic_stats_strong.json @@ -21,7 +21,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale strong 2k parameters", - "synthetic data", + "synthetic data", "spmd basicstats parameters" ] } diff --git a/configs/spmd/large_scale/covariance.json b/configs/spmd/large_scale/covariance.json index e4d0477a..260befd0 100644 --- a/configs/spmd/large_scale/covariance.json +++ b/configs/spmd/large_scale/covariance.json @@ -22,7 +22,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale 2k parameters", - "synthetic data", + "synthetic data", "spmd basicstats parameters" ] } diff --git a/configs/spmd/large_scale/covariance_strong.json b/configs/spmd/large_scale/covariance_strong.json index 2b9c5dd0..568b4a8f 100644 --- a/configs/spmd/large_scale/covariance_strong.json +++ b/configs/spmd/large_scale/covariance_strong.json @@ -21,7 +21,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale strong 2k parameters", - "synthetic data", + "synthetic data", "spmd basicstats parameters" ] } diff --git a/configs/spmd/large_scale/dbscan.json b/configs/spmd/large_scale/dbscan.json index b17e2cd8..c46287d8 100644 --- a/configs/spmd/large_scale/dbscan.json +++ b/configs/spmd/large_scale/dbscan.json @@ -24,7 +24,7 @@ "common dbscan parameters", "synthetic dataset", "sklearnex spmd implementation", - "large scale default parameters", + "large scale default parameters", "spmd dbscan parameters" ] } diff --git a/configs/spmd/large_scale/dbscan_strong.json b/configs/spmd/large_scale/dbscan_strong.json index 4e96eafa..05b00d39 100644 --- 
a/configs/spmd/large_scale/dbscan_strong.json +++ b/configs/spmd/large_scale/dbscan_strong.json @@ -23,7 +23,7 @@ "SETS": [ "synthetic dataset", "sklearnex spmd implementation", - "large scale strong 32 parameters", + "large scale strong 32 parameters", "spmd dbscan parameters" ] } diff --git a/configs/spmd/large_scale/forest.json b/configs/spmd/large_scale/forest.json index 5aa3d36f..9cab46be 100644 --- a/configs/spmd/large_scale/forest.json +++ b/configs/spmd/large_scale/forest.json @@ -18,7 +18,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale 2k parameters", - "synthetic data", + "synthetic data", "spmd forest classification parameters" ] } diff --git a/configs/spmd/large_scale/forest_reg.json b/configs/spmd/large_scale/forest_reg.json index 58cb3962..a5ec73cd 100644 --- a/configs/spmd/large_scale/forest_reg.json +++ b/configs/spmd/large_scale/forest_reg.json @@ -19,7 +19,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale 2k parameters", - "synthetic data", + "synthetic data", "spmd forest regression parameters" ] } diff --git a/configs/spmd/large_scale/forest_strong.json b/configs/spmd/large_scale/forest_strong.json index 14690846..9259e7ea 100644 --- a/configs/spmd/large_scale/forest_strong.json +++ b/configs/spmd/large_scale/forest_strong.json @@ -17,7 +17,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale strong 2k parameters", - "synthetic data", + "synthetic data", "spmd forest classification parameters" ] } diff --git a/configs/spmd/large_scale/forest_strong_reg.json b/configs/spmd/large_scale/forest_strong_reg.json index 8d738389..305e729b 100644 --- a/configs/spmd/large_scale/forest_strong_reg.json +++ b/configs/spmd/large_scale/forest_strong_reg.json @@ -19,7 +19,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale strong 32 parameters", - "synthetic data", + "synthetic data", "spmd forest regression parameters" ] } diff --git a/configs/spmd/large_scale/knn.json b/configs/spmd/large_scale/knn.json index cfd096cf..f1e0678d 100644 --- a/configs/spmd/large_scale/knn.json +++ b/configs/spmd/large_scale/knn.json @@ -30,7 +30,7 @@ "common knn parameters", "synthetic classification data", "sklearnex spmd implementation", - "large scale 2k parameters", + "large scale 2k parameters", "spmd knn cls parameters" ] } diff --git a/configs/spmd/large_scale/knn_strong.json b/configs/spmd/large_scale/knn_strong.json index 4afe8684..67398123 100644 --- a/configs/spmd/large_scale/knn_strong.json +++ b/configs/spmd/large_scale/knn_strong.json @@ -30,7 +30,7 @@ "common knn parameters", "synthetic classification data", "sklearnex spmd implementation", - "large scale strong 32 parameters", + "large scale strong 32 parameters", "spmd knn cls parameters" ] } diff --git a/configs/spmd/large_scale/linear_model.json b/configs/spmd/large_scale/linear_model.json index e208da7d..f9d17b5b 100644 --- a/configs/spmd/large_scale/linear_model.json +++ b/configs/spmd/large_scale/linear_model.json @@ -19,7 +19,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale 2k parameters", - "synthetic data", + "synthetic data", "spmd linear parameters" ] } diff --git a/configs/spmd/large_scale/linear_model_strong.json b/configs/spmd/large_scale/linear_model_strong.json index 9d8c3533..77a9c79e 100644 --- a/configs/spmd/large_scale/linear_model_strong.json +++ b/configs/spmd/large_scale/linear_model_strong.json @@ -18,7 +18,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale strong 2k parameters", - "synthetic data", + "synthetic data", "spmd linear parameters" ] } diff 
--git a/configs/spmd/large_scale/logreg.json b/configs/spmd/large_scale/logreg.json index 9fbaee71..c5ef6203 100644 --- a/configs/spmd/large_scale/logreg.json +++ b/configs/spmd/large_scale/logreg.json @@ -10,14 +10,8 @@ }, "synthetic data": { "data": [ -<<<<<<< HEAD - { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 10, "n_classes": 2 } }, - { "source": "make_classification", "split_kwargs": { "train_size": 100000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 101000, "n_features": 1000, "n_classes": 2 } }, - { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 501000, "n_features": 200, "n_classes":2, "n_clusters_per_class": 3, "flip_y":0.05 } } -======= { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 10, "n_classes": 2 } }, { "source": "make_classification", "split_kwargs": { "train_size": 100000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 101000, "n_features": 1000, "n_classes": 2 } } ->>>>>>> oleg_online/inc-dist-support ] } }, @@ -27,13 +21,8 @@ "sklearnex spmd implementation", "large scale 2k parameters", "spmd logreg parameters", -<<<<<<< HEAD - "synthetic data", - "spmd logreg2 parameters" -======= "synthetic data", "spmd logreg2 parameters" ->>>>>>> oleg_online/inc-dist-support ] } } diff --git a/configs/spmd/large_scale/logreg_2.json b/configs/spmd/large_scale/logreg_2.json index d18b2293..796eb8ad 100644 --- a/configs/spmd/large_scale/logreg_2.json +++ b/configs/spmd/large_scale/logreg_2.json @@ -21,8 +21,8 @@ "sklearnex spmd implementation", "large scale two nodes parameters", "spmd logreg parameters", - "synthetic data", - "spmd logreg2 parameters" + "synthetic data", + "spmd logreg2 parameters" ] } } diff --git a/configs/spmd/large_scale/logreg_strong.json b/configs/spmd/large_scale/logreg_strong.json index a6efd969..2bf1c0f9 100644 --- a/configs/spmd/large_scale/logreg_strong.json +++ b/configs/spmd/large_scale/logreg_strong.json @@ -20,8 +20,8 @@ "sklearnex spmd implementation", "large scale strong 2k parameters", "spmd logreg parameters", - "synthetic data", - "spmd logreg2 parameters" + "synthetic data", + "spmd logreg2 parameters" ] } } diff --git a/configs/spmd/large_scale/logreg_strong_2.json b/configs/spmd/large_scale/logreg_strong_2.json index 1a940d90..998e3bb7 100644 --- a/configs/spmd/large_scale/logreg_strong_2.json +++ b/configs/spmd/large_scale/logreg_strong_2.json @@ -20,8 +20,8 @@ "sklearnex spmd implementation", "large scale strong two nodes parameters", "spmd logreg parameters", - "synthetic data", - "spmd logreg2 parameters" + "synthetic data", + "spmd logreg2 parameters" ] } } diff --git a/configs/spmd/large_scale/pca.json b/configs/spmd/large_scale/pca.json index 3b9da126..9a6a6b02 100644 --- a/configs/spmd/large_scale/pca.json +++ b/configs/spmd/large_scale/pca.json @@ -20,10 +20,10 @@ "TEMPLATES": { "linreg": { "SETS": [ - "sklearnex spmd implementation", + "sklearnex spmd implementation", "large scale 2k parameters", "synthetic data", - "spmd pca parameters" + "spmd pca parameters" ] } } diff --git a/configs/spmd/large_scale/pca_single.json b/configs/spmd/large_scale/pca_single.json index 61b2cf15..07775a6a 100644 --- a/configs/spmd/large_scale/pca_single.json +++ b/configs/spmd/large_scale/pca_single.json @@ -20,10 +20,10 @@ 
"TEMPLATES": { "linreg": { "SETS": [ - "sklearnex spmd implementation", + "sklearnex spmd implementation", "large scale one node parameters", "synthetic data", - "spmd pca parameters" + "spmd pca parameters" ] } } diff --git a/configs/spmd/large_scale/pca_strong.json b/configs/spmd/large_scale/pca_strong.json index 2d302340..9063c22e 100644 --- a/configs/spmd/large_scale/pca_strong.json +++ b/configs/spmd/large_scale/pca_strong.json @@ -19,10 +19,10 @@ "TEMPLATES": { "linreg": { "SETS": [ - "sklearnex spmd implementation", + "sklearnex spmd implementation", "large scale strong 2k parameters", "synthetic data", - "spmd pca parameters" + "spmd pca parameters" ] } } From a4653a12d0d6e997961cb0a976031e7c37a250a5 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 8 Oct 2024 01:03:07 -0400 Subject: [PATCH 060/110] knn bf16 --- configs/regular/bf16/knn.json | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/configs/regular/bf16/knn.json b/configs/regular/bf16/knn.json index 527dcbe4..c39b577e 100644 --- a/configs/regular/bf16/knn.json +++ b/configs/regular/bf16/knn.json @@ -17,11 +17,19 @@ "algorithm": { "estimator_params": { "n_jobs": "[SPECIAL_VALUE]physical_cpus" } } }, "synthetic classification data": { + "algorithm": { + "estimator": "KNeighborsClassifier", + "estimator_params": { "algorithm": "brute", "metric": "minkowski", "p": [1, 2] } + }, "data": [ { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 51000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } ] }, "synthetic regression data": { + "algorithm": { + "estimator": "KNeighborsRegressor", + "estimator_params": { "algorithm": "brute", "metric": "minkowski", "p": [1, 2] } + }, "data": [ { "source": "make_regression", "split_kwargs": { "train_size": 50000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 51000, "n_features": 100, "noise":1.5 } } ] @@ -33,7 +41,14 @@ "sklearn-ex[gpu] implementations", "common knn parameters", "sklearn knn parameters", - "synthetic classification data", + "synthetic classification data" + ] + }, + "sklearn brute knn reg": { + "SETS": [ + "sklearn-ex[gpu] implementations", + "common knn parameters", + "sklearn knn parameters", "synthetic regression data" ] } From 4f65e1faff0694ede8704ed5b88360e8944e9e0b Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 8 Oct 2024 01:24:35 -0400 Subject: [PATCH 061/110] add bf16 cases --- configs/regular/bf16/basic_statistics.json | 27 +++++++++++++++++++++ configs/regular/bf16/covariance.json | 28 ++++++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 configs/regular/bf16/basic_statistics.json create mode 100644 configs/regular/bf16/covariance.json diff --git a/configs/regular/bf16/basic_statistics.json b/configs/regular/bf16/basic_statistics.json new file mode 100644 index 00000000..671521ab --- /dev/null +++ b/configs/regular/bf16/basic_statistics.json @@ -0,0 +1,27 @@ +{ + "INCLUDE": ["../../common/sklearn.json"], + "PARAMETERS_SETS": { + "basic stats parameters": { + "algorithm": { + "estimator": "BasicStatistics" + }, + "data": { + "dtype": ["float32"] + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "basic_statistics": { + "SETS": [ + "sklearn-ex[gpu] implementations", + "basic stats parameters", + "synthetic data" + ] + } + } +} diff --git 
a/configs/regular/bf16/covariance.json b/configs/regular/bf16/covariance.json new file mode 100644 index 00000000..1cd6ef4a --- /dev/null +++ b/configs/regular/bf16/covariance.json @@ -0,0 +1,28 @@ +{ + "INCLUDE": ["../../common/sklearn.json"], + "PARAMETERS_SETS": { + "covariance parameters": { + "algorithm": { + "estimator": "EmpiricalCovariance", + "library": "sklearnex.preview.covariance" + }, + "data": { + "dtype": ["float32"] + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "covariance": { + "SETS": [ + "sklearn-ex[gpu] implementations", + "covariance parameters", + "synthetic data" + ] + } + } +} From c8522797fb6d02163f85bf6582e9d5eb585807d6 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 8 Oct 2024 01:47:32 -0400 Subject: [PATCH 062/110] forest bf16 --- configs/regular/bf16/ensemble.json | 48 ------------------------------ configs/regular/bf16/forest.json | 34 +++++++++++++++++++++ 2 files changed, 34 insertions(+), 48 deletions(-) delete mode 100644 configs/regular/bf16/ensemble.json create mode 100644 configs/regular/bf16/forest.json diff --git a/configs/regular/bf16/ensemble.json b/configs/regular/bf16/ensemble.json deleted file mode 100644 index 556da67a..00000000 --- a/configs/regular/bf16/ensemble.json +++ /dev/null @@ -1,48 +0,0 @@ -{ - "INCLUDE": ["../../common/sklearn.json"], - "PARAMETERS_SETS": { - "common ensemble params": { - "algorithm": { - "estimator_params": { - "n_estimators": 200, - "max_depth": 16, - "max_samples": 1.0, - "min_samples_split": 5, - "min_samples_leaf": 2, - "min_impurity_decrease": 0.0, - "bootstrap": true, - "random_state": 42 - } - }, - "data": { - "dtype": ["float32"] - } - }, - "sklearn ensemble classifier params": { - "algorithm": { - "estimator": ["RandomForestClassifier", "ExtraTreesClassifier"], - "estimator_params": { - "criterion": "gini", - "max_features": "sqrt", - "max_leaf_nodes": null, - "n_jobs": "[SPECIAL_VALUE]physical_cpus" - } - } - }, - "synthetic data": { - "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 501000, "n_features": 10, "n_classes": 2 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } } } - ] - } - }, - "TEMPLATES": { - "sklearn ensemble classification": { - "SETS": [ - "sklearn-ex[gpu] implementations", - "common ensemble params", - "sklearn ensemble classifier params", - "synthetic data" - ] - } - } -} diff --git a/configs/regular/bf16/forest.json b/configs/regular/bf16/forest.json new file mode 100644 index 00000000..845b73a2 --- /dev/null +++ b/configs/regular/bf16/forest.json @@ -0,0 +1,34 @@ +{ + "INCLUDE": ["../../common/sklearn.json"], + "PARAMETERS_SETS": { + "common forest params": { + "data": { + "dtype": ["float32"] + } + }, + "forest classifier params": { + "algorithm": {"estimator": "RandomForestClassifier"}, + "data": { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 501000, "n_features": 10, "n_classes": 2 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } } } + }, + "forest regression params": { + "algorithm": {"estimator": "RandomForestRegressor"}, + "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 501000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 500000, "test_size": 1000 }, 
"algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } }} + } + }, + "TEMPLATES": { + "forest cls": { + "SETS": [ + "sklearn-ex[gpu] implementations", + "common forest params", + "forest classifier params" + ] + }, + "forest reg": { + "SETS": [ + "sklearn-ex[gpu] implementations", + "common forest params", + "forest regression params" + ] + } + } +} From 698d884441b27a68b6718ea5d09b0f837bac9a26 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 8 Oct 2024 03:33:12 -0400 Subject: [PATCH 063/110] incremental --- configs/incremental.json | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/configs/incremental.json b/configs/incremental.json index c9ffb19c..e1f589a4 100644 --- a/configs/incremental.json +++ b/configs/incremental.json @@ -1,4 +1,4 @@ -{ +{ "INCLUDE": ["./common/sklearn.json"], "PARAMETERS_SETS": { "common": {"bench": {"n_runs": 10, "time_limit": 60}}, "covariance data": { @@ -7,8 +7,8 @@ "source": "make_blobs", "generation_kwargs": { "centers": 1, - "n_samples": 1000, - "n_features": [16, 64] + "n_samples": 12000000, + "n_features": [10, 100] }, "split_kwargs": {"ignore": true} } @@ -19,8 +19,8 @@ "source": "make_blobs", "generation_kwargs": { "centers": 1, - "n_samples": 10000, - "n_features": [16, 64] + "n_samples": 12000000, + "n_features": [10, 100] }, "split_kwargs": {"ignore": true} } @@ -30,8 +30,8 @@ "source": "make_regression", "split_kwargs": {"train_size": 0.2, "test_size": 0.8}, "generation_kwargs": { - "n_samples": 5000, - "n_features": [40, 100], + "n_samples": 12000000, + "n_features": [10, 100], "n_informative": 5, "noise": 2.0 } @@ -42,8 +42,8 @@ "source": "make_blobs", "generation_kwargs": { "centers": 1, - "n_samples": 1000, - "n_features": [16, 64] + "n_samples": 12000000, + "n_features": [10, 100] }, "split_kwargs": {"ignore": true} } @@ -54,16 +54,17 @@ "estimator": "IncrementalEmpiricalCovariance", "library": "sklearnex.covariance", "estimator_methods": {"training": "partial_fit"}, - "num_batches": {"training": 2} + "num_batches": {"training": 12} } ] - }, + }, "basic_statistics": { "algorithm": [ { "estimator": "IncrementalBasicStatistics", "library": "sklearnex.basic_statistics", - "num_batches": {"training": 2} + "estimator_methods": {"training": "partial_fit"}, + "num_batches": {"training": 12} } ] }, @@ -72,7 +73,8 @@ { "estimator": "IncrementalLinearRegression", "library": "sklearnex.linear_model", - "num_batches": {"training": 2} + "estimator_methods": {"training": "partial_fit"}, + "num_batches": {"training": 12} } ] }, @@ -81,16 +83,18 @@ { "estimator": "IncrementalPCA", "library": "sklearnex.preview.decomposition", - "num_batches": {"training": 2} - } + "estimator_methods": {"training": "partial_fit"}, + "num_batches": {"training": 12} + } ] } }, "TEMPLATES": { - "covariance": {"SETS": ["common", "covariance", "covariance data"]}, + "basic_statistics": {"SETS": ["common", "basic_statistics", "basic_statistics data", "sklearn-ex[gpu] implementations"]}, + "covariance": {"SETS": ["common", "covariance", "covariance data", "sklearn-ex[gpu] implementations"]}, "linear_regression": { - "SETS": ["common", "linear_regression", "linear_regression data"] + "SETS": ["common", "linear_regression", "linear_regression data", "sklearn-ex[gpu] implementations"] }, - "pca": {"SETS": ["common", "pca", "pca data"]} + "pca": {"SETS": ["common", "pca", "pca data", "sklearn-ex[gpu] implementations"]} } } From 5592d315512f76d0e47008e8b442b6308470e1bd Mon Sep 17 00:00:00 2001 From: Md 
Shafiul Alam Date: Tue, 8 Oct 2024 14:34:31 -0400 Subject: [PATCH 064/110] spmd online --- configs/spmd/large_scale/spmd_for_online.json | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/configs/spmd/large_scale/spmd_for_online.json b/configs/spmd/large_scale/spmd_for_online.json index 53ac660e..7a0a08b4 100644 --- a/configs/spmd/large_scale/spmd_for_online.json +++ b/configs/spmd/large_scale/spmd_for_online.json @@ -61,7 +61,7 @@ "algorithm": [ { "estimator": "EmpiricalCovariance", - "library": "sklearnex.spmd.covariance", + "library": "sklearnex.spmd", "estimator_methods": {"training": "fit"} } ] @@ -70,16 +70,16 @@ "algorithm": [ { "estimator": "LinearRegression", - "library": "sklearnex.spmd.linear_model", + "library": "sklearnex.spmd", "estimator_methods": {"training": "fit"} } ] }, - "pca": { + "decomposition": { "algorithm": [ { "estimator": "PCA", - "library": "sklearnex.spmd.decomposition", + "library": "sklearnex.spmd", "estimator_methods": {"training": "fit"} } ] @@ -91,6 +91,6 @@ "linear_regression": { "SETS": ["linear_regression", "linear_regression data", "sklearnex spmd implementation", "large scale full one node parameters"] }, - "pca": {"SETS": ["pca", "pca data", "sklearnex spmd implementation", "large scale full one node parameters"]} + "pca": {"SETS": ["decomposition", "pca data", "sklearnex spmd implementation", "large scale full one node parameters"]} } } From c47649a1cf678747ef746213b1156daa2085ee80 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 8 Oct 2024 14:58:42 -0400 Subject: [PATCH 065/110] fix --- configs/spmd/large_scale/spmd_for_online.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/spmd/large_scale/spmd_for_online.json b/configs/spmd/large_scale/spmd_for_online.json index 7a0a08b4..2ef60f5b 100644 --- a/configs/spmd/large_scale/spmd_for_online.json +++ b/configs/spmd/large_scale/spmd_for_online.json @@ -75,12 +75,12 @@ } ] }, - "decomposition": { + "pca": { "algorithm": [ { "estimator": "PCA", "library": "sklearnex.spmd", - "estimator_methods": {"training": "fit"} + "estimator_methods": {"training": "fit", "inference": ""} } ] } @@ -91,6 +91,6 @@ "linear_regression": { "SETS": ["linear_regression", "linear_regression data", "sklearnex spmd implementation", "large scale full one node parameters"] }, - "pca": {"SETS": ["decomposition", "pca data", "sklearnex spmd implementation", "large scale full one node parameters"]} + "pca": {"SETS": ["pca", "pca data", "sklearnex spmd implementation", "large scale full one node parameters"]} } } From 687178b5d8b4af725aef323ef06c2cfcf61089e0 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 10 Oct 2024 03:02:25 -0400 Subject: [PATCH 066/110] incremental spmd --- .../large_scale/incremental/basic_stats.json | 31 +++++++++++++++++++ .../large_scale/incremental/covariance.json | 31 +++++++++++++++++++ .../large_scale/incremental/linear_model.json | 28 +++++++++++++++++ configs/spmd/large_scale/incremental/pca.json | 31 +++++++++++++++++++ 4 files changed, 121 insertions(+) create mode 100644 configs/spmd/large_scale/incremental/basic_stats.json create mode 100644 configs/spmd/large_scale/incremental/covariance.json create mode 100644 configs/spmd/large_scale/incremental/linear_model.json create mode 100644 configs/spmd/large_scale/incremental/pca.json diff --git a/configs/spmd/large_scale/incremental/basic_stats.json b/configs/spmd/large_scale/incremental/basic_stats.json new file mode 100644 index 00000000..88ad4c8d --- /dev/null +++ 
b/configs/spmd/large_scale/incremental/basic_stats.json @@ -0,0 +1,31 @@ +{ + "INCLUDE": ["../../../common/sklearn.json", "../../../spmd/stats_covariance.json", "../large_scale.json"], + "PARAMETERS_SETS": { + "spmd basicstats parameters": { + "algorithm": { + "estimator": "IncrementalBasicStatistics", + "estimator_methods": { "training": "fit" }, + "num_batches": {"training": 10} + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "basicstats": { + "SETS": [ + "sklearnex spmd implementation", + "large scale 64 parameters", + "synthetic data", + "spmd basicstats parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/incremental/covariance.json b/configs/spmd/large_scale/incremental/covariance.json new file mode 100644 index 00000000..06c8e4ca --- /dev/null +++ b/configs/spmd/large_scale/incremental/covariance.json @@ -0,0 +1,31 @@ +{ + "INCLUDE": ["../../../common/sklearn.json", "../../../spmd/stats_covariance.json", "../large_scale.json"], + "PARAMETERS_SETS": { + "spmd covariance parameters": { + "algorithm": { + "estimator": "IncrementalEmpiricalCovariance", + "estimator_methods": { "training": "fit" }, + "num_batches": {"training": 10} + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "covariance": { + "SETS": [ + "sklearnex spmd implementation", + "large scale 64 parameters", + "synthetic data", + "spmd covariance parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/incremental/linear_model.json b/configs/spmd/large_scale/incremental/linear_model.json new file mode 100644 index 00000000..19882482 --- /dev/null +++ b/configs/spmd/large_scale/incremental/linear_model.json @@ -0,0 +1,28 @@ +{ + "INCLUDE": ["../../../common/sklearn.json", "../../../regular/linear_model.json", "../large_scale.json"], + "PARAMETERS_SETS": { + "spmd linear parameters": { + "algorithm": { + "estimator": "IncrementalLinearRegression", + "estimator_methods": { "training": "fit" }, + "num_batches": {"training": 10} + } + }, + "synthetic data": { + "data": [ + { "source": "make_regression", "generation_kwargs": { "n_samples": 30005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 30000000, "test_size": 5000 } }, + { "source": "make_regression", "generation_kwargs": { "n_samples": 305000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 300000, "test_size": 5000 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale 64 parameters", + "synthetic data", + "spmd linear parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/incremental/pca.json b/configs/spmd/large_scale/incremental/pca.json new file mode 100644 index 00000000..f1a264ea --- /dev/null +++ b/configs/spmd/large_scale/incremental/pca.json @@ -0,0 +1,31 @@ +{ + "INCLUDE": ["../../../common/sklearn.json", "../../../regular/pca.json", "../large_scale.json"], + "PARAMETERS_SETS": { + "spmd pca parameters": { + "algorithm": { + "estimator": 
"IncrementalPCA", + "estimator_methods": { "training": "fit", "inference": "" }, + "num_batches": {"training": 10} + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale 64 parameters", + "synthetic data", + "spmd pca parameters" + ] + } + } +} From 5c97aed7f9c3e09e6c7679bd1faf8c24f392e052 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 10 Oct 2024 03:11:37 -0400 Subject: [PATCH 067/110] incremental spmd test --- configs/spmd/large_scale/incremental/basic_stats.json | 2 +- configs/spmd/large_scale/incremental/covariance.json | 2 +- configs/spmd/large_scale/incremental/linear_model.json | 2 +- configs/spmd/large_scale/incremental/pca.json | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/spmd/large_scale/incremental/basic_stats.json b/configs/spmd/large_scale/incremental/basic_stats.json index 88ad4c8d..deb03126 100644 --- a/configs/spmd/large_scale/incremental/basic_stats.json +++ b/configs/spmd/large_scale/incremental/basic_stats.json @@ -22,7 +22,7 @@ "basicstats": { "SETS": [ "sklearnex spmd implementation", - "large scale 64 parameters", + "large scale two nodes parameters", "synthetic data", "spmd basicstats parameters" ] diff --git a/configs/spmd/large_scale/incremental/covariance.json b/configs/spmd/large_scale/incremental/covariance.json index 06c8e4ca..f9c062b9 100644 --- a/configs/spmd/large_scale/incremental/covariance.json +++ b/configs/spmd/large_scale/incremental/covariance.json @@ -22,7 +22,7 @@ "covariance": { "SETS": [ "sklearnex spmd implementation", - "large scale 64 parameters", + "large scale two nodes parameters", "synthetic data", "spmd covariance parameters" ] diff --git a/configs/spmd/large_scale/incremental/linear_model.json b/configs/spmd/large_scale/incremental/linear_model.json index 19882482..593f25ed 100644 --- a/configs/spmd/large_scale/incremental/linear_model.json +++ b/configs/spmd/large_scale/incremental/linear_model.json @@ -19,7 +19,7 @@ "linreg": { "SETS": [ "sklearnex spmd implementation", - "large scale 64 parameters", + "large scale two nodes parameters", "synthetic data", "spmd linear parameters" ] diff --git a/configs/spmd/large_scale/incremental/pca.json b/configs/spmd/large_scale/incremental/pca.json index f1a264ea..31bc9ec9 100644 --- a/configs/spmd/large_scale/incremental/pca.json +++ b/configs/spmd/large_scale/incremental/pca.json @@ -22,7 +22,7 @@ "linreg": { "SETS": [ "sklearnex spmd implementation", - "large scale 64 parameters", + "large scale two nodes parameters", "synthetic data", "spmd pca parameters" ] From 907b35a4d67ad790fffe55c13eb915e5442f61ef Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 10 Oct 2024 03:49:35 -0400 Subject: [PATCH 068/110] incremental spmd --- configs/spmd/large_scale/incremental/basic_stats.json | 3 +-- configs/spmd/large_scale/incremental/covariance.json | 3 +-- configs/spmd/large_scale/incremental/linear_model.json | 3 +-- configs/spmd/large_scale/incremental/pca.json | 3 +-- configs/spmd/large_scale/large_scale.json | 9 +++++++++ 5 files changed, 13 insertions(+), 8 deletions(-) diff --git a/configs/spmd/large_scale/incremental/basic_stats.json 
b/configs/spmd/large_scale/incremental/basic_stats.json index deb03126..ca9e3eb9 100644 --- a/configs/spmd/large_scale/incremental/basic_stats.json +++ b/configs/spmd/large_scale/incremental/basic_stats.json @@ -13,7 +13,6 @@ }, "synthetic data": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } ] } @@ -22,7 +21,7 @@ "basicstats": { "SETS": [ "sklearnex spmd implementation", - "large scale two nodes parameters", + "large scale 32 parameters", "synthetic data", "spmd basicstats parameters" ] diff --git a/configs/spmd/large_scale/incremental/covariance.json b/configs/spmd/large_scale/incremental/covariance.json index f9c062b9..04fcd76b 100644 --- a/configs/spmd/large_scale/incremental/covariance.json +++ b/configs/spmd/large_scale/incremental/covariance.json @@ -13,7 +13,6 @@ }, "synthetic data": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } ] } @@ -22,7 +21,7 @@ "covariance": { "SETS": [ "sklearnex spmd implementation", - "large scale two nodes parameters", + "large scale 32 parameters", "synthetic data", "spmd covariance parameters" ] diff --git a/configs/spmd/large_scale/incremental/linear_model.json b/configs/spmd/large_scale/incremental/linear_model.json index 593f25ed..a483f613 100644 --- a/configs/spmd/large_scale/incremental/linear_model.json +++ b/configs/spmd/large_scale/incremental/linear_model.json @@ -10,7 +10,6 @@ }, "synthetic data": { "data": [ - { "source": "make_regression", "generation_kwargs": { "n_samples": 30005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 30000000, "test_size": 5000 } }, { "source": "make_regression", "generation_kwargs": { "n_samples": 305000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 300000, "test_size": 5000 } } ] } @@ -19,7 +18,7 @@ "linreg": { "SETS": [ "sklearnex spmd implementation", - "large scale two nodes parameters", + "large scale 32 parameters", "synthetic data", "spmd linear parameters" ] diff --git a/configs/spmd/large_scale/incremental/pca.json b/configs/spmd/large_scale/incremental/pca.json index 31bc9ec9..11fa5125 100644 --- a/configs/spmd/large_scale/incremental/pca.json +++ b/configs/spmd/large_scale/incremental/pca.json @@ -13,7 +13,6 @@ }, "synthetic data": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } ] } @@ -22,7 +21,7 @@ "linreg": { "SETS": [ "sklearnex spmd implementation", - "large scale two nodes parameters", + "large scale 32 parameters", "synthetic data", "spmd pca parameters" ] diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json index 55a70fbf..7fd10353 100644 --- a/configs/spmd/large_scale/large_scale.json +++ b/configs/spmd/large_scale/large_scale.json @@ -72,6 +72,15 @@ "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, + "large scale 32 
parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, "large scale 128 to 2k parameters": { "data": { "dtype": "float64", From 7ed023524a2948ac895cc6febb0e06e784da36d6 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 10 Oct 2024 03:52:21 -0400 Subject: [PATCH 069/110] incremental spmd --- sklbench/utils/logger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklbench/utils/logger.py b/sklbench/utils/logger.py index 90940630..250c5fa6 100644 --- a/sklbench/utils/logger.py +++ b/sklbench/utils/logger.py @@ -19,7 +19,7 @@ logger = logging.Logger("sklbench") logging_channel = logging.StreamHandler() -logging_formatter = logging.Formatter("%(levelname)s:%(name)s: %(message)s") +logging_formatter = logging.Formatter("%(asctime)s - %(levelname)s:%(name)s: %(message)s") logging_channel.setFormatter(logging_formatter) logger.addHandler(logging_channel) From e68edd5389c2cb8302a126f6d41a326e7ab66d3b Mon Sep 17 00:00:00 2001 From: ethanglaser Date: Tue, 15 Oct 2024 23:41:37 +0000 Subject: [PATCH 070/110] configs nearly finalized + minor job updates --- configs/spmd/large_scale/basic_stats.json | 2 +- .../spmd/large_scale/basic_stats_single.json | 30 ----- .../spmd/large_scale/basic_stats_strong.json | 2 +- configs/spmd/large_scale/covariance.json | 2 +- .../spmd/large_scale/covariance_strong.json | 2 +- configs/spmd/large_scale/dbscan.json | 7 +- configs/spmd/large_scale/dbscan_strong.json | 7 +- configs/spmd/large_scale/forest.json | 9 +- configs/spmd/large_scale/forest_reg.json | 27 ----- configs/spmd/large_scale/forest_strong.json | 7 +- .../spmd/large_scale/forest_strong_reg.json | 27 ----- configs/spmd/large_scale/kmeans.json | 6 +- configs/spmd/large_scale/kmeans_strong.json | 4 +- configs/spmd/large_scale/kmeans_strong_2.json | 31 ------ configs/spmd/large_scale/knn.json | 6 +- configs/spmd/large_scale/knn_strong.json | 8 +- configs/spmd/large_scale/large_scale.json | 105 +----------------- .../{linear_model.json => linreg.json} | 2 +- ...r_model_strong.json => linreg_strong.json} | 2 +- configs/spmd/large_scale/logreg.json | 6 +- configs/spmd/large_scale/logreg_2.json | 29 ----- configs/spmd/large_scale/logreg_strong.json | 7 +- configs/spmd/large_scale/logreg_strong_2.json | 28 ----- configs/spmd/large_scale/pca.json | 2 +- configs/spmd/large_scale/pca_single.json | 30 ----- configs/spmd/large_scale/pca_strong.json | 2 +- sklbench/benchmarks/sklearn_estimator.py | 12 +- sklbench/datasets/common.py | 4 +- 28 files changed, 59 insertions(+), 347 deletions(-) delete mode 100644 configs/spmd/large_scale/basic_stats_single.json delete mode 100644 configs/spmd/large_scale/forest_reg.json delete mode 100644 configs/spmd/large_scale/forest_strong_reg.json delete mode 100644 configs/spmd/large_scale/kmeans_strong_2.json rename configs/spmd/large_scale/{linear_model.json => linreg.json} (90%) rename configs/spmd/large_scale/{linear_model_strong.json => linreg_strong.json} (88%) delete mode 100644 configs/spmd/large_scale/logreg_2.json delete mode 100644 configs/spmd/large_scale/logreg_strong_2.json delete mode 100644 configs/spmd/large_scale/pca_single.json diff --git a/configs/spmd/large_scale/basic_stats.json 
b/configs/spmd/large_scale/basic_stats.json index 9ac4725f..d6c2c4d2 100644 --- a/configs/spmd/large_scale/basic_stats.json +++ b/configs/spmd/large_scale/basic_stats.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd basicstats parameters": { "algorithm": { diff --git a/configs/spmd/large_scale/basic_stats_single.json b/configs/spmd/large_scale/basic_stats_single.json deleted file mode 100644 index 832bd3b2..00000000 --- a/configs/spmd/large_scale/basic_stats_single.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], - "PARAMETERS_SETS": { - "spmd basicstats parameters": { - "algorithm": { - "estimator": "BasicStatistics", - "estimator_methods": { "training": "fit" } - }, - "data": { - "split_kwargs": { "test_size": 0.0001 } - } - }, - "synthetic data": { - "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, - { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } - ] - } - }, - "TEMPLATES": { - "basicstats": { - "SETS": [ - "sklearnex spmd implementation", - "large scale one node parameters", - "synthetic data", - "spmd basicstats parameters" - ] - } - } -} diff --git a/configs/spmd/large_scale/basic_stats_strong.json b/configs/spmd/large_scale/basic_stats_strong.json index b7aa22cb..b5b0ef69 100644 --- a/configs/spmd/large_scale/basic_stats_strong.json +++ b/configs/spmd/large_scale/basic_stats_strong.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd basicstats parameters": { "algorithm": { diff --git a/configs/spmd/large_scale/covariance.json b/configs/spmd/large_scale/covariance.json index 260befd0..20da8d15 100644 --- a/configs/spmd/large_scale/covariance.json +++ b/configs/spmd/large_scale/covariance.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd basicstats parameters": { "algorithm": { diff --git a/configs/spmd/large_scale/covariance_strong.json b/configs/spmd/large_scale/covariance_strong.json index 568b4a8f..b8424d92 100644 --- a/configs/spmd/large_scale/covariance_strong.json +++ b/configs/spmd/large_scale/covariance_strong.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd basicstats parameters": { "algorithm": { diff --git a/configs/spmd/large_scale/dbscan.json b/configs/spmd/large_scale/dbscan.json index 0660e869..61b0521e 100644 --- a/configs/spmd/large_scale/dbscan.json +++ b/configs/spmd/large_scale/dbscan.json @@ -6,6 +6,9 @@ "estimator": "DBSCAN", "estimator_methods": { "training": "fit" + }, + "estimator_params" : { + "eps": 10, "min_samples": 5 } }, "data": { @@ -14,7 +17,7 @@ }, "synthetic dataset": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 100, "centers": 10 }, "algorithm": { "eps": 5, "min_samples": 5 } } + { "source": "make_blobs", "generation_kwargs": { 
"n_samples": 100000, "n_features": 100, "centers": 10 } } ] } }, @@ -24,7 +27,7 @@ "common dbscan parameters", "synthetic dataset", "sklearnex spmd implementation", - "large scale default parameters", + "large scale <64 parameters", "spmd dbscan parameters" ] } diff --git a/configs/spmd/large_scale/dbscan_strong.json b/configs/spmd/large_scale/dbscan_strong.json index e591316e..24ea7cfc 100644 --- a/configs/spmd/large_scale/dbscan_strong.json +++ b/configs/spmd/large_scale/dbscan_strong.json @@ -6,7 +6,10 @@ "estimator": "DBSCAN", "estimator_methods": { "training": "fit" - } + }, + "estimator_params" : { + "eps": 10, "min_samples": 5 + } }, "data": { "dtype": "float64" @@ -14,7 +17,7 @@ }, "synthetic dataset": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 500000, "n_features": 100, "centers": 10 }, "algorithm": { "eps": 5, "min_samples": 5 } } + { "source": "make_blobs", "generation_kwargs": { "n_samples": 500000, "n_features": 100, "centers": 10 } } ] } }, diff --git a/configs/spmd/large_scale/forest.json b/configs/spmd/large_scale/forest.json index 2d9dfde9..b4402442 100644 --- a/configs/spmd/large_scale/forest.json +++ b/configs/spmd/large_scale/forest.json @@ -1,16 +1,17 @@ { - "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd forest classification parameters": { "algorithm": { "estimator": "RandomForestClassifier", - "estimator_methods": { "training": "fit" } + "estimator_methods": { "training": "fit" }, + "estimator_params": { "n_estimators": 20, "max_depth": 4 } } }, "synthetic data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 501000, "n_features": 10, "n_classes": 2 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } } }, - { "source": "make_classification", "split_kwargs": { "train_size": 10000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 11000, "n_features": 1000, "n_classes": 2 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } } } + { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 501000, "n_features": 10, "n_classes": 2 } }, + { "source": "make_classification", "split_kwargs": { "train_size": 10000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 11000, "n_features": 1000, "n_classes": 2 } } ] } }, diff --git a/configs/spmd/large_scale/forest_reg.json b/configs/spmd/large_scale/forest_reg.json deleted file mode 100644 index a5ec73cd..00000000 --- a/configs/spmd/large_scale/forest_reg.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], - "PARAMETERS_SETS": { - "spmd forest regression parameters": { - "algorithm": { - "estimator": "RandomForestRegressor" - } - }, - "synthetic data": { - "data": [ - { "source": "make_regression", "generation_kwargs": { "n_samples": 501000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } }}, - { "source": "make_regression", "generation_kwargs": { "n_samples": 11000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 10000, "test_size": 1000 }, "algorithm": { "estimator_params": { "n_estimators": 20, 
"max_depth": 4 } }} - - ] - } - }, - "TEMPLATES": { - "forestReg": { - "SETS": [ - "sklearnex spmd implementation", - "large scale 2k parameters", - "synthetic data", - "spmd forest regression parameters" - ] - } - } -} diff --git a/configs/spmd/large_scale/forest_strong.json b/configs/spmd/large_scale/forest_strong.json index 17ca8c51..23b982f5 100644 --- a/configs/spmd/large_scale/forest_strong.json +++ b/configs/spmd/large_scale/forest_strong.json @@ -1,15 +1,16 @@ { - "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd forest classification parameters": { "algorithm": { "estimator": "RandomForestClassifier", - "estimator_methods": { "training": "fit" } + "estimator_methods": { "training": "fit" }, + "estimator_params": { "n_estimators": 20, "max_depth": 4 } } }, "synthetic data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 10000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 10001000, "n_features": 100, "n_classes": 2 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } } } + { "source": "make_classification", "split_kwargs": { "train_size": 10000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 10001000, "n_features": 100, "n_classes": 2 } } ] } }, diff --git a/configs/spmd/large_scale/forest_strong_reg.json b/configs/spmd/large_scale/forest_strong_reg.json deleted file mode 100644 index 305e729b..00000000 --- a/configs/spmd/large_scale/forest_strong_reg.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], - "PARAMETERS_SETS": { - "spmd forest regression parameters": { - "algorithm": { - "estimator": "RandomForestRegressor" - } - }, - "synthetic data": { - "data": [ - { "source": "make_regression", "generation_kwargs": { "n_samples": 1000000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 900000, "test_size": 10000 }, "algorithm": { "estimator_params": { "n_estimators": 5, "max_depth": 4 } }}, - { "source": "make_regression", "generation_kwargs": { "n_samples": 100000, "n_features": 100, "noise": 1.25 }, "split_kwargs": { "train_size": 90000, "test_size": 10000 }, "algorithm": { "estimator_params": { "n_estimators": 10, "max_depth": 4 } }} - - ] - } - }, - "TEMPLATES": { - "forestReg": { - "SETS": [ - "sklearnex spmd implementation", - "large scale strong 32 parameters", - "synthetic data", - "spmd forest regression parameters" - ] - } - } -} diff --git a/configs/spmd/large_scale/kmeans.json b/configs/spmd/large_scale/kmeans.json index c77d22bc..1140823d 100644 --- a/configs/spmd/large_scale/kmeans.json +++ b/configs/spmd/large_scale/kmeans.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../../regular/kmeans.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd kmeans parameters": { "algorithm": { @@ -12,8 +12,8 @@ }, "synthetic data": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 3750000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, - { "source": "make_blobs", "generation_kwargs": { "n_samples": 18750, "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } } + { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, 
"algorithm": { "n_clusters": 10, "max_iter": 10 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 30000, "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } } ] } }, diff --git a/configs/spmd/large_scale/kmeans_strong.json b/configs/spmd/large_scale/kmeans_strong.json index 6f095af0..6277745b 100644 --- a/configs/spmd/large_scale/kmeans_strong.json +++ b/configs/spmd/large_scale/kmeans_strong.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../../regular/kmeans.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd kmeans parameters": { "algorithm": { @@ -23,7 +23,7 @@ "SETS": [ "synthetic data", "sklearnex spmd implementation", - "large scale strong 32 parameters", + "large scale strong <64 parameters", "spmd kmeans parameters" ] } diff --git a/configs/spmd/large_scale/kmeans_strong_2.json b/configs/spmd/large_scale/kmeans_strong_2.json deleted file mode 100644 index 03f2bc59..00000000 --- a/configs/spmd/large_scale/kmeans_strong_2.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "INCLUDE": ["../../common/sklearn.json", "../../regular/kmeans.json", "large_scale.json"], - "PARAMETERS_SETS": { - "spmd kmeans parameters": { - "algorithm": { - "estimator": "KMeans", - "estimator_params": { - "algorithm": "lloyd" - }, - "estimator_methods": { "training": "fit", "inference": "predict" } - } - }, - "synthetic data": { - "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, - { "source": "make_blobs", "generation_kwargs": { "n_samples": 30000, "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, - { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 100, "centers": 100 }, "algorithm": { "n_clusters": 100, "max_iter": 100 } } - ] - } - }, - "TEMPLATES": { - "kmeans": { - "SETS": [ - "synthetic data", - "sklearnex spmd implementation", - "large scale strong two nodes parameters", - "spmd kmeans parameters" - ] - } - } -} diff --git a/configs/spmd/large_scale/knn.json b/configs/spmd/large_scale/knn.json index f1e0678d..b68b94af 100644 --- a/configs/spmd/large_scale/knn.json +++ b/configs/spmd/large_scale/knn.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../../regular/knn.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd knn cls parameters": { "algorithm": { @@ -19,15 +19,13 @@ }, "synthetic classification data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 5000 }, "generation_kwargs": { "n_samples": 55000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, - { "source": "make_classification", "split_kwargs": { "train_size": 5000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 55000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 5000 }, "generation_kwargs": { "n_samples": 5005000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } ] } }, "TEMPLATES": { "knn classifier": { "SETS": [ - "common knn parameters", "synthetic classification data", "sklearnex spmd implementation", "large scale 2k parameters", diff --git a/configs/spmd/large_scale/knn_strong.json 
b/configs/spmd/large_scale/knn_strong.json index 67398123..7fe862dd 100644 --- a/configs/spmd/large_scale/knn_strong.json +++ b/configs/spmd/large_scale/knn_strong.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../../regular/knn.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd knn cls parameters": { "algorithm": { @@ -19,18 +19,16 @@ }, "synthetic classification data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 5000 }, "generation_kwargs": { "n_samples": 505000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, - { "source": "make_classification", "split_kwargs": { "train_size": 5000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 505000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000000 }, "generation_kwargs": { "n_samples": 1500000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } ] } }, "TEMPLATES": { "knn classifier": { "SETS": [ - "common knn parameters", "synthetic classification data", "sklearnex spmd implementation", - "large scale strong 32 parameters", + "large scale strong <64 parameters", "spmd knn cls parameters" ] } diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json index 832259a0..7e523984 100644 --- a/configs/spmd/large_scale/large_scale.json +++ b/configs/spmd/large_scale/large_scale.json @@ -18,49 +18,22 @@ "mpi_params": {"n": [1,2,6,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, - "large scale one node parameters": { - "data": { - "dtype": "float64", - "distributed_split": "None" - }, - "bench": { - "mpi_params": {"n": [1,2,3,4,5,6,7,8,9,10,11,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, - "large scale strong one node parameters": { - "data": { - "dtype": "float64", - "distributed_split": "rank_based" - }, - "bench": { - "mpi_params": {"n": [1,2,3,4,5,6,7,8,9,10,11,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, - "large scale full one node parameters": { - "data": { - "dtype": "float64", - "distributed_split": "None" - }, - "bench": { - "mpi_params": {"n": [12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, - "large scale strong full one node parameters": { + "large scale 2k parameters": { "data": { "dtype": "float64", - "distributed_split": "rank_based" + "distributed_split": "None" }, "bench": { - "mpi_params": {"n": [12], "ppn": 12, "-hostfile": "", 
"-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, - "large scale 2k parameters": { + "large scale 32 parameters": { "data": { "dtype": "float64", "distributed_split": "None" }, "bench": { - "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, "large scale <64 parameters": { @@ -82,65 +55,6 @@ } }, - "large scale 128 parameters": { - "data": { - "dtype": "float64", - "distributed_split": "None" - }, - "bench": { - "mpi_params": {"n": [1536], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, - - "large scale 256 parameters": { - "data": { - "dtype": "float64", - "distributed_split": "None" - }, - "bench": { - "mpi_params": {"n": [3072], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, - - "large scale 512 parameters": { - "data": { - "dtype": "float64", - "distributed_split": "None" - }, - "bench": { - "mpi_params": {"n": [6144], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, - - "large scale 1024 parameters": { - "data": { - "dtype": "float64", - "distributed_split": "None" - }, - "bench": { - "mpi_params": {"n": [12288], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, - - "large scale 2048 parameters": { - "data": { - "dtype": "float64", - "distributed_split": "None" - }, - "bench": { - "mpi_params": {"n": [24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, - - "large scale two nodes parameters": { - "data": { - "dtype": "float64", - "distributed_split": "None" - }, - "bench": { - "mpi_params": {"n": [24], "ppn": 12, "-hostfile": "", 
"-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, "large scale strong 2k parameters": { "data": { "dtype": "float64", @@ -159,15 +73,6 @@ "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, - "large scale strong two nodes parameters": { - "data": { - "dtype": "float64", - "distributed_split": "rank_based" - }, - "bench": { - "mpi_params": {"n": [24], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, "large scale impi parameters": { "data": { "dtype": "float64", diff --git a/configs/spmd/large_scale/linear_model.json b/configs/spmd/large_scale/linreg.json similarity index 90% rename from configs/spmd/large_scale/linear_model.json rename to configs/spmd/large_scale/linreg.json index f9d17b5b..ea45a52c 100644 --- a/configs/spmd/large_scale/linear_model.json +++ b/configs/spmd/large_scale/linreg.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../../regular/linear_model.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd linear parameters": { "algorithm": { diff --git a/configs/spmd/large_scale/linear_model_strong.json b/configs/spmd/large_scale/linreg_strong.json similarity index 88% rename from configs/spmd/large_scale/linear_model_strong.json rename to configs/spmd/large_scale/linreg_strong.json index 77a9c79e..629bf544 100644 --- a/configs/spmd/large_scale/linear_model_strong.json +++ b/configs/spmd/large_scale/linreg_strong.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../../regular/linear_model.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd linear parameters": { "algorithm": { diff --git a/configs/spmd/large_scale/logreg.json b/configs/spmd/large_scale/logreg.json index c5ef6203..326f2580 100644 --- a/configs/spmd/large_scale/logreg.json +++ b/configs/spmd/large_scale/logreg.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../../regular/logreg.json", "../logreg.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "../logreg.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd logreg2 parameters": { "algorithm":{ @@ -11,12 +11,12 @@ "synthetic data": { "data": [ { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 10, "n_classes": 2 } }, - { "source": "make_classification", "split_kwargs": { "train_size": 100000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 101000, "n_features": 1000, "n_classes": 2 } } + { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 501000, "n_features": 1000, "n_classes": 2, "n_informative": 40, "n_clusters_per_class": 3, "flip_y": 0.05 } } ] } }, "TEMPLATES": { - "linreg": { + "logreg": { "SETS": [ "sklearnex spmd implementation", "large scale 
2k parameters", diff --git a/configs/spmd/large_scale/logreg_2.json b/configs/spmd/large_scale/logreg_2.json deleted file mode 100644 index 796eb8ad..00000000 --- a/configs/spmd/large_scale/logreg_2.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "INCLUDE": ["../../common/sklearn.json", "../../regular/logreg.json", "../logreg.json", "large_scale.json"], - "PARAMETERS_SETS": { - "spmd logreg2 parameters": { - "algorithm":{ - "estimator": "LogisticRegression", - "estimator_methods": { "inference": "predict" }, - "estimator_params": { "max_iter": 20 } - } - }, - "synthetic data": { - "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 10, "n_classes": 2 } }, - { "source": "make_classification", "split_kwargs": { "train_size": 100000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 101000, "n_features": 1000, "n_classes": 2 } } - ] - } - }, - "TEMPLATES": { - "linreg": { - "SETS": [ - "sklearnex spmd implementation", - "large scale two nodes parameters", - "spmd logreg parameters", - "synthetic data", - "spmd logreg2 parameters" - ] - } - } -} diff --git a/configs/spmd/large_scale/logreg_strong.json b/configs/spmd/large_scale/logreg_strong.json index 8787f6b6..0b79ba9d 100644 --- a/configs/spmd/large_scale/logreg_strong.json +++ b/configs/spmd/large_scale/logreg_strong.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../../regular/logreg.json", "../logreg.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "../logreg.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd logreg2 parameters": { "algorithm":{ @@ -10,12 +10,13 @@ }, "synthetic data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 10000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 10001000, "n_features": 100, "n_classes": 2 } } + { "source": "make_classification", "split_kwargs": { "train_size": 10000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 10001000, "n_features": 100, "n_classes": 2 } }, + { "source": "make_classification", "split_kwargs": { "train_size": 12000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 12001000, "n_features": 200, "n_classes": 2, "n_informative": 40, "n_clusters_per_class": 3, "flip_y": 0.05 } } ] } }, "TEMPLATES": { - "linreg": { + "logreg": { "SETS": [ "sklearnex spmd implementation", "large scale strong 2k parameters", diff --git a/configs/spmd/large_scale/logreg_strong_2.json b/configs/spmd/large_scale/logreg_strong_2.json deleted file mode 100644 index 998e3bb7..00000000 --- a/configs/spmd/large_scale/logreg_strong_2.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "INCLUDE": ["../../common/sklearn.json", "../../regular/logreg.json", "../logreg.json", "large_scale.json"], - "PARAMETERS_SETS": { - "spmd logreg2 parameters": { - "algorithm":{ - "estimator": "LogisticRegression", - "estimator_methods": { "inference": "predict" }, - "estimator_params": { "max_iter": 30 } - } - }, - "synthetic data": { - "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 10000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 10001000, "n_features": 100, "n_classes": 2 } } - ] - } - }, - "TEMPLATES": { - "linreg": { - "SETS": [ - "sklearnex spmd implementation", - "large scale strong two nodes parameters", - "spmd logreg parameters", - "synthetic data", - "spmd logreg2 parameters" - ] - } - } -} diff --git a/configs/spmd/large_scale/pca.json 
b/configs/spmd/large_scale/pca.json index 9a6a6b02..d0ee879a 100644 --- a/configs/spmd/large_scale/pca.json +++ b/configs/spmd/large_scale/pca.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../../regular/pca.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd pca parameters": { "algorithm": { diff --git a/configs/spmd/large_scale/pca_single.json b/configs/spmd/large_scale/pca_single.json deleted file mode 100644 index 07775a6a..00000000 --- a/configs/spmd/large_scale/pca_single.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "INCLUDE": ["../../common/sklearn.json", "../../regular/pca.json", "large_scale.json"], - "PARAMETERS_SETS": { - "spmd pca parameters": { - "algorithm": { - "estimator": "PCA", - "estimator_methods": { "training": "fit", "inference": "" } - }, - "data": { - "split_kwargs": { "test_size": 0.0001 } - } - }, - "synthetic data": { - "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, - { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } - ] - } - }, - "TEMPLATES": { - "linreg": { - "SETS": [ - "sklearnex spmd implementation", - "large scale one node parameters", - "synthetic data", - "spmd pca parameters" - ] - } - } -} diff --git a/configs/spmd/large_scale/pca_strong.json b/configs/spmd/large_scale/pca_strong.json index 9063c22e..3cb33e72 100644 --- a/configs/spmd/large_scale/pca_strong.json +++ b/configs/spmd/large_scale/pca_strong.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../../regular/pca.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd pca parameters": { "algorithm": { diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index b4d4f3ee..36ec40b6 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -66,8 +66,8 @@ def get_estimator(library_name: str, estimator_name: str): f"Using first {classes_map[estimator_name][0]}." 
) estimator = classes_map[estimator_name][0] - if not issubclass(estimator, BaseEstimator): - logger.info(f"{estimator} estimator is not derived from sklearn's BaseEstimator") + #if not issubclass(estimator, BaseEstimator): + # logger.info(f"{estimator} estimator is not derived from sklearn's BaseEstimator") return estimator @@ -515,7 +515,11 @@ def main(bench_case: BenchCase, filters: List[BenchCase]): estimator_params = get_bench_case_value( bench_case, "algorithm:estimator_params", dict() ) - + #logger.debug("estimator params: " + str(estimator_params)) + if "DBSCAN" in str(estimator_name): + if "min_samples" in estimator_params: + from mpi4py import MPI + estimator_params["min_samples"] = MPI.COMM_WORLD.Get_size() * estimator_params["min_samples"] # get estimator methods for measurement estimator_methods = get_estimator_methods(bench_case) @@ -551,7 +555,7 @@ def main(bench_case: BenchCase, filters: List[BenchCase]): # note: "handle" is not JSON-serializable if "handle" in estimator_params: del estimator_params["handle"] - logger.debug(f"Estimator parameters:\n{custom_format(estimator_params)}") + #logger.debug(f"Estimator parameters:\n{custom_format(estimator_params)}") result_template.update(estimator_params) data_descs = { diff --git a/sklbench/datasets/common.py b/sklbench/datasets/common.py index e7ed0160..5c6bd27a 100644 --- a/sklbench/datasets/common.py +++ b/sklbench/datasets/common.py @@ -136,11 +136,11 @@ def cache_wrapper(**kwargs): data_name = kwargs["data_name"] data_cache = kwargs["data_cache"] if len(get_filenames_by_prefix(data_cache, data_name)) > 0: - logger.info(f'Loading "{data_name}" dataset from cache files') + #logger.info(f'Loading "{data_name}" dataset from cache files') data = load_data_from_cache(data_cache, data_name) data_desc = load_data_description(data_cache, data_name) else: - logger.info(f'Loading "{data_name}" dataset from scratch') + #logger.info(f'Loading "{data_name}" dataset from scratch') data, data_desc = function(**kwargs) save_data_to_cache(data, data_cache, data_name) save_data_description(data_desc, data_cache, data_name) From e8344932c33cf07f095c6a0de33ab9fdcbe18000 Mon Sep 17 00:00:00 2001 From: ethanglaser Date: Wed, 16 Oct 2024 03:55:12 +0000 Subject: [PATCH 071/110] <= --- configs/spmd/large_scale/dbscan.json | 2 +- configs/spmd/large_scale/dbscan_strong.json | 2 +- configs/spmd/large_scale/kmeans_strong.json | 2 +- configs/spmd/large_scale/knn_strong.json | 2 +- configs/spmd/large_scale/large_scale.json | 14 ++------------ 5 files changed, 6 insertions(+), 16 deletions(-) diff --git a/configs/spmd/large_scale/dbscan.json b/configs/spmd/large_scale/dbscan.json index 61b0521e..e4996c9e 100644 --- a/configs/spmd/large_scale/dbscan.json +++ b/configs/spmd/large_scale/dbscan.json @@ -27,7 +27,7 @@ "common dbscan parameters", "synthetic dataset", "sklearnex spmd implementation", - "large scale <64 parameters", + "large scale <=64 parameters", "spmd dbscan parameters" ] } diff --git a/configs/spmd/large_scale/dbscan_strong.json b/configs/spmd/large_scale/dbscan_strong.json index 24ea7cfc..04fb9016 100644 --- a/configs/spmd/large_scale/dbscan_strong.json +++ b/configs/spmd/large_scale/dbscan_strong.json @@ -27,7 +27,7 @@ "common dbscan parameters", "synthetic dataset", "sklearnex spmd implementation", - "large scale strong <64 parameters", + "large scale strong <=64 parameters", "spmd dbscan parameters" ] } diff --git a/configs/spmd/large_scale/kmeans_strong.json b/configs/spmd/large_scale/kmeans_strong.json index 6277745b..87fb7fac 100644 --- 
a/configs/spmd/large_scale/kmeans_strong.json +++ b/configs/spmd/large_scale/kmeans_strong.json @@ -23,7 +23,7 @@ "SETS": [ "synthetic data", "sklearnex spmd implementation", - "large scale strong <64 parameters", + "large scale strong <=64 parameters", "spmd kmeans parameters" ] } diff --git a/configs/spmd/large_scale/knn_strong.json b/configs/spmd/large_scale/knn_strong.json index 7fe862dd..d202f6e4 100644 --- a/configs/spmd/large_scale/knn_strong.json +++ b/configs/spmd/large_scale/knn_strong.json @@ -28,7 +28,7 @@ "SETS": [ "synthetic classification data", "sklearnex spmd implementation", - "large scale strong <64 parameters", + "large scale strong <=64 parameters", "spmd knn cls parameters" ] } diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json index 7e523984..4e4c9d0c 100644 --- a/configs/spmd/large_scale/large_scale.json +++ b/configs/spmd/large_scale/large_scale.json @@ -36,7 +36,7 @@ "mpi_params": {"n": [1,2,6,12,24,48,96,192,384], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, - "large scale <64 parameters": { + "large scale <=64 parameters": { "data": { "dtype": "float64", "distributed_split": "None" @@ -45,16 +45,6 @@ "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, - "large scale >64 parameters": { - "data": { - "dtype": "float64", - "distributed_split": "None" - }, - "bench": { - "mpi_params": {"n": [768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, - "large scale strong 2k parameters": { "data": { "dtype": "float64", @@ -64,7 +54,7 @@ "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, - "large scale strong <64 parameters": { + "large scale strong <=64 parameters": { "data": { "dtype": "float64", "distributed_split": "rank_based" From 75f2f10e42728437ec6a32b98f76d84546c68b8b Mon Sep 17 00:00:00 2001 From: ethanglaser Date: Wed, 16 Oct 2024 03:59:40 +0000 Subject: [PATCH 072/110] lint --- sklbench/benchmarks/sklearn_estimator.py | 11 +++++++---- sklbench/datasets/common.py | 4 ++-- sklbench/utils/measurement.py | 8 +++++++- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index 36ec40b6..e57a9038 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -66,7 +66,7 @@ def get_estimator(library_name: str, estimator_name: str): f"Using first {classes_map[estimator_name][0]}." 
) estimator = classes_map[estimator_name][0] - #if not issubclass(estimator, BaseEstimator): + # if not issubclass(estimator, BaseEstimator): # logger.info(f"{estimator} estimator is not derived from sklearn's BaseEstimator") return estimator @@ -515,11 +515,14 @@ def main(bench_case: BenchCase, filters: List[BenchCase]): estimator_params = get_bench_case_value( bench_case, "algorithm:estimator_params", dict() ) - #logger.debug("estimator params: " + str(estimator_params)) + # logger.debug("estimator params: " + str(estimator_params)) if "DBSCAN" in str(estimator_name): if "min_samples" in estimator_params: from mpi4py import MPI - estimator_params["min_samples"] = MPI.COMM_WORLD.Get_size() * estimator_params["min_samples"] + + estimator_params["min_samples"] = ( + MPI.COMM_WORLD.Get_size() * estimator_params["min_samples"] + ) # get estimator methods for measurement estimator_methods = get_estimator_methods(bench_case) @@ -555,7 +558,7 @@ def main(bench_case: BenchCase, filters: List[BenchCase]): # note: "handle" is not JSON-serializable if "handle" in estimator_params: del estimator_params["handle"] - #logger.debug(f"Estimator parameters:\n{custom_format(estimator_params)}") + # logger.debug(f"Estimator parameters:\n{custom_format(estimator_params)}") result_template.update(estimator_params) data_descs = { diff --git a/sklbench/datasets/common.py b/sklbench/datasets/common.py index 5c6bd27a..28b62fe6 100644 --- a/sklbench/datasets/common.py +++ b/sklbench/datasets/common.py @@ -136,11 +136,11 @@ def cache_wrapper(**kwargs): data_name = kwargs["data_name"] data_cache = kwargs["data_cache"] if len(get_filenames_by_prefix(data_cache, data_name)) > 0: - #logger.info(f'Loading "{data_name}" dataset from cache files') + # logger.info(f'Loading "{data_name}" dataset from cache files') data = load_data_from_cache(data_cache, data_name) data_desc = load_data_description(data_cache, data_name) else: - #logger.info(f'Loading "{data_name}" dataset from scratch') + # logger.info(f'Loading "{data_name}" dataset from scratch') data, data_desc = function(**kwargs) save_data_to_cache(data, data_cache, data_name) save_data_description(data_desc, data_cache, data_name) diff --git a/sklbench/utils/measurement.py b/sklbench/utils/measurement.py index 3628813d..bfabbdc0 100644 --- a/sklbench/utils/measurement.py +++ b/sklbench/utils/measurement.py @@ -93,8 +93,14 @@ def measure_time( ) break from mpi4py import MPI + if MPI.COMM_WORLD.Get_rank() == 0: - logger.debug("iters across n runs: " + str(iters) + ", inner iters across n runs: " + str(inners)) + logger.debug( + "iters across n runs: " + + str(iters) + + ", inner iters across n runs: " + + str(inners) + ) logger.debug(times) # mean, std = box_filter(times) # if std / mean > std_mean_ratio: From fdd32d1bf9d84bdd33d9363e170353c0623d2ca4 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 16 Oct 2024 15:51:08 +0200 Subject: [PATCH 073/110] Update knn.json --- configs/regular/bf16/knn.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/regular/bf16/knn.json b/configs/regular/bf16/knn.json index c39b577e..fabf6d6d 100644 --- a/configs/regular/bf16/knn.json +++ b/configs/regular/bf16/knn.json @@ -22,7 +22,7 @@ "estimator_params": { "algorithm": "brute", "metric": "minkowski", "p": [1, 2] } }, "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 51000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + { "source": 
"make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } ] }, "synthetic regression data": { @@ -31,7 +31,7 @@ "estimator_params": { "algorithm": "brute", "metric": "minkowski", "p": [1, 2] } }, "data": [ - { "source": "make_regression", "split_kwargs": { "train_size": 50000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 51000, "n_features": 100, "noise":1.5 } } + { "source": "make_regression", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 100, "noise":1.5 } } ] } }, From 99fdb8949662fc8b3a59fced3629c71e4d51137f Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 16 Oct 2024 17:18:11 +0200 Subject: [PATCH 074/110] Update linear_model.json --- configs/regular/bf16/linear_model.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/configs/regular/bf16/linear_model.json b/configs/regular/bf16/linear_model.json index 393b2c64..b081bd68 100644 --- a/configs/regular/bf16/linear_model.json +++ b/configs/regular/bf16/linear_model.json @@ -12,7 +12,8 @@ "estimator_params": { "fit_intercept": true, "copy_X": true } }, "data": { - "dtype": ["float32"] + "dtype": ["float32"], + "order": "C", } }, "sklearn linear parameters": { From d419a01e25fb9796db68b7d7f623765d6508d893 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 17 Oct 2024 20:22:48 +0000 Subject: [PATCH 075/110] minor --- configs/regular/bf16/linear_model.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/regular/bf16/linear_model.json b/configs/regular/bf16/linear_model.json index b081bd68..23aa49c0 100644 --- a/configs/regular/bf16/linear_model.json +++ b/configs/regular/bf16/linear_model.json @@ -13,7 +13,7 @@ }, "data": { "dtype": ["float32"], - "order": "C", + "order": "C" } }, "sklearn linear parameters": { From fd59a64c6065aea8e330906967cc6cc3d4fca9b1 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Mon, 17 Mar 2025 16:16:03 -0700 Subject: [PATCH 076/110] Added updated configs. 
--- configs/spmd/kmeans_strong.json | 32 +++++++++ configs/spmd/kmeans_wide_weak.json | 34 +++++++++ configs/spmd/kmeans_wide_weak.json.backup | 34 +++++++++ configs/spmd/large_scale.json | 85 +++++++++++++++++++++++ 4 files changed, 185 insertions(+) create mode 100644 configs/spmd/kmeans_strong.json create mode 100644 configs/spmd/kmeans_wide_weak.json create mode 100644 configs/spmd/kmeans_wide_weak.json.backup create mode 100644 configs/spmd/large_scale.json diff --git a/configs/spmd/kmeans_strong.json b/configs/spmd/kmeans_strong.json new file mode 100644 index 00000000..c0028de3 --- /dev/null +++ b/configs/spmd/kmeans_strong.json @@ -0,0 +1,32 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../large_scale/large_scale.json"], + "PARAMETERS_SETS": { + "spmd kmeans parameters": { + "algorithm": { + "estimator": "KMeans", + "estimator_params": { + "algorithm": "lloyd", + "max_iter": 20, + "n_clusters": 100 + }, + "estimator_methods": { "training": "fit", "inference": "predict" }, + "sklearnex_context": { "use_raw_input": true } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 25000000, "n_features": 100, "centers": 100 }} + ] + } + }, + "TEMPLATES": { + "kmeans": { + "SETS": [ + "synthetic data", + "sklearnex spmd implementation", + "large scale strong <=64 parameters", + "spmd kmeans parameters" + ] + } + } +} diff --git a/configs/spmd/kmeans_wide_weak.json b/configs/spmd/kmeans_wide_weak.json new file mode 100644 index 00000000..56874e77 --- /dev/null +++ b/configs/spmd/kmeans_wide_weak.json @@ -0,0 +1,34 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../large_scale/large_scale.json"], + "PARAMETERS_SETS": { + "spmd kmeans parameters": { + "algorithm": { + "estimator": "KMeans", + "estimator_params": { + "algorithm": "lloyd", + "max_iter": 20, + "n_clusters": 10, + "random_state": 42 + }, + "estimator_methods": { "training": "fit", "inference": "" }, + "sklearnex_context": { "use_raw_input": true } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 1000, "centers": 2000}} + ] + } + }, + "TEMPLATES": { + "kmeans": { + "SETS": [ + "synthetic data", + "sklearnex spmd implementation", + "large scale 2k parameters sample shift", + "spmd kmeans parameters" + ] + } + } +} + diff --git a/configs/spmd/kmeans_wide_weak.json.backup b/configs/spmd/kmeans_wide_weak.json.backup new file mode 100644 index 00000000..603ee877 --- /dev/null +++ b/configs/spmd/kmeans_wide_weak.json.backup @@ -0,0 +1,34 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../large_scale/large_scale.json"], + "PARAMETERS_SETS": { + "spmd kmeans parameters": { + "algorithm": { + "estimator": "KMeans", + "estimator_params": { + "algorithm": "lloyd", + "max_iter": 20, + "n_clusters": 10, + "random_state": 42 + }, + "estimator_methods": { "training": "fit", "inference": "" }, + "sklearnex_context": { "use_raw_input": true } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 1000, "centers": 2000, "cluster_std":3.0, "center_box":1000}} + ] + } + }, + "TEMPLATES": { + "kmeans": { + "SETS": [ + "synthetic data", + "sklearnex spmd implementation", + "large scale 2k parameters sample shift", + "spmd kmeans parameters" + ] + } + } +} + diff --git a/configs/spmd/large_scale.json b/configs/spmd/large_scale.json new file mode 100644 index 00000000..8b575dbf --- /dev/null +++ b/configs/spmd/large_scale.json @@ -0,0 +1,85 
@@ +{ + "PARAMETERS_SETS": { + "large scale default parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [1,2], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale strong parameters": { + "data": { + "dtype": "float64", + "distributed_split": "rank_based" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale 2k parameters sample shift": { + "data": { + "dtype": "float64", + "distributed_split": "sample_shift" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale 2k parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale 32 parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale <=64 parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale strong 2k parameters": { + "data": { + "dtype": "float64", + "distributed_split": "rank_based" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale strong <=64 parameters": { + "data": { + "dtype": "float64", + "distributed_split": "rank_based" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale 
impi parameters": { + "data": { + "dtype": "float64", + "distributed_split": "no" + }, + "bench": { + "mpi_params": {"n": [1,2,4,6,9,12], "ppn": 12} + } + } + } +} From 985db075e277d286dd94c542d8802239b55bad8b Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Mon, 17 Mar 2025 16:27:23 -0700 Subject: [PATCH 077/110] Added shift. --- configs/spmd/kmeans_wide_weak.json | 2 +- sklbench/datasets/transformer.py | 37 ++++++++++++++++++++++++++---- 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/configs/spmd/kmeans_wide_weak.json b/configs/spmd/kmeans_wide_weak.json index 56874e77..d5fe545a 100644 --- a/configs/spmd/kmeans_wide_weak.json +++ b/configs/spmd/kmeans_wide_weak.json @@ -25,7 +25,7 @@ "SETS": [ "synthetic data", "sklearnex spmd implementation", - "large scale 2k parameters sample shift", + "large scale 2k parameters", "spmd kmeans parameters" ] } diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 86944ead..040ac2ee 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -23,7 +23,7 @@ from ..utils.bench_case import get_bench_case_value from ..utils.logger import logger - +from mpi4py import MPI def convert_data(data, dformat: str, order: str, dtype: str, device: str = None): if isinstance(data, csr_matrix) and dformat != "csr_matrix": @@ -113,8 +113,36 @@ def split_and_transform_data(bench_case, data, data_description): "KNeighbors" in get_bench_case_value(bench_case, "algorithm:estimator", "") and int(get_bench_case_value(bench_case, "bench:mpi_params:n", 1)) > 1 ) - if distributed_split == "rank_based" or knn_split_train: - from mpi4py import MPI + + if distributed_split == "sample_shift": + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + size = comm.Get_size() + + n_train = len(x_train) + n_test = len(x_test) + + train_start = 0 + train_end = n_train + test_start = 0 + test_end = n_test + + adjust_number = (math.sqrt(rank) * 0.003) + 1 + + if "y" in data: + x_train, y_train = ( + x_train[train_start:train_end] * adjust_number, + y_train[train_start:train_end], + ) + + x_test, y_test = x_test[test_start:test_end] * adjust_number, y_test[test_start:test_end] + else: + x_train = x_train[train_start:train_end] + + x_test = x_test[test_start:test_end] * adjust_number + + elif distributed_split == "rank_based" or knn_split_train: + comm = MPI.COMM_WORLD rank = comm.Get_rank() @@ -127,6 +155,7 @@ def split_and_transform_data(bench_case, data, data_description): train_end = (1 + rank) * n_train // size test_start = rank * n_test // size test_end = (1 + rank) * n_test // size + x_train_rank = x_train[train_start:train_end] if "y" in data: x_train, y_train = ( @@ -138,7 +167,7 @@ def split_and_transform_data(bench_case, data, data_description): else: x_train = x_train[train_start:train_end] if distributed_split == "rank_based": - x_test = x_test[test_start:test_end] + x_test = x_test[test_start:test_end] * adjust_number device = get_bench_case_value(bench_case, "algorithm:device", None) common_data_format = get_bench_case_value(bench_case, "data:format", "pandas") From 34a30c74177e480b2a55eb7cfc2c9a4c865e6bb7 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Mon, 17 Mar 2025 16:32:36 -0700 Subject: [PATCH 078/110] Added center box. 
--- sklbench/datasets/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklbench/datasets/__init__.py b/sklbench/datasets/__init__.py index 093875c4..27336eb4 100644 --- a/sklbench/datasets/__init__.py +++ b/sklbench/datasets/__init__.py @@ -67,6 +67,8 @@ def load_data(bench_case: BenchCase) -> Tuple[Dict, Dict]: generation_kwargs = get_bench_case_value( bench_case, "data:generation_kwargs", dict() ) + if 'center_box' in generation_kwargs: + generation_kwargs['center_box'] = (-1 * generation_kwargs['center_box'], generation_kwargs['center_box']) return load_sklearn_synthetic_data( function_name=source, input_kwargs=generation_kwargs, From d47face749f5af71599139b2080c5d4cef189a08 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Tue, 18 Mar 2025 10:21:01 -0700 Subject: [PATCH 079/110] Removed the inertia for Kmeans. --- sklbench/benchmarks/sklearn_estimator.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index e57a9038..877707af 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -191,19 +191,6 @@ def get_subset_metrics_of_estimator( } ) elif task == "clustering": - if hasattr(estimator_instance, "inertia_"): - # compute inertia manually using distances to cluster centers - # provided by KMeans.transform - metrics.update( - { - "inertia": float( - np.power( - convert_to_numpy(estimator_instance.transform(x)).min(axis=1), - 2, - ).sum() - ) - } - ) if hasattr(estimator_instance, "predict"): y_pred = convert_to_numpy(estimator_instance.predict(x)) metrics.update( From e6177916bf7375ef06bf6b8dfc2d119345f7d936 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Tue, 18 Mar 2025 10:39:00 -0700 Subject: [PATCH 080/110] fixed config locations. --- configs/spmd/kmeans_wide_weak.json.backup | 34 ------------------- .../{ => large_scale}/kmeans_wide_weak.json | 0 2 files changed, 34 deletions(-) delete mode 100644 configs/spmd/kmeans_wide_weak.json.backup rename configs/spmd/{ => large_scale}/kmeans_wide_weak.json (100%) diff --git a/configs/spmd/kmeans_wide_weak.json.backup b/configs/spmd/kmeans_wide_weak.json.backup deleted file mode 100644 index 603ee877..00000000 --- a/configs/spmd/kmeans_wide_weak.json.backup +++ /dev/null @@ -1,34 +0,0 @@ -{ - "INCLUDE": ["../../common/sklearn.json", "../large_scale/large_scale.json"], - "PARAMETERS_SETS": { - "spmd kmeans parameters": { - "algorithm": { - "estimator": "KMeans", - "estimator_params": { - "algorithm": "lloyd", - "max_iter": 20, - "n_clusters": 10, - "random_state": 42 - }, - "estimator_methods": { "training": "fit", "inference": "" }, - "sklearnex_context": { "use_raw_input": true } - } - }, - "synthetic data": { - "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 1000, "centers": 2000, "cluster_std":3.0, "center_box":1000}} - ] - } - }, - "TEMPLATES": { - "kmeans": { - "SETS": [ - "synthetic data", - "sklearnex spmd implementation", - "large scale 2k parameters sample shift", - "spmd kmeans parameters" - ] - } - } -} - diff --git a/configs/spmd/kmeans_wide_weak.json b/configs/spmd/large_scale/kmeans_wide_weak.json similarity index 100% rename from configs/spmd/kmeans_wide_weak.json rename to configs/spmd/large_scale/kmeans_wide_weak.json From 00ac46d6132a44f2124f042c31e62ae2a041cd3c Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Tue, 18 Mar 2025 10:47:28 -0700 Subject: [PATCH 081/110] Updated configs. 
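The kmeans_narrow_weak.json added below pairs a 2,000,000 x 100 make_blobs dataset with the sample-shift weak-scaling set, and kmeans_strong.json moves to a single 25,000,000 x 100 dataset. Because the weak-scaling sets do not slice the data (distributed_split is "None" or "sample_shift"), every rank holds the full generated block, so the per-rank footprint stays constant as ranks are added. A rough float64 estimate, ignoring any extra copies made during format conversion:

    BYTES_F64 = 8  # the large-scale sets pin dtype to float64

    narrow = 2_000_000 * 100 * BYTES_F64    # kmeans_narrow_weak
    wide = 1_000_000 * 1_000 * BYTES_F64    # kmeans_wide_weak
    print(f"narrow weak: {narrow / 1e9:.1f} GB/rank, wide weak: {wide / 1e9:.1f} GB/rank")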
--- .../spmd/large_scale/kmeans_narrow_weak.json | 33 +++++++++++++++++++ configs/spmd/large_scale/kmeans_strong.json | 18 +++++----- 2 files changed, 43 insertions(+), 8 deletions(-) create mode 100644 configs/spmd/large_scale/kmeans_narrow_weak.json diff --git a/configs/spmd/large_scale/kmeans_narrow_weak.json b/configs/spmd/large_scale/kmeans_narrow_weak.json new file mode 100644 index 00000000..4d8a34d1 --- /dev/null +++ b/configs/spmd/large_scale/kmeans_narrow_weak.json @@ -0,0 +1,33 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../large_scale/large_scale.json"], + "PARAMETERS_SETS": { + "spmd kmeans parameters": { + "algorithm": { + "estimator": "KMeans", + "estimator_params": { + "algorithm": "lloyd", + "max_iter": 20, + "n_clusters": 10, + "random_state": 42 + }, + "estimator_methods": { "training": "fit", "inference": "" }, + "sklearnex_context": { "use_raw_input": true } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 2000000, "n_features": 100, "centers": 2000, "cluster_std": 3, "center_box": 100.0}} + ] + } + }, + "TEMPLATES": { + "kmeans": { + "SETS": [ + "synthetic data", + "sklearnex spmd implementation", + "large scale 2k parameters sample shift", + "spmd kmeans parameters" + ] + } + } +} \ No newline at end of file diff --git a/configs/spmd/large_scale/kmeans_strong.json b/configs/spmd/large_scale/kmeans_strong.json index 87fb7fac..f61172c9 100644 --- a/configs/spmd/large_scale/kmeans_strong.json +++ b/configs/spmd/large_scale/kmeans_strong.json @@ -1,20 +1,21 @@ { - "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "../large_scale/large_scale.json"], "PARAMETERS_SETS": { "spmd kmeans parameters": { "algorithm": { "estimator": "KMeans", "estimator_params": { - "algorithm": "lloyd" + "algorithm": "lloyd", + "max_iter": 20, + "n_clusters": 100 }, - "estimator_methods": { "training": "fit", "inference": "predict" } + "estimator_methods": { "training": "fit", "inference": "predict" }, + "sklearnex_context": { "use_raw_input": true } } - }, - "synthetic data": { + }, + "synthetic data": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, - { "source": "make_blobs", "generation_kwargs": { "n_samples": 30000, "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, - { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 100, "centers": 100 }, "algorithm": { "n_clusters": 100, "max_iter": 100 } } + { "source": "make_blobs", "generation_kwargs": { "n_samples": 25000000, "n_features": 100, "centers": 100 }} ] } }, @@ -29,3 +30,4 @@ } } } + From f37f964a729c133f6bdd46f4e623316030c16fe2 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Tue, 18 Mar 2025 10:54:53 -0700 Subject: [PATCH 082/110] Moved large scale files. 
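With the copies under configs/spmd/ removed (the maintained versions live under configs/spmd/large_scale/), the two KMeans scaling modes are easier to tell apart: kmeans_strong uses distributed_split "rank_based", so the fixed 25,000,000-sample dataset is sliced across ranks (strong scaling), while the *_weak configs keep the whole block on every rank (weak scaling). A quick look at rank 0's share in the strong case, using the same integer slicing as the rank_based branch shown earlier:

    n_samples = 25_000_000  # kmeans_strong total
    for size in (1, 12, 96, 768):  # rank counts from the strong <=64 sweep
        start, end = 0 * n_samples // size, 1 * n_samples // size
        print(f"{size:4d} ranks -> {end - start:,} samples on rank 0")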
--- configs/spmd/kmeans_strong.json | 32 ------------- configs/spmd/large_scale.json | 85 --------------------------------- 2 files changed, 117 deletions(-) delete mode 100644 configs/spmd/kmeans_strong.json delete mode 100644 configs/spmd/large_scale.json diff --git a/configs/spmd/kmeans_strong.json b/configs/spmd/kmeans_strong.json deleted file mode 100644 index c0028de3..00000000 --- a/configs/spmd/kmeans_strong.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "INCLUDE": ["../../common/sklearn.json", "../large_scale/large_scale.json"], - "PARAMETERS_SETS": { - "spmd kmeans parameters": { - "algorithm": { - "estimator": "KMeans", - "estimator_params": { - "algorithm": "lloyd", - "max_iter": 20, - "n_clusters": 100 - }, - "estimator_methods": { "training": "fit", "inference": "predict" }, - "sklearnex_context": { "use_raw_input": true } - } - }, - "synthetic data": { - "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 25000000, "n_features": 100, "centers": 100 }} - ] - } - }, - "TEMPLATES": { - "kmeans": { - "SETS": [ - "synthetic data", - "sklearnex spmd implementation", - "large scale strong <=64 parameters", - "spmd kmeans parameters" - ] - } - } -} diff --git a/configs/spmd/large_scale.json b/configs/spmd/large_scale.json deleted file mode 100644 index 8b575dbf..00000000 --- a/configs/spmd/large_scale.json +++ /dev/null @@ -1,85 +0,0 @@ -{ - "PARAMETERS_SETS": { - "large scale default parameters": { - "data": { - "dtype": "float64", - "distributed_split": "None" - }, - "bench": { - "mpi_params": {"n": [1,2], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, - "large scale strong parameters": { - "data": { - "dtype": "float64", - "distributed_split": "rank_based" - }, - "bench": { - "mpi_params": {"n": [1,2,6,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, - "large scale 2k parameters sample shift": { - "data": { - "dtype": "float64", - "distributed_split": "sample_shift" - }, - "bench": { - "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, - "large scale 2k parameters": { - "data": { - "dtype": "float64", - "distributed_split": "None" - }, - "bench": { - "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, - "large scale 32 parameters": { - "data": { - "dtype": "float64", - "distributed_split": "None" - }, - "bench": { - "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": 
"--envall gpu_tile_compact.sh" } - } - }, - "large scale <=64 parameters": { - "data": { - "dtype": "float64", - "distributed_split": "None" - }, - "bench": { - "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, - "large scale strong 2k parameters": { - "data": { - "dtype": "float64", - "distributed_split": "rank_based" - }, - "bench": { - "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, - "large scale strong <=64 parameters": { - "data": { - "dtype": "float64", - "distributed_split": "rank_based" - }, - "bench": { - "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, - "large scale impi parameters": { - "data": { - "dtype": "float64", - "distributed_split": "no" - }, - "bench": { - "mpi_params": {"n": [1,2,4,6,9,12], "ppn": 12} - } - } - } -} From 1c5552b27893b27a6a695f2acda88527570b73a2 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Tue, 18 Mar 2025 10:55:53 -0700 Subject: [PATCH 083/110] Added line. --- configs/spmd/large_scale/kmeans_narrow_weak.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/spmd/large_scale/kmeans_narrow_weak.json b/configs/spmd/large_scale/kmeans_narrow_weak.json index 4d8a34d1..d6b73029 100644 --- a/configs/spmd/large_scale/kmeans_narrow_weak.json +++ b/configs/spmd/large_scale/kmeans_narrow_weak.json @@ -30,4 +30,4 @@ ] } } -} \ No newline at end of file +} From dcfef94ab2bb8c1a34b7ed7eea57e2396fe8d9eb Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Tue, 18 Mar 2025 10:58:29 -0700 Subject: [PATCH 084/110] Added large scale 2k parameters sample shift --- configs/spmd/large_scale/large_scale.json | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json index 4e4c9d0c..a1ae8a62 100644 --- a/configs/spmd/large_scale/large_scale.json +++ b/configs/spmd/large_scale/large_scale.json @@ -27,6 +27,15 @@ "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, + "large scale 2k parameters sample shift": { + "data": { + "dtype": "float64", + "distributed_split": "sample_shift" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, "large scale 32 parameters": { "data": { "dtype": "float64", From 4ba3fe43e5effba47c942ea0ebce7b16aeef2f69 Mon Sep 17 
00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Tue, 18 Mar 2025 12:09:12 -0700 Subject: [PATCH 085/110] Fixed imports. --- sklbench/datasets/transformer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 040ac2ee..894d711d 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -18,12 +18,13 @@ import numpy as np import pandas as pd +from mpi4py import MPI from scipy.sparse import csr_matrix from sklearn.model_selection import train_test_split from ..utils.bench_case import get_bench_case_value from ..utils.logger import logger -from mpi4py import MPI + def convert_data(data, dformat: str, order: str, dtype: str, device: str = None): if isinstance(data, csr_matrix) and dformat != "csr_matrix": From 5c04a35f8d0156e5a4a308c5eaa2a38aee714387 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Tue, 18 Mar 2025 12:29:03 -0700 Subject: [PATCH 086/110] Updated format. --- sklbench/datasets/__init__.py | 7 ++++-- sklbench/datasets/transformer.py | 38 +++++++++++++++++--------------- sklbench/utils/common.py | 2 +- 3 files changed, 26 insertions(+), 21 deletions(-) diff --git a/sklbench/datasets/__init__.py b/sklbench/datasets/__init__.py index 27336eb4..d4bddca1 100644 --- a/sklbench/datasets/__init__.py +++ b/sklbench/datasets/__init__.py @@ -67,8 +67,11 @@ def load_data(bench_case: BenchCase) -> Tuple[Dict, Dict]: generation_kwargs = get_bench_case_value( bench_case, "data:generation_kwargs", dict() ) - if 'center_box' in generation_kwargs: - generation_kwargs['center_box'] = (-1 * generation_kwargs['center_box'], generation_kwargs['center_box']) + if "center_box" in generation_kwargs: + generation_kwargs["center_box"] = ( + -1 * generation_kwargs["center_box"], + generation_kwargs["center_box"], + ) return load_sklearn_synthetic_data( function_name=source, input_kwargs=generation_kwargs, diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 894d711d..b386578e 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -116,34 +116,36 @@ def split_and_transform_data(bench_case, data, data_description): ) if distributed_split == "sample_shift": - comm = MPI.COMM_WORLD - rank = comm.Get_rank() - size = comm.Get_size() + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + size = comm.Get_size() - n_train = len(x_train) - n_test = len(x_test) + n_train = len(x_train) + n_test = len(x_test) - train_start = 0 - train_end = n_train - test_start = 0 - test_end = n_test + train_start = 0 + train_end = n_train + test_start = 0 + test_end = n_test - adjust_number = (math.sqrt(rank) * 0.003) + 1 + adjust_number = (math.sqrt(rank) * 0.003) + 1 - if "y" in data: + if "y" in data: x_train, y_train = ( - x_train[train_start:train_end] * adjust_number, + x_train[train_start:train_end] * adjust_number, y_train[train_start:train_end], ) - - x_test, y_test = x_test[test_start:test_end] * adjust_number, y_test[test_start:test_end] - else: + + x_test, y_test = ( + x_test[test_start:test_end] * adjust_number, + y_test[test_start:test_end], + ) + else: x_train = x_train[train_start:train_end] - + x_test = x_test[test_start:test_end] * adjust_number elif distributed_split == "rank_based" or knn_split_train: - comm = MPI.COMM_WORLD rank = comm.Get_rank() @@ -156,7 +158,7 @@ def split_and_transform_data(bench_case, data, data_description): train_end = (1 + rank) * n_train // size test_start = rank * n_test // size test_end = (1 + rank) 
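The transformer.py block being reformatted here is the "sample_shift" split added earlier: rather than slicing the generated data, every rank keeps the full block and multiplies it by 1 + 0.003 * sqrt(rank), so weak-scaling runs feed each rank slightly different values without regenerating data per rank. The same idea in isolation (plain NumPy, with the rank passed in explicitly so it runs without MPI):

    import math

    import numpy as np


    def sample_shift(x: np.ndarray, rank: int) -> np.ndarray:
        # Scale the whole block by a small rank-dependent factor, as in the patch.
        adjust = math.sqrt(rank) * 0.003 + 1
        return x * adjust


    x = np.random.default_rng(0).random((4, 3))
    print(sample_shift(x, rank=0)[0, 0])   # rank 0: factor is exactly 1
    print(sample_shift(x, rank=12)[0, 0])  # later ranks: slightly scaled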
* n_test // size - x_train_rank = x_train[train_start:train_end] + x_train_rank = x_train[train_start:train_end] if "y" in data: x_train, y_train = ( diff --git a/sklbench/utils/common.py b/sklbench/utils/common.py index 06486428..995f4b5e 100755 --- a/sklbench/utils/common.py +++ b/sklbench/utils/common.py @@ -120,7 +120,7 @@ def flatten_list(input_list: List, ensure_type_homogeneity: bool = False) -> Lis def get_module_members( - module_names_chain: Union[List, str] + module_names_chain: Union[List, str], ) -> Tuple[ModuleContentMap, ModuleContentMap]: def get_module_name(module_names_chain: List[str]) -> str: name = module_names_chain[0] From af48e968488fbfa2c4b664e7fd0ec973ce90bd4e Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Tue, 18 Mar 2025 16:48:09 -0700 Subject: [PATCH 087/110] Added the math import. --- sklbench/datasets/transformer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index b386578e..57999775 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -16,6 +16,7 @@ import os +import math import numpy as np import pandas as pd from mpi4py import MPI From c7f38f4b5b43a6dcc8920ba2d9f7e1f89e3847eb Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Thu, 20 Mar 2025 16:25:28 -0700 Subject: [PATCH 088/110] Rolled back the accidental changes to the ranked_based distributed_split. --- sklbench/datasets/transformer.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 57999775..46342b3b 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -146,8 +146,7 @@ def split_and_transform_data(bench_case, data, data_description): x_test = x_test[test_start:test_end] * adjust_number - elif distributed_split == "rank_based" or knn_split_train: - + if distributed_split == "rank_based" or knn_split_train: comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() @@ -159,7 +158,6 @@ def split_and_transform_data(bench_case, data, data_description): train_end = (1 + rank) * n_train // size test_start = rank * n_test // size test_end = (1 + rank) * n_test // size - x_train_rank = x_train[train_start:train_end] if "y" in data: x_train, y_train = ( @@ -171,7 +169,8 @@ def split_and_transform_data(bench_case, data, data_description): else: x_train = x_train[train_start:train_end] if distributed_split == "rank_based": - x_test = x_test[test_start:test_end] * adjust_number + x_test = x_test[test_start:test_end] + device = get_bench_case_value(bench_case, "algorithm:device", None) common_data_format = get_bench_case_value(bench_case, "data:format", "pandas") From 264701eae57156971a6b1a6e4c77c6e41f6dffe3 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Thu, 20 Mar 2025 16:32:23 -0700 Subject: [PATCH 089/110] Updated large scale 2k parameters for the full 24576 tiles. 
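The change below only extends the "n" sweep of the sample-shift set up to 24576 ranks; ppn stays at 12. Assuming one rank per GPU tile and 12 tiles per node, which is what the 12-entry cpu-bind list and the gpu_tile_compact.sh wrapper suggest, the top of the sweep maps to node counts as follows:

    ppn = 12  # ranks per node in the large-scale sets
    for n in (768, 1536, 3072, 6144, 12288, 24576):
        print(f"{n:5d} ranks -> {n // ppn:4d} nodes")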
--- configs/spmd/large_scale/large_scale.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json index a1ae8a62..28626dc9 100644 --- a/configs/spmd/large_scale/large_scale.json +++ b/configs/spmd/large_scale/large_scale.json @@ -33,7 +33,7 @@ "distributed_split": "sample_shift" }, "bench": { - "mpi_params": {"n": [1,2,6,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, "large scale 32 parameters": { From 20419a954e02557f791df9051e6986c4784b8df8 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Thu, 20 Mar 2025 16:40:55 -0700 Subject: [PATCH 090/110] Updated config files. --- configs/spmd/large_scale/kmeans.json | 30 ------------------- .../spmd/large_scale/kmeans_narrow_weak.json | 2 +- configs/spmd/large_scale/kmeans_strong.json | 2 +- .../spmd/large_scale/kmeans_wide_weak.json | 2 +- 4 files changed, 3 insertions(+), 33 deletions(-) delete mode 100644 configs/spmd/large_scale/kmeans.json diff --git a/configs/spmd/large_scale/kmeans.json b/configs/spmd/large_scale/kmeans.json deleted file mode 100644 index 1140823d..00000000 --- a/configs/spmd/large_scale/kmeans.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], - "PARAMETERS_SETS": { - "spmd kmeans parameters": { - "algorithm": { - "estimator": "KMeans", - "estimator_params": { - "algorithm": "lloyd" - }, - "estimator_methods": { "training": "fit", "inference": "predict" } - } - }, - "synthetic data": { - "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, - { "source": "make_blobs", "generation_kwargs": { "n_samples": 30000, "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } } - ] - } - }, - "TEMPLATES": { - "kmeans": { - "SETS": [ - "synthetic data", - "sklearnex spmd implementation", - "large scale 2k parameters", - "spmd kmeans parameters" - ] - } - } -} diff --git a/configs/spmd/large_scale/kmeans_narrow_weak.json b/configs/spmd/large_scale/kmeans_narrow_weak.json index d6b73029..523aba01 100644 --- a/configs/spmd/large_scale/kmeans_narrow_weak.json +++ b/configs/spmd/large_scale/kmeans_narrow_weak.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../large_scale/large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd kmeans parameters": { "algorithm": { diff --git a/configs/spmd/large_scale/kmeans_strong.json b/configs/spmd/large_scale/kmeans_strong.json index f61172c9..90a1ea3f 100644 --- a/configs/spmd/large_scale/kmeans_strong.json +++ b/configs/spmd/large_scale/kmeans_strong.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../large_scale/large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd kmeans parameters": { "algorithm": { diff --git 
a/configs/spmd/large_scale/kmeans_wide_weak.json b/configs/spmd/large_scale/kmeans_wide_weak.json index d5fe545a..1c588d60 100644 --- a/configs/spmd/large_scale/kmeans_wide_weak.json +++ b/configs/spmd/large_scale/kmeans_wide_weak.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../large_scale/large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd kmeans parameters": { "algorithm": { From 4e93858b24fa1b95e80947b7d6cf82e4230e6165 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Thu, 20 Mar 2025 16:46:55 -0700 Subject: [PATCH 091/110] cleaned up diff. --- sklbench/utils/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklbench/utils/common.py b/sklbench/utils/common.py index 995f4b5e..06486428 100755 --- a/sklbench/utils/common.py +++ b/sklbench/utils/common.py @@ -120,7 +120,7 @@ def flatten_list(input_list: List, ensure_type_homogeneity: bool = False) -> Lis def get_module_members( - module_names_chain: Union[List, str], + module_names_chain: Union[List, str] ) -> Tuple[ModuleContentMap, ModuleContentMap]: def get_module_name(module_names_chain: List[str]) -> str: name = module_names_chain[0] From 428f3df094feacd79eb4703e25e325fc55d232eb Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Thu, 20 Mar 2025 17:23:13 -0700 Subject: [PATCH 092/110] Reformatted correctly. --- sklbench/datasets/transformer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 46342b3b..9e00d05d 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -171,7 +171,6 @@ def split_and_transform_data(bench_case, data, data_description): if distributed_split == "rank_based": x_test = x_test[test_start:test_end] - device = get_bench_case_value(bench_case, "algorithm:device", None) common_data_format = get_bench_case_value(bench_case, "data:format", "pandas") common_data_order = get_bench_case_value(bench_case, "data:order", "F") From 2f8c68b89dd397390accbaf633c4978d38559531 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Thu, 20 Mar 2025 17:28:29 -0700 Subject: [PATCH 093/110] Fixed if else. --- sklbench/datasets/transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 9e00d05d..fa5badd9 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -146,7 +146,7 @@ def split_and_transform_data(bench_case, data, data_description): x_test = x_test[test_start:test_end] * adjust_number - if distributed_split == "rank_based" or knn_split_train: + elif distributed_split == "rank_based" or knn_split_train: comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() From 816c6dcf5111ffe58923499c823fc39f8a2c1bca Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Fri, 21 Mar 2025 09:42:19 -0700 Subject: [PATCH 094/110] Updated format. --- sklbench/datasets/transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index fa5badd9..63a636c3 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -14,9 +14,9 @@ # limitations under the License. 
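The elif restored in the previous patch matters for correctness: knn_split_train is true for any multi-rank KNeighbors case, so with two independent if blocks a sample_shift run could first scale the full block and then also take the rank_based slice, mixing both schemes; elif keeps the outcomes mutually exclusive. A stripped-down view of the intended branch structure (the function name and return strings are illustrative only):

    def choose_split(distributed_split: str, knn_split_train: bool) -> str:
        # Mirrors the branch layout in split_and_transform_data, nothing more.
        if distributed_split == "sample_shift":
            return "scale the full block by 1 + 0.003*sqrt(rank)"
        elif distributed_split == "rank_based" or knn_split_train:
            return "take this rank's contiguous slice"
        return "keep the data as is"


    print(choose_split("sample_shift", knn_split_train=True))   # only the shift applies
    print(choose_split("rank_based", knn_split_train=False))    # only the slice applies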
# =============================================================================== +import math import os -import math import numpy as np import pandas as pd from mpi4py import MPI From 5d3bf52927709502a30712ad6020678adb3f8225 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Fri, 21 Mar 2025 10:07:02 -0700 Subject: [PATCH 095/110] Added mpi4py --- envs/conda-env-rapids.yml | 1 + envs/conda-env-sklearn.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/envs/conda-env-rapids.yml b/envs/conda-env-rapids.yml index d72aa2d8..9eaa9206 100644 --- a/envs/conda-env-rapids.yml +++ b/envs/conda-env-rapids.yml @@ -19,3 +19,4 @@ dependencies: - psutil - requests - py-cpuinfo + - mpi4py \ No newline at end of file diff --git a/envs/conda-env-sklearn.yml b/envs/conda-env-sklearn.yml index bbc34463..afa7641f 100644 --- a/envs/conda-env-sklearn.yml +++ b/envs/conda-env-sklearn.yml @@ -21,3 +21,4 @@ dependencies: - psutil - requests - py-cpuinfo + - mpi4py From a937963e214bc79092cbf18b2a750959ca948e01 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Fri, 21 Mar 2025 10:44:55 -0700 Subject: [PATCH 096/110] fixed mpi4py --- envs/conda-env-rapids.yml | 2 +- envs/conda-env-sklearn.yml | 2 +- sklbench/datasets/transformer.py | 5 ++++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/envs/conda-env-rapids.yml b/envs/conda-env-rapids.yml index 9eaa9206..b43c0958 100644 --- a/envs/conda-env-rapids.yml +++ b/envs/conda-env-rapids.yml @@ -19,4 +19,4 @@ dependencies: - psutil - requests - py-cpuinfo - - mpi4py \ No newline at end of file + \ No newline at end of file diff --git a/envs/conda-env-sklearn.yml b/envs/conda-env-sklearn.yml index afa7641f..070be2cc 100644 --- a/envs/conda-env-sklearn.yml +++ b/envs/conda-env-sklearn.yml @@ -21,4 +21,4 @@ dependencies: - psutil - requests - py-cpuinfo - - mpi4py + diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 63a636c3..cd00d724 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -19,7 +19,6 @@ import numpy as np import pandas as pd -from mpi4py import MPI from scipy.sparse import csr_matrix from sklearn.model_selection import train_test_split @@ -117,6 +116,8 @@ def split_and_transform_data(bench_case, data, data_description): ) if distributed_split == "sample_shift": + from mpi4py import MPI + comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() @@ -147,6 +148,8 @@ def split_and_transform_data(bench_case, data, data_description): x_test = x_test[test_start:test_end] * adjust_number elif distributed_split == "rank_based" or knn_split_train: + from mpi4py import MPI + comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() From 3809d1760aaa77acc43299460e38f57eea70e0bb Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Fri, 21 Mar 2025 10:47:33 -0700 Subject: [PATCH 097/110] Rolled back mpi4py. 
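Dropping mpi4py from the environment files is consistent with the previous patch, which moved the import from module scope into the branches that need it: single-process runs never touch MPI, so the package no longer has to be installed everywhere. The lazy-import pattern in isolation:

    def get_rank_and_size():
        # Imported only when a distributed split (or multi-rank KNN) is requested,
        # so plain single-process runs work without mpi4py installed.
        from mpi4py import MPI

        comm = MPI.COMM_WORLD
        return comm.Get_rank(), comm.Get_size()


    if __name__ == "__main__":
        try:
            print(get_rank_and_size())
        except ImportError:
            print("mpi4py not installed; running single-process")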
--- envs/conda-env-rapids.yml | 1 - envs/conda-env-sklearn.yml | 1 - 2 files changed, 2 deletions(-) diff --git a/envs/conda-env-rapids.yml b/envs/conda-env-rapids.yml index b43c0958..d72aa2d8 100644 --- a/envs/conda-env-rapids.yml +++ b/envs/conda-env-rapids.yml @@ -19,4 +19,3 @@ dependencies: - psutil - requests - py-cpuinfo - \ No newline at end of file diff --git a/envs/conda-env-sklearn.yml b/envs/conda-env-sklearn.yml index 070be2cc..bbc34463 100644 --- a/envs/conda-env-sklearn.yml +++ b/envs/conda-env-sklearn.yml @@ -21,4 +21,3 @@ dependencies: - psutil - requests - py-cpuinfo - From c12874832788ad3ae874ac9b9b38e6f26044abe3 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Fri, 21 Mar 2025 12:53:41 -0700 Subject: [PATCH 098/110] Formatted file. --- sklbench/datasets/transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index cd00d724..ea5646ca 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -149,7 +149,7 @@ def split_and_transform_data(bench_case, data, data_description): elif distributed_split == "rank_based" or knn_split_train: from mpi4py import MPI - + comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() From 15db7929f76696956a4fe0529e8addef35f95f74 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Fri, 21 Mar 2025 14:56:49 -0700 Subject: [PATCH 099/110] Removed environment from diff. --- sklbench/report/implementation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklbench/report/implementation.py b/sklbench/report/implementation.py index 2bc3a05e..cddb45f3 100644 --- a/sklbench/report/implementation.py +++ b/sklbench/report/implementation.py @@ -100,7 +100,8 @@ "batch_size", ] -DIFFBY_COLUMNS = ["environment_name", "library", "format", "device"] +#DIFFBY_COLUMNS = ["environment_name", "library", "format", "device"] +DIFFBY_COLUMNS = ["library", "format", "device"] def geomean_wrapper(a): From 30b0b80353f19fc87108b04a08b6d089a037047c Mon Sep 17 00:00:00 2001 From: ethanglaser <42726565+ethanglaser@users.noreply.github.com> Date: Fri, 21 Mar 2025 15:00:01 -0700 Subject: [PATCH 100/110] initial alignment of configs to final results (#176) * initial alignment of configs to final results * logic updates * fix large scale CI * black * minimize incremental * black * align custom function with skl estimator * Update sklbench/utils/measurement.py * Update sklbench/utils/measurement.py * Update sklbench/utils/measurement.py * Update sklbench/utils/measurement.py --- configs/incremental.json | 100 ------------------ configs/spmd/large_scale/basic_stats.json | 15 +-- .../spmd/large_scale/basic_stats_strong.json | 13 +-- configs/spmd/large_scale/covariance.json | 9 +- .../spmd/large_scale/covariance_strong.json | 7 +- configs/spmd/large_scale/dbscan.json | 5 +- configs/spmd/large_scale/dbscan_strong.json | 15 +-- .../spmd/large_scale/forest_max_samples.json | 28 +++++ ...forest.json => forest_no_max_samples.json} | 9 +- configs/spmd/large_scale/forest_strong.json | 11 +- configs/spmd/large_scale/incremental.json | 77 ++++++++++++++ .../large_scale/incremental/basic_stats.json | 30 ------ .../large_scale/incremental/covariance.json | 30 ------ .../large_scale/incremental/linear_model.json | 27 ----- configs/spmd/large_scale/incremental/pca.json | 30 ------ configs/spmd/large_scale/knn_strong.json | 17 +-- configs/spmd/large_scale/knn_tier1.json | 35 ++++++ .../large_scale/{knn.json => knn_tier2.json} | 
19 ++-- configs/spmd/large_scale/linreg.json | 9 +- configs/spmd/large_scale/linreg_strong.json | 7 +- configs/spmd/large_scale/logreg.json | 15 +-- configs/spmd/large_scale/logreg_strong.json | 18 ++-- configs/spmd/large_scale/pca.json | 11 +- configs/spmd/large_scale/pca_strong.json | 9 +- configs/spmd/large_scale/spmd_for_online.json | 96 ----------------- .../large_scale/spmd_for_online_strong.json | 60 ----------- sklbench/benchmarks/custom_function.py | 10 +- sklbench/benchmarks/sklearn_estimator.py | 6 +- sklbench/datasets/transformer.py | 10 +- sklbench/utils/measurement.py | 26 +++-- test-configuration-linux.yml | 5 - test-configuration-win.yml | 4 - 32 files changed, 273 insertions(+), 490 deletions(-) delete mode 100644 configs/incremental.json create mode 100644 configs/spmd/large_scale/forest_max_samples.json rename configs/spmd/large_scale/{forest.json => forest_no_max_samples.json} (58%) create mode 100644 configs/spmd/large_scale/incremental.json delete mode 100644 configs/spmd/large_scale/incremental/basic_stats.json delete mode 100644 configs/spmd/large_scale/incremental/covariance.json delete mode 100644 configs/spmd/large_scale/incremental/linear_model.json delete mode 100644 configs/spmd/large_scale/incremental/pca.json create mode 100644 configs/spmd/large_scale/knn_tier1.json rename configs/spmd/large_scale/{knn.json => knn_tier2.json} (55%) delete mode 100644 configs/spmd/large_scale/spmd_for_online.json delete mode 100644 configs/spmd/large_scale/spmd_for_online_strong.json diff --git a/configs/incremental.json b/configs/incremental.json deleted file mode 100644 index e1f589a4..00000000 --- a/configs/incremental.json +++ /dev/null @@ -1,100 +0,0 @@ -{ "INCLUDE": ["./common/sklearn.json"], - "PARAMETERS_SETS": { - "common": {"bench": {"n_runs": 10, "time_limit": 60}}, - "covariance data": { - "data": [ - { - "source": "make_blobs", - "generation_kwargs": { - "centers": 1, - "n_samples": 12000000, - "n_features": [10, 100] - }, - "split_kwargs": {"ignore": true} - } - ] - }, - "basic_statistics data": { - "data": { - "source": "make_blobs", - "generation_kwargs": { - "centers": 1, - "n_samples": 12000000, - "n_features": [10, 100] - }, - "split_kwargs": {"ignore": true} - } - }, - "linear_regression data": { - "data": { - "source": "make_regression", - "split_kwargs": {"train_size": 0.2, "test_size": 0.8}, - "generation_kwargs": { - "n_samples": 12000000, - "n_features": [10, 100], - "n_informative": 5, - "noise": 2.0 - } - } - }, - "pca data": { - "data": { - "source": "make_blobs", - "generation_kwargs": { - "centers": 1, - "n_samples": 12000000, - "n_features": [10, 100] - }, - "split_kwargs": {"ignore": true} - } - }, - "covariance": { - "algorithm": [ - { - "estimator": "IncrementalEmpiricalCovariance", - "library": "sklearnex.covariance", - "estimator_methods": {"training": "partial_fit"}, - "num_batches": {"training": 12} - } - ] - }, - "basic_statistics": { - "algorithm": [ - { - "estimator": "IncrementalBasicStatistics", - "library": "sklearnex.basic_statistics", - "estimator_methods": {"training": "partial_fit"}, - "num_batches": {"training": 12} - } - ] - }, - "linear_regression": { - "algorithm": [ - { - "estimator": "IncrementalLinearRegression", - "library": "sklearnex.linear_model", - "estimator_methods": {"training": "partial_fit"}, - "num_batches": {"training": 12} - } - ] - }, - "pca": { - "algorithm": [ - { - "estimator": "IncrementalPCA", - "library": "sklearnex.preview.decomposition", - "estimator_methods": {"training": "partial_fit"}, - 
"num_batches": {"training": 12} - } - ] - } - }, - "TEMPLATES": { - "basic_statistics": {"SETS": ["common", "basic_statistics", "basic_statistics data", "sklearn-ex[gpu] implementations"]}, - "covariance": {"SETS": ["common", "covariance", "covariance data", "sklearn-ex[gpu] implementations"]}, - "linear_regression": { - "SETS": ["common", "linear_regression", "linear_regression data", "sklearn-ex[gpu] implementations"] - }, - "pca": {"SETS": ["common", "pca", "pca data", "sklearn-ex[gpu] implementations"]} - } -} diff --git a/configs/spmd/large_scale/basic_stats.json b/configs/spmd/large_scale/basic_stats.json index d6c2c4d2..f8f44e4e 100644 --- a/configs/spmd/large_scale/basic_stats.json +++ b/configs/spmd/large_scale/basic_stats.json @@ -4,16 +4,17 @@ "spmd basicstats parameters": { "algorithm": { "estimator": "BasicStatistics", - "estimator_methods": { "training": "fit" } + "estimator_methods": { "training": "fit" }, + "sklearnex_context": { "use_raw_input": true } }, - "data": { - "split_kwargs": { "test_size": 0.0001 } - } + "data": { + "split_kwargs": { "test_size": 0.0001 } + } }, "synthetic data": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, - { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000000, "n_features": 10, "centers": 1 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 1000, "centers": 1 } } ] } }, @@ -22,7 +23,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale 2k parameters", - "synthetic data", + "synthetic data", "spmd basicstats parameters" ] } diff --git a/configs/spmd/large_scale/basic_stats_strong.json b/configs/spmd/large_scale/basic_stats_strong.json index b5b0ef69..0c7c671e 100644 --- a/configs/spmd/large_scale/basic_stats_strong.json +++ b/configs/spmd/large_scale/basic_stats_strong.json @@ -4,11 +4,12 @@ "spmd basicstats parameters": { "algorithm": { "estimator": "BasicStatistics", - "estimator_methods": { "training": "fit" } + "estimator_methods": { "training": "fit" }, + "sklearnex_context": { "use_raw_input": true } }, - "data": { - "split_kwargs": { "test_size": 0.0001 } - } + "data": { + "split_kwargs": { "test_size": 0.0001 } + } }, "synthetic data": { "data": [ @@ -20,8 +21,8 @@ "basicstats": { "SETS": [ "sklearnex spmd implementation", - "large scale strong 2k parameters", - "synthetic data", + "large scale strong <=64 parameters", + "synthetic data", "spmd basicstats parameters" ] } diff --git a/configs/spmd/large_scale/covariance.json b/configs/spmd/large_scale/covariance.json index 20da8d15..7f4d6d7d 100644 --- a/configs/spmd/large_scale/covariance.json +++ b/configs/spmd/large_scale/covariance.json @@ -4,7 +4,8 @@ "spmd basicstats parameters": { "algorithm": { "estimator": "EmpiricalCovariance", - "estimator_methods": { "training": "fit" } + "estimator_methods": { "training": "fit" }, + "sklearnex_context": { "use_raw_input": true } }, "data": { "split_kwargs": { "test_size": 0.0001 } @@ -12,8 +13,8 @@ }, "synthetic data": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, - { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000000, "n_features": 10, "centers": 1 } }, + { "source": "make_blobs", 
"generation_kwargs": { "n_samples": 1000000, "n_features": 1000, "centers": 1 } } ] } }, @@ -22,7 +23,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale 2k parameters", - "synthetic data", + "synthetic data", "spmd basicstats parameters" ] } diff --git a/configs/spmd/large_scale/covariance_strong.json b/configs/spmd/large_scale/covariance_strong.json index b8424d92..8e388801 100644 --- a/configs/spmd/large_scale/covariance_strong.json +++ b/configs/spmd/large_scale/covariance_strong.json @@ -4,7 +4,8 @@ "spmd basicstats parameters": { "algorithm": { "estimator": "EmpiricalCovariance", - "estimator_methods": { "training": "fit" } + "estimator_methods": { "training": "fit" }, + "sklearnex_context": { "use_raw_input": true } }, "data": { "split_kwargs": { "test_size": 0.0001 } @@ -20,8 +21,8 @@ "covariance": { "SETS": [ "sklearnex spmd implementation", - "large scale strong 2k parameters", - "synthetic data", + "large scale strong <=64 parameters", + "synthetic data", "spmd basicstats parameters" ] } diff --git a/configs/spmd/large_scale/dbscan.json b/configs/spmd/large_scale/dbscan.json index e4996c9e..bf60b7cc 100644 --- a/configs/spmd/large_scale/dbscan.json +++ b/configs/spmd/large_scale/dbscan.json @@ -9,7 +9,8 @@ }, "estimator_params" : { "eps": 10, "min_samples": 5 - } + }, + "sklearnex_context": { "use_raw_input": true } }, "data": { "dtype": "float64" @@ -17,7 +18,7 @@ }, "synthetic dataset": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 100, "centers": 10 } } + { "source": "make_blobs", "generation_kwargs": { "n_samples": 40000, "n_features": 100, "centers": 10 } } ] } }, diff --git a/configs/spmd/large_scale/dbscan_strong.json b/configs/spmd/large_scale/dbscan_strong.json index 04fb9016..5e7ab322 100644 --- a/configs/spmd/large_scale/dbscan_strong.json +++ b/configs/spmd/large_scale/dbscan_strong.json @@ -3,13 +3,14 @@ "PARAMETERS_SETS": { "spmd dbscan parameters": { "algorithm": { - "estimator": "DBSCAN", - "estimator_methods": { - "training": "fit" + "estimator": "DBSCAN", + "estimator_methods": { + "training": "fit" }, "estimator_params" : { - "eps": 10, "min_samples": 5 - } + "eps": 15, "min_samples": 50 + }, + "sklearnex_context": { "use_raw_input": true } }, "data": { "dtype": "float64" @@ -17,7 +18,7 @@ }, "synthetic dataset": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 500000, "n_features": 100, "centers": 10 } } + { "source": "make_blobs", "generation_kwargs": { "n_samples": 4000000, "n_features": 100, "centers": 10 } } ] } }, @@ -27,7 +28,7 @@ "common dbscan parameters", "synthetic dataset", "sklearnex spmd implementation", - "large scale strong <=64 parameters", + "large scale strong <=64 parameters", "spmd dbscan parameters" ] } diff --git a/configs/spmd/large_scale/forest_max_samples.json b/configs/spmd/large_scale/forest_max_samples.json new file mode 100644 index 00000000..95affb16 --- /dev/null +++ b/configs/spmd/large_scale/forest_max_samples.json @@ -0,0 +1,28 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd forest classification parameters": { + "algorithm": { + "estimator": "RandomForestClassifier", + "estimator_methods": { "training": "fit" }, + "estimator_params": { "n_estimators": 20, "max_depth": 10 }, + "sklearnex_context": { "use_raw_input": true } + } + }, + "synthetic data": { + "data": [ + { "source": "make_classification", "split_kwargs": { "train_size": 1000000, "test_size": 1000 }, "generation_kwargs": { 
"n_samples": 1001000, "n_features": 100, "n_classes": 2 } } + ] + } + }, + "TEMPLATES": { + "forestCls": { + "SETS": [ + "sklearnex spmd implementation", + "large scale 32 parameters", + "synthetic data", + "spmd forest classification parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/forest.json b/configs/spmd/large_scale/forest_no_max_samples.json similarity index 58% rename from configs/spmd/large_scale/forest.json rename to configs/spmd/large_scale/forest_no_max_samples.json index b4402442..c371371b 100644 --- a/configs/spmd/large_scale/forest.json +++ b/configs/spmd/large_scale/forest_no_max_samples.json @@ -4,14 +4,13 @@ "spmd forest classification parameters": { "algorithm": { "estimator": "RandomForestClassifier", - "estimator_methods": { "training": "fit" }, - "estimator_params": { "n_estimators": 20, "max_depth": 4 } + "estimator_params": { "n_estimators": 100, "max_depth": 7 }, + "sklearnex_context": { "use_raw_input": true } } }, "synthetic data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 501000, "n_features": 10, "n_classes": 2 } }, - { "source": "make_classification", "split_kwargs": { "train_size": 10000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 11000, "n_features": 1000, "n_classes": 2 } } + { "source": "make_classification", "split_kwargs": { "train_size": 1000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 1001000, "n_features": 100, "n_classes": 2 }, "n_informative": "[SPECIAL_VALUE]0.5" } ] } }, @@ -20,7 +19,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale 2k parameters", - "synthetic data", + "synthetic data", "spmd forest classification parameters" ] } diff --git a/configs/spmd/large_scale/forest_strong.json b/configs/spmd/large_scale/forest_strong.json index 23b982f5..653c70dc 100644 --- a/configs/spmd/large_scale/forest_strong.json +++ b/configs/spmd/large_scale/forest_strong.json @@ -4,13 +4,14 @@ "spmd forest classification parameters": { "algorithm": { "estimator": "RandomForestClassifier", - "estimator_methods": { "training": "fit" }, - "estimator_params": { "n_estimators": 20, "max_depth": 4 } + "estimator_methods": { "training": "fit" }, + "estimator_params": { "n_estimators": 100, "max_depth": 8 }, + "sklearnex_context": {"use_raw_input": true} } }, "synthetic data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 10000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 10001000, "n_features": 100, "n_classes": 2 } } + { "source": "make_classification", "split_kwargs": { "train_size": 20000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 21000, "n_features": 200, "n_classes": 2 } } ] } }, @@ -18,8 +19,8 @@ "forestCls": { "SETS": [ "sklearnex spmd implementation", - "large scale strong 2k parameters", - "synthetic data", + "large scale strong <=64 parameters", + "synthetic data", "spmd forest classification parameters" ] } diff --git a/configs/spmd/large_scale/incremental.json b/configs/spmd/large_scale/incremental.json new file mode 100644 index 00000000..195074ee --- /dev/null +++ b/configs/spmd/large_scale/incremental.json @@ -0,0 +1,77 @@ +{ "INCLUDE": [ ], + "PARAMETERS_SETS": { + "common incremental raw gpu params": { + "algorithm": { + "device": "gpu", + "sklearnex_context": { "use_raw_input": true } + }, + "data": { + "format":"dpctl", + "order": "C" + } + }, + "statistical batches and data": [ + { "algorithm": { "num_batches": { "training": 
[1,2,6,12]} }, "data": { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000000, "n_features": 10, "centers": 1 } } }, + { "algorithm": { "num_batches": { "training": 2} }, "data": { "source": "make_blobs", "generation_kwargs": { "n_samples": 50000000, "n_features": 10, "centers": 1 } } }, + { "algorithm": { "num_batches": { "training": 6} }, "data": { "source": "make_blobs", "generation_kwargs": { "n_samples": 16666667, "n_features": 10, "centers": 1 } } }, + { "algorithm": { "num_batches": { "training": 12} }, "data": { "source": "make_blobs", "generation_kwargs": { "n_samples": 8333333, "n_features": 10, "centers": 1 } } }, + { "algorithm": { "num_batches": { "training": [1,2,6,12]} }, "data": { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 1000, "centers": 1 } } }, + { "algorithm": { "num_batches": { "training": 2} }, "data": { "source": "make_blobs", "generation_kwargs": { "n_samples": 500000, "n_features": 1000, "centers": 1 } } }, + { "algorithm": { "num_batches": { "training": 6} }, "data": { "source": "make_blobs", "generation_kwargs": { "n_samples": 166667, "n_features": 1000, "centers": 1 } } }, + { "algorithm": { "num_batches": { "training": 12} }, "data": { "source": "make_blobs", "generation_kwargs": { "n_samples": 83333, "n_features": 1000, "centers": 1 } } } + ], + "regression batches and data": [ + { "algorithm": { "num_batches": { "training": [1,2,6,12]} }, "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 100005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 100000000, "test_size": 5000 } } }, + { "algorithm": { "num_batches": { "training": 2} }, "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 100005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 50000000, "test_size": 5000 } } }, + { "algorithm": { "num_batches": { "training": 6} }, "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 100005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 16666667, "test_size": 5000 } } }, + { "algorithm": { "num_batches": { "training": 12} }, "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 100005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 8333333, "test_size": 5000 } } }, + { "algorithm": { "num_batches": { "training": [1,2,6,12]} }, "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 3005000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 3000000, "test_size": 5000 } } }, + { "algorithm": { "num_batches": { "training": 2} }, "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 3005000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 1500000, "test_size": 5000 } } }, + { "algorithm": { "num_batches": { "training": 6} }, "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 3005000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 500000, "test_size": 5000 } } }, + { "algorithm": { "num_batches": { "training": 12} }, "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 3005000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 250000, "test_size": 5000 } } } + ], + "covariance": { + "algorithm": { + "estimator": "IncrementalEmpiricalCovariance", + "library": "sklearnex", + "estimator_methods": {"training": "partial_fit"} + }, + "data": { + "split_kwargs": { "test_size": 
0.0001 } + } + }, + "basic_statistics": { + "algorithm": { + "estimator": "IncrementalBasicStatistics", + "library": "sklearnex", + "estimator_methods": {"training": "partial_fit"} + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "linear_regression": { + "algorithm": { + "estimator": "IncrementalLinearRegression", + "library": "sklearnex", + "estimator_methods": {"training": "partial_fit"} + } + }, + "pca": { + "algorithm": { + "estimator": "IncrementalPCA", + "library": "sklearnex.preview", + "estimator_methods": {"training": "partial_fit"} + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + } + }, + "TEMPLATES": { + "basic_statistics": { "SETS": ["common incremental raw gpu params", "basic_statistics", "statistical batches and data"] }, + "covariance": { "SETS": ["common incremental raw gpu params", "covariance", "statistical batches and data"] }, + "linear_regression": { "SETS": ["common incremental raw gpu params", "linear_regression", "regression batches and data"] }, + "pca": { "SETS": ["common incremental raw gpu params", "pca", "statistical batches and data"] } + } +} diff --git a/configs/spmd/large_scale/incremental/basic_stats.json b/configs/spmd/large_scale/incremental/basic_stats.json deleted file mode 100644 index ca9e3eb9..00000000 --- a/configs/spmd/large_scale/incremental/basic_stats.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "INCLUDE": ["../../../common/sklearn.json", "../../../spmd/stats_covariance.json", "../large_scale.json"], - "PARAMETERS_SETS": { - "spmd basicstats parameters": { - "algorithm": { - "estimator": "IncrementalBasicStatistics", - "estimator_methods": { "training": "fit" }, - "num_batches": {"training": 10} - }, - "data": { - "split_kwargs": { "test_size": 0.0001 } - } - }, - "synthetic data": { - "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } - ] - } - }, - "TEMPLATES": { - "basicstats": { - "SETS": [ - "sklearnex spmd implementation", - "large scale 32 parameters", - "synthetic data", - "spmd basicstats parameters" - ] - } - } -} diff --git a/configs/spmd/large_scale/incremental/covariance.json b/configs/spmd/large_scale/incremental/covariance.json deleted file mode 100644 index 04fcd76b..00000000 --- a/configs/spmd/large_scale/incremental/covariance.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "INCLUDE": ["../../../common/sklearn.json", "../../../spmd/stats_covariance.json", "../large_scale.json"], - "PARAMETERS_SETS": { - "spmd covariance parameters": { - "algorithm": { - "estimator": "IncrementalEmpiricalCovariance", - "estimator_methods": { "training": "fit" }, - "num_batches": {"training": 10} - }, - "data": { - "split_kwargs": { "test_size": 0.0001 } - } - }, - "synthetic data": { - "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } - ] - } - }, - "TEMPLATES": { - "covariance": { - "SETS": [ - "sklearnex spmd implementation", - "large scale 32 parameters", - "synthetic data", - "spmd covariance parameters" - ] - } - } -} diff --git a/configs/spmd/large_scale/incremental/linear_model.json b/configs/spmd/large_scale/incremental/linear_model.json deleted file mode 100644 index a483f613..00000000 --- a/configs/spmd/large_scale/incremental/linear_model.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "INCLUDE": ["../../../common/sklearn.json", "../../../regular/linear_model.json", "../large_scale.json"], - "PARAMETERS_SETS": { - "spmd linear parameters": { - "algorithm": { - "estimator": 
"IncrementalLinearRegression", - "estimator_methods": { "training": "fit" }, - "num_batches": {"training": 10} - } - }, - "synthetic data": { - "data": [ - { "source": "make_regression", "generation_kwargs": { "n_samples": 305000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 300000, "test_size": 5000 } } - ] - } - }, - "TEMPLATES": { - "linreg": { - "SETS": [ - "sklearnex spmd implementation", - "large scale 32 parameters", - "synthetic data", - "spmd linear parameters" - ] - } - } -} diff --git a/configs/spmd/large_scale/incremental/pca.json b/configs/spmd/large_scale/incremental/pca.json deleted file mode 100644 index 11fa5125..00000000 --- a/configs/spmd/large_scale/incremental/pca.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "INCLUDE": ["../../../common/sklearn.json", "../../../regular/pca.json", "../large_scale.json"], - "PARAMETERS_SETS": { - "spmd pca parameters": { - "algorithm": { - "estimator": "IncrementalPCA", - "estimator_methods": { "training": "fit", "inference": "" }, - "num_batches": {"training": 10} - }, - "data": { - "split_kwargs": { "test_size": 0.0001 } - } - }, - "synthetic data": { - "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } - ] - } - }, - "TEMPLATES": { - "linreg": { - "SETS": [ - "sklearnex spmd implementation", - "large scale 32 parameters", - "synthetic data", - "spmd pca parameters" - ] - } - } -} diff --git a/configs/spmd/large_scale/knn_strong.json b/configs/spmd/large_scale/knn_strong.json index d202f6e4..36daf3f1 100644 --- a/configs/spmd/large_scale/knn_strong.json +++ b/configs/spmd/large_scale/knn_strong.json @@ -3,23 +3,24 @@ "PARAMETERS_SETS": { "spmd knn cls parameters": { "algorithm": { - "estimator": "KNeighborsClassifier", + "estimator": "KNeighborsClassifier", "estimator_params": { "algorithm": "brute", "metric": "minkowski", "p": 2, "weights": "uniform", - "n_neighbors": 5 + "n_neighbors": 100 }, - "estimator_methods": { - "training": "fit", - "inference": "predict" - } + "estimator_methods": { + "training": "fit", + "inference": "predict" + }, + "sklearnex_context": { "use_raw_input": true } } }, "synthetic classification data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000000 }, "generation_kwargs": { "n_samples": 1500000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + { "source": "make_classification", "split_kwargs": { "train_size": 3000000, "test_size": 2000000 }, "generation_kwargs": { "n_samples": 5000000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } ] } }, @@ -28,7 +29,7 @@ "SETS": [ "synthetic classification data", "sklearnex spmd implementation", - "large scale strong <=64 parameters", + "large scale strong <=64 parameters", "spmd knn cls parameters" ] } diff --git a/configs/spmd/large_scale/knn_tier1.json b/configs/spmd/large_scale/knn_tier1.json new file mode 100644 index 00000000..c230cc4e --- /dev/null +++ b/configs/spmd/large_scale/knn_tier1.json @@ -0,0 +1,35 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd knn cls parameters": { + "algorithm": { + "estimator": "KNeighborsClassifier", + "estimator_params": { + "algorithm": "brute", + "metric": "minkowski", + "p": 2, + "weights": "uniform" + }, + "estimator_methods": { + "training": "fit", + "inference": "predict" + }, + "sklearnex_context": { "use_raw_input": true } + } + }, + "synthetic classification data": [ 
+ { "data": { "source": "make_classification", "split_kwargs": { "train_size": 100000, "test_size": 100000}, "generation_kwargs": { "n_samples": 2000000, "n_features": 10, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, "algorithm": { "estimator_params": { "n_neighbors": 5 } } }, + { "data": { "source": "make_classification", "split_kwargs": { "train_size": 100000, "test_size": 10000}, "generation_kwargs": { "n_samples": 2000000, "n_features": 1000, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, "algorithm": { "estimator_params": { "n_neighbors": 100 } } } + ] + }, + "TEMPLATES": { + "knn classifier": { + "SETS": [ + "synthetic classification data", + "sklearnex spmd implementation", + "large scale 32 parameters", + "spmd knn cls parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/knn.json b/configs/spmd/large_scale/knn_tier2.json similarity index 55% rename from configs/spmd/large_scale/knn.json rename to configs/spmd/large_scale/knn_tier2.json index b68b94af..ff0032e2 100644 --- a/configs/spmd/large_scale/knn.json +++ b/configs/spmd/large_scale/knn_tier2.json @@ -3,23 +3,24 @@ "PARAMETERS_SETS": { "spmd knn cls parameters": { "algorithm": { - "estimator": "KNeighborsClassifier", + "estimator": "KNeighborsClassifier", "estimator_params": { "algorithm": "brute", "metric": "minkowski", "p": 2, "weights": "uniform", - "n_neighbors": 5 + "n_neighbors": 5 }, - "estimator_methods": { - "training": "fit", - "inference": "predict" - } - } + "estimator_methods": { + "training": "fit", + "inference": "predict" + }, + "sklearnex_context": { "use_raw_input": true } + } }, "synthetic classification data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 5000 }, "generation_kwargs": { "n_samples": 5005000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + { "source": "make_classification", "split_kwargs": { "train_size": 100, "test_size": 100}, "generation_kwargs": { "n_samples": 200, "n_features": 10, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } ] } }, @@ -28,7 +29,7 @@ "SETS": [ "synthetic classification data", "sklearnex spmd implementation", - "large scale 2k parameters", + "large scale 2k parameters", "spmd knn cls parameters" ] } diff --git a/configs/spmd/large_scale/linreg.json b/configs/spmd/large_scale/linreg.json index ea45a52c..7c7fb035 100644 --- a/configs/spmd/large_scale/linreg.json +++ b/configs/spmd/large_scale/linreg.json @@ -4,13 +4,14 @@ "spmd linear parameters": { "algorithm": { "estimator": "LinearRegression", - "estimator_methods": { "training": "fit" } + "estimator_methods": { "training": "fit" }, + "sklearnex_context": { "use_raw_input": true } } }, "synthetic data": { "data": [ - { "source": "make_regression", "generation_kwargs": { "n_samples": 30005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 30000000, "test_size": 5000 } }, - { "source": "make_regression", "generation_kwargs": { "n_samples": 305000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 300000, "test_size": 5000 } } + { "source": "make_regression", "generation_kwargs": { "n_samples": 100005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 100000000, "test_size": 5000 } }, + { "source": "make_regression", "generation_kwargs": { "n_samples": 3005000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 3000000, "test_size": 5000 } } ] } }, @@ -19,7 +20,7 @@ "SETS": [ "sklearnex spmd implementation", 
"large scale 2k parameters", - "synthetic data", + "synthetic data", "spmd linear parameters" ] } diff --git a/configs/spmd/large_scale/linreg_strong.json b/configs/spmd/large_scale/linreg_strong.json index 629bf544..ac5a6c7a 100644 --- a/configs/spmd/large_scale/linreg_strong.json +++ b/configs/spmd/large_scale/linreg_strong.json @@ -4,7 +4,8 @@ "spmd linear parameters": { "algorithm": { "estimator": "LinearRegression", - "estimator_methods": { "training": "fit" } + "estimator_methods": { "training": "fit" }, + "sklearnex_context": { "use_raw_input": true } } }, "synthetic data": { @@ -17,8 +18,8 @@ "linreg": { "SETS": [ "sklearnex spmd implementation", - "large scale strong 2k parameters", - "synthetic data", + "large scale strong <=64 parameters", + "synthetic data", "spmd linear parameters" ] } diff --git a/configs/spmd/large_scale/logreg.json b/configs/spmd/large_scale/logreg.json index 326f2580..b7b4b998 100644 --- a/configs/spmd/large_scale/logreg.json +++ b/configs/spmd/large_scale/logreg.json @@ -2,15 +2,16 @@ "INCLUDE": ["../../common/sklearn.json", "../logreg.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd logreg2 parameters": { - "algorithm":{ - "estimator": "LogisticRegression", + "algorithm":{ + "estimator": "LogisticRegression", "estimator_methods": { "inference": "predict" }, - "estimator_params": { "max_iter": 20 } + "estimator_params": { "max_iter": 10 }, + "sklearnex_context": { "use_raw_input": true } } - }, + }, "synthetic data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 10, "n_classes": 2 } }, + { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 10, "n_classes": 2 } }, { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 501000, "n_features": 1000, "n_classes": 2, "n_informative": 40, "n_clusters_per_class": 3, "flip_y": 0.05 } } ] } @@ -21,8 +22,8 @@ "sklearnex spmd implementation", "large scale 2k parameters", "spmd logreg parameters", - "synthetic data", - "spmd logreg2 parameters" + "synthetic data", + "spmd logreg2 parameters" ] } } diff --git a/configs/spmd/large_scale/logreg_strong.json b/configs/spmd/large_scale/logreg_strong.json index 0b79ba9d..219840ea 100644 --- a/configs/spmd/large_scale/logreg_strong.json +++ b/configs/spmd/large_scale/logreg_strong.json @@ -2,16 +2,16 @@ "INCLUDE": ["../../common/sklearn.json", "../logreg.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd logreg2 parameters": { - "algorithm":{ - "estimator": "LogisticRegression", + "algorithm":{ + "estimator": "LogisticRegression", "estimator_methods": { "inference": "predict" }, - "estimator_params": { "max_iter": 16 } + "estimator_params": { "max_iter": 16 }, + "sklearnex_context": { "use_raw_input": true } } - }, + }, "synthetic data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 10000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 10001000, "n_features": 100, "n_classes": 2 } }, - { "source": "make_classification", "split_kwargs": { "train_size": 12000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 12001000, "n_features": 200, "n_classes": 2, "n_informative": 40, "n_clusters_per_class": 3, "flip_y": 0.05 } } + { "source": "make_classification", "split_kwargs": { "train_size": 12000000, "test_size": 1000 
}, "generation_kwargs": { "n_samples": 12001000, "n_features": 200, "n_classes": 2, "n_informative": 40, "n_clusters_per_class": 3, "flip_y": 0.05 } } ] } }, @@ -19,10 +19,10 @@ "logreg": { "SETS": [ "sklearnex spmd implementation", - "large scale strong 2k parameters", + "large scale strong 64 parameters", "spmd logreg parameters", - "synthetic data", - "spmd logreg2 parameters" + "synthetic data", + "spmd logreg2 parameters" ] } } diff --git a/configs/spmd/large_scale/pca.json b/configs/spmd/large_scale/pca.json index d0ee879a..ce56bd8a 100644 --- a/configs/spmd/large_scale/pca.json +++ b/configs/spmd/large_scale/pca.json @@ -4,7 +4,8 @@ "spmd pca parameters": { "algorithm": { "estimator": "PCA", - "estimator_methods": { "training": "fit", "inference": "" } + "estimator_methods": { "training": "fit", "inference": "" }, + "sklearnex_context": { "use_raw_input": true } }, "data": { "split_kwargs": { "test_size": 0.0001 } @@ -12,18 +13,18 @@ }, "synthetic data": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, - { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000000, "n_features": 10, "centers": 1 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 1000, "centers": 1 } } ] } }, "TEMPLATES": { "linreg": { "SETS": [ - "sklearnex spmd implementation", + "sklearnex spmd implementation", "large scale 2k parameters", "synthetic data", - "spmd pca parameters" + "spmd pca parameters" ] } } diff --git a/configs/spmd/large_scale/pca_strong.json b/configs/spmd/large_scale/pca_strong.json index 3cb33e72..70461ba7 100644 --- a/configs/spmd/large_scale/pca_strong.json +++ b/configs/spmd/large_scale/pca_strong.json @@ -4,7 +4,8 @@ "spmd pca parameters": { "algorithm": { "estimator": "PCA", - "estimator_methods": { "training": "fit", "inference": "" } + "estimator_methods": { "training": "fit", "inference": "" }, + "sklearnex_context": { "use_raw_input": true } }, "data": { "split_kwargs": { "test_size": 0.0001 } @@ -19,10 +20,10 @@ "TEMPLATES": { "linreg": { "SETS": [ - "sklearnex spmd implementation", - "large scale strong 2k parameters", + "sklearnex spmd implementation", + "large scale strong <=64 parameters", "synthetic data", - "spmd pca parameters" + "spmd pca parameters" ] } } diff --git a/configs/spmd/large_scale/spmd_for_online.json b/configs/spmd/large_scale/spmd_for_online.json deleted file mode 100644 index 2ef60f5b..00000000 --- a/configs/spmd/large_scale/spmd_for_online.json +++ /dev/null @@ -1,96 +0,0 @@ -{ - "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], - "PARAMETERS_SETS": { - "covariance data": { - "data": [ - { - "source": "make_blobs", - "generation_kwargs": { - "centers": 1, - "n_samples": 1000000, - "n_features": [10, 100] - }, - "split_kwargs": {"ignore": true} - } - ] - }, - "basic_statistics data": { - "data": { - "source": "make_blobs", - "generation_kwargs": { - "centers": 1, - "n_samples": 1000000, - "n_features": [10, 100] - }, - "split_kwargs": {"ignore": true} - } - }, - "linear_regression data": { - "data": { - "source": "make_regression", - "split_kwargs": {"train_size": 0.2, "test_size": 0.8}, - "generation_kwargs": { - "n_samples": 1000000, - "n_features": [10, 100], - "n_informative": 5, - "noise": 2.0 - } - } - }, - "pca data": { - "data": { - "source": "make_blobs", - "generation_kwargs": { - "centers": 1, - 
"n_samples": 1000000, - "n_features": [10, 100] - }, - "split_kwargs": {"ignore": true} - } - }, - "basic_statistics": { - "algorithm": [ - { - "estimator": "BasicStatistics", - "library": "sklearnex.spmd", - "estimator_methods": {"training": "fit"} - } - ] - }, - "covariance": { - "algorithm": [ - { - "estimator": "EmpiricalCovariance", - "library": "sklearnex.spmd", - "estimator_methods": {"training": "fit"} - } - ] - }, - "linear_regression": { - "algorithm": [ - { - "estimator": "LinearRegression", - "library": "sklearnex.spmd", - "estimator_methods": {"training": "fit"} - } - ] - }, - "pca": { - "algorithm": [ - { - "estimator": "PCA", - "library": "sklearnex.spmd", - "estimator_methods": {"training": "fit", "inference": ""} - } - ] - } - }, - "TEMPLATES": { - "basic_statistics": {"SETS": ["basic_statistics", "basic_statistics data", "sklearnex spmd implementation", "large scale full one node parameters"]}, - "covariance": {"SETS": ["covariance", "covariance data", "sklearnex spmd implementation", "large scale full one node parameters"]}, - "linear_regression": { - "SETS": ["linear_regression", "linear_regression data", "sklearnex spmd implementation", "large scale full one node parameters"] - }, - "pca": {"SETS": ["pca", "pca data", "sklearnex spmd implementation", "large scale full one node parameters"]} - } -} diff --git a/configs/spmd/large_scale/spmd_for_online_strong.json b/configs/spmd/large_scale/spmd_for_online_strong.json deleted file mode 100644 index 77a25075..00000000 --- a/configs/spmd/large_scale/spmd_for_online_strong.json +++ /dev/null @@ -1,60 +0,0 @@ -{ - "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], - "PARAMETERS_SETS": { - "covariance data": { - "data": [ - { - "source": "make_blobs", - "generation_kwargs": { - "centers": 1, - "n_samples": 12000000, - "n_features": [10, 100] - }, - "split_kwargs": {"ignore": true} - } - ] - }, - "basic_statistics data": { - "data": { - "source": "make_blobs", - "generation_kwargs": { - "centers": 1, - "n_samples": 12000000, - "n_features": [10, 100] - }, - "split_kwargs": {"ignore": true} - } - }, - "linear_regression data": { - "data": { - "source": "make_regression", - "split_kwargs": {"train_size": 0.2, "test_size": 0.8}, - "generation_kwargs": { - "n_samples": 12000000, - "n_features": [10, 100], - "n_informative": 5, - "noise": 2.0 - } - } - }, - "pca data": { - "data": { - "source": "make_blobs", - "generation_kwargs": { - "centers": 1, - "n_samples": 12000000, - "n_features": [10, 100] - }, - "split_kwargs": {"ignore": true} - } - } - }, - "TEMPLATES": { - "basic_statistics": {"SETS": ["basic_statistics data", "spmd default parameters", "sklearnex spmd implementation", "large scale strong full one node parameters"]}, - "covariance": {"SETS": ["covariance data", "spmd default parameters","sklearnex spmd implementation", "large scale strong full one node parameters"]}, - "linear_regression": { - "SETS": ["linear_regression data", "spmd default parameters", "sklearnex spmd implementation", "large scale strong full one node parameters"] - }, - "pca": {"SETS": ["pca data", "spmd default parameters", "sklearnex spmd implementation", "large scale strong full one node parameters"]} - } -} diff --git a/sklbench/benchmarks/custom_function.py b/sklbench/benchmarks/custom_function.py index 25abb900..34b223ed 100644 --- a/sklbench/benchmarks/custom_function.py +++ b/sklbench/benchmarks/custom_function.py @@ -64,9 +64,13 @@ def get_function_args(bench_case: BenchCase, x_train, y_train, x_test, y_test) - def 
measure_function_instance(bench_case, function_instance, args: Tuple, kwargs: Dict): metrics = dict() - metrics["time[ms]"], metrics["time std[ms]"], _ = measure_case( - bench_case, function_instance, *args, **kwargs - ) + ( + metrics["time[ms]"], + metrics["time std[ms]"], + metrics["first iter[ms]"], + metrics["box filter mean[ms]"], + metrics["box filter std[ms]"], + ) = measure_case(bench_case, function_instance, *args, **kwargs) return metrics diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index e57a9038..4164a10d 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -516,7 +516,11 @@ def main(bench_case: BenchCase, filters: List[BenchCase]): bench_case, "algorithm:estimator_params", dict() ) # logger.debug("estimator params: " + str(estimator_params)) - if "DBSCAN" in str(estimator_name): + if ( + "DBSCAN" in str(estimator_name) + and get_bench_case_value(bench_case, "data:distributed_split", None) + != "rank_based" + ): if "min_samples" in estimator_params: from mpi4py import MPI diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 86944ead..38b4fe3b 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -109,11 +109,11 @@ def split_and_transform_data(bench_case, data, data_description): y_train, y_test = None, None distributed_split = get_bench_case_value(bench_case, "data:distributed_split", None) - knn_split_train = ( - "KNeighbors" in get_bench_case_value(bench_case, "algorithm:estimator", "") - and int(get_bench_case_value(bench_case, "bench:mpi_params:n", 1)) > 1 - ) - if distributed_split == "rank_based" or knn_split_train: + # knn_split_train = ( + # "KNeighbors" in get_bench_case_value(bench_case, "algorithm:estimator", "") + # and int(get_bench_case_value(bench_case, "bench:mpi_params:n", 1)) > 1 + # ) + if distributed_split == "rank_based": from mpi4py import MPI comm = MPI.COMM_WORLD diff --git a/sklbench/utils/measurement.py b/sklbench/utils/measurement.py index bfabbdc0..3677e760 100644 --- a/sklbench/utils/measurement.py +++ b/sklbench/utils/measurement.py @@ -79,7 +79,7 @@ def measure_time( t0 = timeit.default_timer() func_return_value = func(*args, **kwargs) t1 = timeit.default_timer() - if hasattr(func.__self__, "_n_inner_iter"): + if hasattr(func, "__self__") and hasattr(func.__self__, "_n_inner_iter"): inners.append(func.__self__._n_inner_iter) iters.append(func.__self__.n_iter_) if enable_itt and itt_is_available: @@ -92,16 +92,20 @@ def measure_time( f"exceeded time limit ({time_limit} seconds)" ) break - from mpi4py import MPI - - if MPI.COMM_WORLD.Get_rank() == 0: - logger.debug( - "iters across n runs: " - + str(iters) - + ", inner iters across n runs: " - + str(inners) - ) - logger.debug(times) + + try: + from mpi4py import MPI + + if MPI.COMM_WORLD.Get_rank() == 0: + logger.debug( + "iters across n runs: " + + str(iters) + + ", inner iters across n runs: " + + str(inners) + ) + logger.debug(f"Runtime for all {n_runs} iterations: {times}") + except ModuleNotFoundError: + logger.debug(f"Runtime for all {n_runs} iterations: {times}") # mean, std = box_filter(times) # if std / mean > std_mean_ratio: # logger.warning( diff --git a/test-configuration-linux.yml b/test-configuration-linux.yml index 722d1008..a37769ce 100644 --- a/test-configuration-linux.yml +++ b/test-configuration-linux.yml @@ -45,11 +45,6 @@ steps: conda activate bench-env python -m sklbench --report -l DEBUG --report -c 
configs/sklearn_example.json displayName: Sklearn example run - - script: | - source /usr/share/miniconda/etc/profile.d/conda.sh - conda activate bench-env - python -m sklbench --report -l DEBUG --report -c configs/incremental.json - displayName: Incremental algorithms example run - script: | source /usr/share/miniconda/etc/profile.d/conda.sh conda activate bench-env diff --git a/test-configuration-win.yml b/test-configuration-win.yml index f3ac1595..a1eddaeb 100644 --- a/test-configuration-win.yml +++ b/test-configuration-win.yml @@ -43,10 +43,6 @@ steps: call activate bench-env python -m sklbench --report -l DEBUG --report -c configs/sklearn_example.json displayName: Sklearn example run - - script: | - call activate bench-env - python -m sklbench --report -l DEBUG --report -c configs/incremental.json - displayName: Incremental algorithms example run - script: | call activate bench-env python -m sklbench --report -l DEBUG --report -c configs/xgboost_example.json From f0fccdd263725f143ba827e30597e84d78c1352c Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Fri, 21 Mar 2025 15:02:15 -0700 Subject: [PATCH 101/110] Revert "Removed environment from diff." This reverts commit 15db7929f76696956a4fe0529e8addef35f95f74. --- sklbench/report/implementation.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklbench/report/implementation.py b/sklbench/report/implementation.py index cddb45f3..2bc3a05e 100644 --- a/sklbench/report/implementation.py +++ b/sklbench/report/implementation.py @@ -100,8 +100,7 @@ "batch_size", ] -#DIFFBY_COLUMNS = ["environment_name", "library", "format", "device"] -DIFFBY_COLUMNS = ["library", "format", "device"] +DIFFBY_COLUMNS = ["environment_name", "library", "format", "device"] def geomean_wrapper(a): From 4d675ecdfbb117750fc79ebaeb66a7feaa2444d5 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Fri, 21 Mar 2025 15:29:50 -0700 Subject: [PATCH 102/110] Removed extra code for sample_shift. --- sklbench/datasets/transformer.py | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 44476871..34b438af 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -117,34 +117,21 @@ def split_and_transform_data(bench_case, data, data_description): if distributed_split == "sample_shift": from mpi4py import MPI - comm = MPI.COMM_WORLD - rank = comm.Get_rank() - size = comm.Get_size() - - n_train = len(x_train) - n_test = len(x_test) - - train_start = 0 - train_end = n_train - test_start = 0 - test_end = n_test - + rank = MPI.COMM_WORLD.Get_rank() adjust_number = (math.sqrt(rank) * 0.003) + 1 if "y" in data: x_train, y_train = ( - x_train[train_start:train_end] * adjust_number, - y_train[train_start:train_end], + x_train * adjust_number, + y_train, ) x_test, y_test = ( - x_test[test_start:test_end] * adjust_number, - y_test[test_start:test_end], + x_test * adjust_number, + y_test, ) else: - x_train = x_train[train_start:train_end] - - x_test = x_test[test_start:test_end] * adjust_number + x_test = x_test * adjust_number elif distributed_split == "rank_based": from mpi4py import MPI From e8fbd0b3784b957d4c8555228063f544b381a5b5 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Fri, 21 Mar 2025 15:39:33 -0700 Subject: [PATCH 103/110] Changes for sample_shift. 
--- configs/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/configs/README.md b/configs/README.md index 8d3c5ac2..b1219124 100644 --- a/configs/README.md +++ b/configs/README.md @@ -105,6 +105,7 @@ Configs have the three highest parameter keys: | `data`:`order` | `F` | `C`, `F` | Data order to use in benchmark: contiguous(C) or Fortran. | | `data`:`dtype` | `float64` | | Data type to use in benchmark. | | `data`:`distributed_split` | None | None, `rank_based` | Split type used to distribute data between machines in distributed algorithm. `None` type means usage of all data without split on all machines. `rank_based` type splits the data equally between machines with split sequence based on rank id from MPI. | +| `data`: `sample_shift` | None | None, `rank_based` | Determines how data points are shifted based on MPI rank. `None` type means use all data without any shift across all machines. `sample_shift`: Shift each data point in a rank by \((\sqrt{\text{rank id}} \times 0.003) + 1\). | |
<h3>Algorithm parameters</h3>
|||| | `algorithm`:`library` | None | | Python module containing measured entity (class or function). | | `algorithm`:`device` | `default` | `default`, `cpu`, `gpu` | Device selected for computation. | From a7cea17bed5f6ad721a98b7d15ae6d64bb35c75f Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Fri, 21 Mar 2025 15:44:53 -0700 Subject: [PATCH 104/110] Updated sample shift. --- configs/README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/configs/README.md b/configs/README.md index b1219124..91b45b81 100644 --- a/configs/README.md +++ b/configs/README.md @@ -104,8 +104,7 @@ Configs have the three highest parameter keys: | `data`:`format` | `pandas` | `pandas`, `numpy`, `cudf` | Data format to use in benchmark. | | `data`:`order` | `F` | `C`, `F` | Data order to use in benchmark: contiguous(C) or Fortran. | | `data`:`dtype` | `float64` | | Data type to use in benchmark. | -| `data`:`distributed_split` | None | None, `rank_based` | Split type used to distribute data between machines in distributed algorithm. `None` type means usage of all data without split on all machines. `rank_based` type splits the data equally between machines with split sequence based on rank id from MPI. | -| `data`: `sample_shift` | None | None, `rank_based` | Determines how data points are shifted based on MPI rank. `None` type means use all data without any shift across all machines. `sample_shift`: Shift each data point in a rank by \((\sqrt{\text{rank id}} \times 0.003) + 1\). | +| `data`:`distributed_split` | None | None, `rank_based`, `sample_shift` | `rank_based` Split type used to distribute data between machines in distributed algorithm. `sample_shift`: Shift each data point in a rank by \((\sqrt{\text{rank id}} \times 0.003) + 1\). `None` type means usage of all data without split on all machines. `rank_based` type splits the data equally between machines with split sequence based on rank id from MPI. | |
<h3>Algorithm parameters</h3>
|||| | `algorithm`:`library` | None | | Python module containing measured entity (class or function). | | `algorithm`:`device` | `default` | `default`, `cpu`, `gpu` | Device selected for computation. | From f3c2757e87dd10a723ff8a241b6cdd78d16a3bd7 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Fri, 21 Mar 2025 15:47:14 -0700 Subject: [PATCH 105/110] Updated sample shift. --- configs/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/README.md b/configs/README.md index 91b45b81..3d16a6ce 100644 --- a/configs/README.md +++ b/configs/README.md @@ -104,7 +104,7 @@ Configs have the three highest parameter keys: | `data`:`format` | `pandas` | `pandas`, `numpy`, `cudf` | Data format to use in benchmark. | | `data`:`order` | `F` | `C`, `F` | Data order to use in benchmark: contiguous(C) or Fortran. | | `data`:`dtype` | `float64` | | Data type to use in benchmark. | -| `data`:`distributed_split` | None | None, `rank_based`, `sample_shift` | `rank_based` Split type used to distribute data between machines in distributed algorithm. `sample_shift`: Shift each data point in a rank by \((\sqrt{\text{rank id}} \times 0.003) + 1\). `None` type means usage of all data without split on all machines. `rank_based` type splits the data equally between machines with split sequence based on rank id from MPI. | +| `data`:`distributed_split` | None | None, `rank_based`, `sample_shift` | `rank_based` Split type used to distribute data between machines in distributed algorithm. `sample_shift`: Shift each data point in each rank by sqrt (rank id) * 0.003) + 1. `None` type means usage of all data without split on all machines. `rank_based` type splits the data equally between machines with split sequence based on rank id from MPI. | |
<h3>Algorithm parameters</h3>
|||| | `algorithm`:`library` | None | | Python module containing measured entity (class or function). | | `algorithm`:`device` | `default` | `default`, `cpu`, `gpu` | Device selected for computation. | From 2ae3c394547490c0c6782af591d6eec7ee7e8838 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Fri, 21 Mar 2025 15:52:40 -0700 Subject: [PATCH 106/110] Removed extra code. --- sklbench/datasets/transformer.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 34b438af..79a6d7a0 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -119,19 +119,7 @@ def split_and_transform_data(bench_case, data, data_description): rank = MPI.COMM_WORLD.Get_rank() adjust_number = (math.sqrt(rank) * 0.003) + 1 - - if "y" in data: - x_train, y_train = ( - x_train * adjust_number, - y_train, - ) - - x_test, y_test = ( - x_test * adjust_number, - y_test, - ) - else: - x_test = x_test * adjust_number + x_test = x_test * adjust_number elif distributed_split == "rank_based": from mpi4py import MPI From 39cc4f2de559d7f1b96b01f4f58931e0b3ef58a9 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Fri, 21 Mar 2025 16:07:06 -0700 Subject: [PATCH 107/110] Added comment for sample_shift. --- sklbench/datasets/transformer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 79a6d7a0..81bdf5fb 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -118,6 +118,10 @@ def split_and_transform_data(bench_case, data, data_description): from mpi4py import MPI rank = MPI.COMM_WORLD.Get_rank() + # This approach was chosen to shift the distribution of synthetic data on each rank + # for KMeans weak scaling tests. When testing with a large number of tiles, this method avoids duplication of data on each rank. + # For example, if there are 24,576 tiles being used, each data point in the 24,576th tile would be multiplied by 1.47. + # The factor 0.003 was chosen arbitrarily and can be fine-tuned for other datasets and algorithms if needed. adjust_number = (math.sqrt(rank) * 0.003) + 1 x_test = x_test * adjust_number From 3fc7c42dfc1ea52bb19bffb944bd675a8b1dd093 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Fri, 21 Mar 2025 16:14:18 -0700 Subject: [PATCH 108/110] Added back in x_train in sample_shift. --- sklbench/datasets/transformer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 81bdf5fb..c63d3b20 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -124,6 +124,7 @@ def split_and_transform_data(bench_case, data, data_description): # The factor 0.003 was chosen arbitrarily and can be fine-tuned for other datasets and algorithms if needed. adjust_number = (math.sqrt(rank) * 0.003) + 1 x_test = x_test * adjust_number + x_train = x_train * adjust_number elif distributed_split == "rank_based": from mpi4py import MPI From 1bd5aa150735d97efeb8776355f1dccbd71fdf3c Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Fri, 21 Mar 2025 16:17:59 -0700 Subject: [PATCH 109/110] Updated description of sample_shift. 
--- configs/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/README.md b/configs/README.md index 3d16a6ce..e1cf8390 100644 --- a/configs/README.md +++ b/configs/README.md @@ -104,7 +104,7 @@ Configs have the three highest parameter keys: | `data`:`format` | `pandas` | `pandas`, `numpy`, `cudf` | Data format to use in benchmark. | | `data`:`order` | `F` | `C`, `F` | Data order to use in benchmark: contiguous(C) or Fortran. | | `data`:`dtype` | `float64` | | Data type to use in benchmark. | -| `data`:`distributed_split` | None | None, `rank_based`, `sample_shift` | `rank_based` Split type used to distribute data between machines in distributed algorithm. `sample_shift`: Shift each data point in each rank by sqrt (rank id) * 0.003) + 1. `None` type means usage of all data without split on all machines. `rank_based` type splits the data equally between machines with split sequence based on rank id from MPI. | +| `data`:`distributed_split` | None | None, `rank_based`, `sample_shift` | Split type used to distribute data between machines in distributed algorithm. `sample_shift`: Shift each data point in each rank by sqrt (rank id) * 0.003) + 1. `None` type means usage of all data without split on all machines. `rank_based` type splits the data equally between machines with split sequence based on rank id from MPI. | |
<h3>Algorithm parameters</h3>
|||| | `algorithm`:`library` | None | | Python module containing measured entity (class or function). | | `algorithm`:`device` | `default` | `default`, `cpu`, `gpu` | Device selected for computation. | From 06944c172997de5e5e1659cc973637bea8d8704b Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Fri, 21 Mar 2025 16:20:52 -0700 Subject: [PATCH 110/110] Added predict back in. --- configs/spmd/large_scale/kmeans_narrow_weak.json | 2 +- configs/spmd/large_scale/kmeans_wide_weak.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/spmd/large_scale/kmeans_narrow_weak.json b/configs/spmd/large_scale/kmeans_narrow_weak.json index 523aba01..69f0b6ac 100644 --- a/configs/spmd/large_scale/kmeans_narrow_weak.json +++ b/configs/spmd/large_scale/kmeans_narrow_weak.json @@ -10,7 +10,7 @@ "n_clusters": 10, "random_state": 42 }, - "estimator_methods": { "training": "fit", "inference": "" }, + "estimator_methods": { "training": "fit", "inference": "predict" }, "sklearnex_context": { "use_raw_input": true } } }, diff --git a/configs/spmd/large_scale/kmeans_wide_weak.json b/configs/spmd/large_scale/kmeans_wide_weak.json index 1c588d60..5520f10a 100644 --- a/configs/spmd/large_scale/kmeans_wide_weak.json +++ b/configs/spmd/large_scale/kmeans_wide_weak.json @@ -10,7 +10,7 @@ "n_clusters": 10, "random_state": 42 }, - "estimator_methods": { "training": "fit", "inference": "" }, + "estimator_methods": { "training": "fit", "inference": "predict" }, "sklearnex_context": { "use_raw_input": true } } },
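
Note for readers following the `sample_shift` work in patches 102-109: below is a minimal, self-contained sketch (not part of the patch series) of how the per-rank multiplier added to split_and_transform_data() in sklbench/datasets/transformer.py behaves. The formula (sqrt(rank) * 0.003) + 1 comes from the diffs above; the rank values, array sizes, and the sample_shift_multiplier helper name are illustrative assumptions only, and a real benchmark run obtains the rank from mpi4py's MPI.COMM_WORLD as the patched code does.

import math

import numpy as np


def sample_shift_multiplier(rank: int) -> float:
    # Same formula the patches add for distributed_split == "sample_shift":
    # the shift grows slowly with the MPI rank so each rank sees a slightly
    # different synthetic distribution instead of an exact duplicate.
    return math.sqrt(rank) * 0.003 + 1


# Tiny stand-ins for the synthetic blobs generated on one rank (sizes are arbitrary).
x_train = np.ones((4, 2))
x_test = np.ones((2, 2))

for rank in (0, 1, 64, 24576):
    adjust = sample_shift_multiplier(rank)
    shifted_train = x_train * adjust  # patch 108 scales the train split as well
    shifted_test = x_test * adjust
    print(f"rank {rank:>5}: multiplier ~ {adjust:.3f}, first value -> {shifted_test[0, 0]:.3f}")

# Expected output:
# rank     0: multiplier ~ 1.000, first value -> 1.000
# rank     1: multiplier ~ 1.003, first value -> 1.003
# rank    64: multiplier ~ 1.024, first value -> 1.024
# rank 24576: multiplier ~ 1.470, first value -> 1.470

At rank 24,576 the multiplier works out to roughly 1.47, which matches the worked example in the comment added by patch 107 ("Added comment for sample_shift.").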