From b3500dd94fe02bbcb79af06b77655421edf5026a Mon Sep 17 00:00:00 2001 From: ethanglaser Date: Fri, 30 Aug 2024 23:27:21 +0000 Subject: [PATCH 001/110] Creating branch for large scale measurements --- configs/spmd/large_scale/basic_stats.json | 30 +++++++++++++++ configs/spmd/large_scale/covariance.json | 30 +++++++++++++++ configs/spmd/large_scale/dbscan.json | 32 ++++++++++++++++ configs/spmd/large_scale/kmeans.json | 32 ++++++++++++++++ configs/spmd/large_scale/knn.json | 43 ++++++++++++++++++++++ configs/spmd/large_scale/large_scale.json | 31 ++++++++++++++++ configs/spmd/large_scale/linear_model.json | 27 ++++++++++++++ configs/spmd/large_scale/logreg.json | 29 +++++++++++++++ configs/spmd/large_scale/pca.json | 30 +++++++++++++++ sklbench/benchmarks/sklearn_estimator.py | 8 ++-- sklbench/datasets/transformer.py | 9 +++-- sklbench/runner/commands_helper.py | 3 ++ 12 files changed, 298 insertions(+), 6 deletions(-) create mode 100644 configs/spmd/large_scale/basic_stats.json create mode 100644 configs/spmd/large_scale/covariance.json create mode 100644 configs/spmd/large_scale/dbscan.json create mode 100644 configs/spmd/large_scale/kmeans.json create mode 100644 configs/spmd/large_scale/knn.json create mode 100644 configs/spmd/large_scale/large_scale.json create mode 100644 configs/spmd/large_scale/linear_model.json create mode 100644 configs/spmd/large_scale/logreg.json create mode 100644 configs/spmd/large_scale/pca.json diff --git a/configs/spmd/large_scale/basic_stats.json b/configs/spmd/large_scale/basic_stats.json new file mode 100644 index 00000000..a9542017 --- /dev/null +++ b/configs/spmd/large_scale/basic_stats.json @@ -0,0 +1,30 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd basicstats parameters": { + "algorithm": { + "estimator": "BasicStatistics", + "estimator_methods": { "training": "compute" } + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "basicstats": { + "SETS": [ + "sklearnex spmd implementation", + "large scale default parameters", + "synthetic data", + "spmd basicstats parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/covariance.json b/configs/spmd/large_scale/covariance.json new file mode 100644 index 00000000..3280bf77 --- /dev/null +++ b/configs/spmd/large_scale/covariance.json @@ -0,0 +1,30 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd basicstats parameters": { + "algorithm": { + "estimator": "EmpiricalCovariance", + "estimator_methods": { "training": "fit" } + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "covariance": { + "SETS": [ + "sklearnex spmd implementation", + "large scale default parameters", + "synthetic data", + "spmd basicstats parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/dbscan.json b/configs/spmd/large_scale/dbscan.json new file mode 
100644 index 00000000..c46287d8 --- /dev/null +++ b/configs/spmd/large_scale/dbscan.json @@ -0,0 +1,32 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/dbscan.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd dbscan parameters": { + "algorithm": { + "estimator": "DBSCAN", + "estimator_methods": { + "training": "fit" + } + }, + "data": { + "dtype": "float64" + } + }, + "synthetic dataset": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 10, "centers": 10 }, "algorithm": { "eps": 5, "min_samples": 5 } } + ] + } + }, + "TEMPLATES": { + "dbscan": { + "SETS": [ + "common dbscan parameters", + "synthetic dataset", + "sklearnex spmd implementation", + "large scale default parameters", + "spmd dbscan parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/kmeans.json b/configs/spmd/large_scale/kmeans.json new file mode 100644 index 00000000..3b490f14 --- /dev/null +++ b/configs/spmd/large_scale/kmeans.json @@ -0,0 +1,32 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/kmeans.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd kmeans parameters": { + "algorithm": { + "estimator": "KMeans", + "estimator_params": { + "algorithm": "lloyd" + }, + "estimator_methods": { "training": "fit" } + }, + "bench": { + "mpi_params": {"n": 48} + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } } + ] + } + }, + "TEMPLATES": { + "kmeans": { + "SETS": [ + "synthetic data", + "sklearnex spmd implementation", + "large scale default parameters", + "spmd kmeans parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/knn.json b/configs/spmd/large_scale/knn.json new file mode 100644 index 00000000..8b82094d --- /dev/null +++ b/configs/spmd/large_scale/knn.json @@ -0,0 +1,43 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/knn.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd knn cls parameters": { + "algorithm": { + "estimator": "KNeighborsClassifier", + "estimator_params": { + "algorithm": "brute", + "metric": "minkowski", + "p": 2, + "weights": "uniform", + "n_neighbors": 5 + }, + "estimator_methods": { + "training": "fit", + "inference": "predict" + } + }, + "bench": { + "mpi_params": {} + } + }, + "synthetic classification data": { + "data": [ + { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, + { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, + { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, + { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 1000000, "n_features": 10, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + ] + } + }, + "TEMPLATES": { + "knn classifier": { + "SETS": [ + "common knn parameters", + "synthetic classification data", + "sklearnex spmd implementation", + "large scale 2k parameters", + "spmd knn cls parameters" + ] + } 
+ } +} diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json new file mode 100644 index 00000000..4b39d5e2 --- /dev/null +++ b/configs/spmd/large_scale/large_scale.json @@ -0,0 +1,31 @@ +{ + "PARAMETERS_SETS": { + "large scale default parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [1,2,3,4,5,6,7,8,9,10,11,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale 2k parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale impi parameters": { + "data": { + "dtype": "float64", + "distributed_split": "no" + }, + "bench": { + "mpi_params": {"n": [1,2,4,6,12,24], "ppn": 12} + } + } + } +} diff --git a/configs/spmd/large_scale/linear_model.json b/configs/spmd/large_scale/linear_model.json new file mode 100644 index 00000000..4c861caa --- /dev/null +++ b/configs/spmd/large_scale/linear_model.json @@ -0,0 +1,27 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/linear_model.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd linear parameters": { + "algorithm": { + "estimator": "LinearRegression", + "estimator_methods": { "training": "fit" } + } + }, + "synthetic data": { + "data": [ + { "source": "make_regression", "generation_kwargs": { "n_samples": 30005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 30000000, "test_size": 5000 } }, + { "source": "make_regression", "generation_kwargs": { "n_samples": 305000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 300000, "test_size": 5000 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale default parameters", + "synthetic data", + "spmd linear parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/logreg.json b/configs/spmd/large_scale/logreg.json new file mode 100644 index 00000000..c5ef6203 --- /dev/null +++ b/configs/spmd/large_scale/logreg.json @@ -0,0 +1,29 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/logreg.json", "../logreg.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd logreg2 parameters": { + "algorithm":{ + "estimator": "LogisticRegression", + "estimator_methods": { "inference": "predict" }, + "estimator_params": { "max_iter": 20 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 10, "n_classes": 2 } }, + { "source": "make_classification", "split_kwargs": { "train_size": 100000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 101000, "n_features": 1000, "n_classes": 2 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale 2k parameters", + "spmd logreg parameters", + "synthetic data", + "spmd logreg2 parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/pca.json 
b/configs/spmd/large_scale/pca.json new file mode 100644 index 00000000..35c1942a --- /dev/null +++ b/configs/spmd/large_scale/pca.json @@ -0,0 +1,30 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/pca.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd pca parameters": { + "algorithm": { + "estimator": "PCA", + "estimator_methods": { "training": "fit", "inference": "" } + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale default parameters", + "synthetic data", + "spmd pca parameters" + ] + } + } +} diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index f9c0a75e..42f8725b 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -134,6 +134,9 @@ def get_subset_metrics_of_estimator( and isinstance(iterations[0], Union[Numeric, NumpyNumeric].__args__) ): metrics.update({"iterations": int(iterations[0])}) + if hasattr(estimator_instance, "_n_inner_iter"): + inner_iters = estimator_instance._n_inner_iter + metrics.update({"inner_iters": int(inner_iters)}) if task == "classification": y_pred = convert_to_numpy(estimator_instance.predict(x)) metrics.update( @@ -142,7 +145,7 @@ def get_subset_metrics_of_estimator( "balanced accuracy": float(balanced_accuracy_score(y_compat, y_pred)), } ) - if hasattr(estimator_instance, "predict_proba") and not ( + '''if hasattr(estimator_instance, "predict_proba") and not ( hasattr(estimator_instance, "probability") and getattr(estimator_instance, "probability") == False ): @@ -162,7 +165,7 @@ def get_subset_metrics_of_estimator( ), "logloss": float(log_loss(y_compat, y_pred_proba)), } - ) + )''' elif task == "regression": y_pred = convert_to_numpy(estimator_instance.predict(x)) metrics.update( @@ -429,7 +432,6 @@ def measure_sklearn_estimator( estimator_instance.get_booster() ) method_instance = getattr(daal_model, method) - metrics[method] = dict() ( metrics[method]["time[ms]"], diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index d2e63e9e..1ac7d7bc 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -109,7 +109,8 @@ def split_and_transform_data(bench_case, data, data_description): y_train, y_test = None, None distributed_split = get_bench_case_value(bench_case, "data:distributed_split", None) - if distributed_split == "rank_based": + knn_split_train = "KNeighbors" in get_bench_case_value(bench_case, "algorithm:estimator", "") and int(get_bench_case_value(bench_case, "bench:mpi_params:n", 1)) > 1 + if distributed_split == "rank_based" or knn_split_train: from mpi4py import MPI comm = MPI.COMM_WORLD @@ -129,10 +130,12 @@ def split_and_transform_data(bench_case, data, data_description): x_train[train_start:train_end], y_train[train_start:train_end], ) - x_test, y_test = x_test[test_start:test_end], y_test[test_start:test_end] + if distributed_split == "rank_based": + x_test, y_test = x_test[test_start:test_end], y_test[test_start:test_end] else: x_train = x_train[train_start:train_end] - x_test = x_test[test_start:test_end] + if distributed_split == "rank_based": + x_test = x_test[test_start:test_end] device = 
get_bench_case_value(bench_case, "algorithm:device", None) common_data_format = get_bench_case_value(bench_case, "data:format", "pandas") diff --git a/sklbench/runner/commands_helper.py b/sklbench/runner/commands_helper.py index b66da011..a63686c6 100644 --- a/sklbench/runner/commands_helper.py +++ b/sklbench/runner/commands_helper.py @@ -45,6 +45,9 @@ def generate_benchmark_command( mpi_prefix = "mpirun" for mpi_param_name, mpi_param_value in mpi_params.items(): mpi_prefix += f" -{mpi_param_name} {mpi_param_value}" + if mpi_param_name == "-hostfile": + import os + mpi_prefix += os.environ.get("PBS_NODEFILE") command_prefix = f"{mpi_prefix} {command_prefix}" # 3. Intel(R) VTune* profiling command prefix vtune_profiling = get_bench_case_value(bench_case, "bench:vtune_profiling") From 4bd6c7f91e0dcb5dc0001efc5b7c180f50dd9adc Mon Sep 17 00:00:00 2001 From: ethanglaser Date: Wed, 18 Sep 2024 18:01:07 +0000 Subject: [PATCH 002/110] strong scaling, config updates, minor revisions --- configs/spmd/large_scale/basic_stats.json | 4 +-- .../spmd/large_scale/basic_stats_strong.json | 29 +++++++++++++++++ configs/spmd/large_scale/covariance.json | 2 +- .../spmd/large_scale/covariance_strong.json | 29 +++++++++++++++++ configs/spmd/large_scale/kmeans.json | 11 +++---- configs/spmd/large_scale/knn.json | 5 +-- configs/spmd/large_scale/large_scale.json | 24 ++++++++++++-- configs/spmd/large_scale/linear_model.json | 2 +- .../spmd/large_scale/linear_model_strong.json | 26 ++++++++++++++++ configs/spmd/large_scale/logreg_strong.json | 28 +++++++++++++++++ configs/spmd/large_scale/pca.json | 2 +- configs/spmd/large_scale/pca_strong.json | 29 +++++++++++++++++ sklbench/benchmarks/sklearn_estimator.py | 4 ++- sklbench/utils/measurement.py | 31 ++++++++++++++----- 14 files changed, 200 insertions(+), 26 deletions(-) create mode 100644 configs/spmd/large_scale/basic_stats_strong.json create mode 100644 configs/spmd/large_scale/covariance_strong.json create mode 100644 configs/spmd/large_scale/linear_model_strong.json create mode 100644 configs/spmd/large_scale/logreg_strong.json create mode 100644 configs/spmd/large_scale/pca_strong.json diff --git a/configs/spmd/large_scale/basic_stats.json b/configs/spmd/large_scale/basic_stats.json index a9542017..9ac4725f 100644 --- a/configs/spmd/large_scale/basic_stats.json +++ b/configs/spmd/large_scale/basic_stats.json @@ -4,7 +4,7 @@ "spmd basicstats parameters": { "algorithm": { "estimator": "BasicStatistics", - "estimator_methods": { "training": "compute" } + "estimator_methods": { "training": "fit" } }, "data": { "split_kwargs": { "test_size": 0.0001 } @@ -21,7 +21,7 @@ "basicstats": { "SETS": [ "sklearnex spmd implementation", - "large scale default parameters", + "large scale 2k parameters", "synthetic data", "spmd basicstats parameters" ] diff --git a/configs/spmd/large_scale/basic_stats_strong.json b/configs/spmd/large_scale/basic_stats_strong.json new file mode 100644 index 00000000..b7aa22cb --- /dev/null +++ b/configs/spmd/large_scale/basic_stats_strong.json @@ -0,0 +1,29 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd basicstats parameters": { + "algorithm": { + "estimator": "BasicStatistics", + "estimator_methods": { "training": "fit" } + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 25000000, "n_features": 100, "centers": 1 } } + ] + } + }, + 
"TEMPLATES": { + "basicstats": { + "SETS": [ + "sklearnex spmd implementation", + "large scale strong 2k parameters", + "synthetic data", + "spmd basicstats parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/covariance.json b/configs/spmd/large_scale/covariance.json index 3280bf77..260befd0 100644 --- a/configs/spmd/large_scale/covariance.json +++ b/configs/spmd/large_scale/covariance.json @@ -21,7 +21,7 @@ "covariance": { "SETS": [ "sklearnex spmd implementation", - "large scale default parameters", + "large scale 2k parameters", "synthetic data", "spmd basicstats parameters" ] diff --git a/configs/spmd/large_scale/covariance_strong.json b/configs/spmd/large_scale/covariance_strong.json new file mode 100644 index 00000000..568b4a8f --- /dev/null +++ b/configs/spmd/large_scale/covariance_strong.json @@ -0,0 +1,29 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd basicstats parameters": { + "algorithm": { + "estimator": "EmpiricalCovariance", + "estimator_methods": { "training": "fit" } + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 25000000, "n_features": 100, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "covariance": { + "SETS": [ + "sklearnex spmd implementation", + "large scale strong 2k parameters", + "synthetic data", + "spmd basicstats parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/kmeans.json b/configs/spmd/large_scale/kmeans.json index 3b490f14..89524965 100644 --- a/configs/spmd/large_scale/kmeans.json +++ b/configs/spmd/large_scale/kmeans.json @@ -7,15 +7,14 @@ "estimator_params": { "algorithm": "lloyd" }, - "estimator_methods": { "training": "fit" } - }, - "bench": { - "mpi_params": {"n": 48} + "estimator_methods": { "training": "fit", "inference": "predict" } } }, "synthetic data": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } } + { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 30000, "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 100, "centers": 100 }, "algorithm": { "n_clusters": 100, "max_iter": 100 } } ] } }, @@ -24,7 +23,7 @@ "SETS": [ "synthetic data", "sklearnex spmd implementation", - "large scale default parameters", + "large scale 2k parameters", "spmd kmeans parameters" ] } diff --git a/configs/spmd/large_scale/knn.json b/configs/spmd/large_scale/knn.json index 8b82094d..e979e2aa 100644 --- a/configs/spmd/large_scale/knn.json +++ b/configs/spmd/large_scale/knn.json @@ -15,9 +15,6 @@ "training": "fit", "inference": "predict" } - }, - "bench": { - "mpi_params": {} } }, "synthetic classification data": { @@ -35,7 +32,7 @@ "common knn parameters", "synthetic classification data", "sklearnex spmd implementation", - "large scale 2k parameters", + "large scale default parameters", "spmd knn cls parameters" ] } diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json index 4b39d5e2..72b808fe 100644 --- a/configs/spmd/large_scale/large_scale.json +++ 
b/configs/spmd/large_scale/large_scale.json @@ -6,7 +6,16 @@ "distributed_split": "None" }, "bench": { - "mpi_params": {"n": [1,2,3,4,5,6,7,8,9,10,11,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + "mpi_params": {"n": [1,2,6,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale strong parameters": { + "data": { + "dtype": "float64", + "distributed_split": "rank_based" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, "large scale 2k parameters": { @@ -15,7 +24,16 @@ "distributed_split": "None" }, "bench": { - "mpi_params": {"n": [192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale strong 2k parameters": { + "data": { + "dtype": "float64", + "distributed_split": "rank_based" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, "large scale impi parameters": { @@ -24,7 +42,7 @@ "distributed_split": "no" }, "bench": { - "mpi_params": {"n": [1,2,4,6,12,24], "ppn": 12} + "mpi_params": {"n": [1,2,4,6,9,12], "ppn": 12} } } } diff --git a/configs/spmd/large_scale/linear_model.json b/configs/spmd/large_scale/linear_model.json index 4c861caa..aeda4441 100644 --- a/configs/spmd/large_scale/linear_model.json +++ b/configs/spmd/large_scale/linear_model.json @@ -18,7 +18,7 @@ "linreg": { "SETS": [ "sklearnex spmd implementation", - "large scale default parameters", + "large scale 2k parameters", "synthetic data", "spmd linear parameters" ] diff --git a/configs/spmd/large_scale/linear_model_strong.json b/configs/spmd/large_scale/linear_model_strong.json new file mode 100644 index 00000000..77a9c79e --- /dev/null +++ b/configs/spmd/large_scale/linear_model_strong.json @@ -0,0 +1,26 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/linear_model.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd linear parameters": { + "algorithm": { + "estimator": "LinearRegression", + "estimator_methods": { "training": "fit" } + } + }, + "synthetic data": { + "data": [ + { "source": "make_regression", "generation_kwargs": { "n_samples": 25005000, "n_features": 100, "noise": 1.25 
}, "split_kwargs": { "train_size": 25000000, "test_size": 5000 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale strong 2k parameters", + "synthetic data", + "spmd linear parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/logreg_strong.json b/configs/spmd/large_scale/logreg_strong.json new file mode 100644 index 00000000..2bf1c0f9 --- /dev/null +++ b/configs/spmd/large_scale/logreg_strong.json @@ -0,0 +1,28 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/logreg.json", "../logreg.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd logreg2 parameters": { + "algorithm":{ + "estimator": "LogisticRegression", + "estimator_methods": { "inference": "predict" }, + "estimator_params": { "max_iter": 30 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_classification", "split_kwargs": { "train_size": 10000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 10001000, "n_features": 100, "n_classes": 2 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale strong 2k parameters", + "spmd logreg parameters", + "synthetic data", + "spmd logreg2 parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/pca.json b/configs/spmd/large_scale/pca.json index 35c1942a..9a6a6b02 100644 --- a/configs/spmd/large_scale/pca.json +++ b/configs/spmd/large_scale/pca.json @@ -21,7 +21,7 @@ "linreg": { "SETS": [ "sklearnex spmd implementation", - "large scale default parameters", + "large scale 2k parameters", "synthetic data", "spmd pca parameters" ] diff --git a/configs/spmd/large_scale/pca_strong.json b/configs/spmd/large_scale/pca_strong.json new file mode 100644 index 00000000..adee3c79 --- /dev/null +++ b/configs/spmd/large_scale/pca_strong.json @@ -0,0 +1,29 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/pca.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd pca parameters": { + "algorithm": { + "estimator": "PCA", + "estimator_methods": { "training": "fit", "inference": "" } + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 25000000, "n_features": 100, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale strong 2k parameters", + "synthetic data", + "spmd pca parameters" + ] + } + } +} diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index 42f8725b..a08a6e9c 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -436,7 +436,9 @@ def measure_sklearn_estimator( ( metrics[method]["time[ms]"], metrics[method]["time std[ms]"], - _, + metrics[method]["first iter[ms]"], + metrics[method]["box filter mean[ms]"], + metrics[method]["box filter std[ms]"] ) = measure_case(bench_case, method_instance, *data_args) if batch_size is not None: metrics[method]["throughput[samples/ms]"] = ( diff --git a/sklbench/utils/measurement.py b/sklbench/utils/measurement.py index 989daefd..df74e8da 100644 --- a/sklbench/utils/measurement.py +++ b/sklbench/utils/measurement.py @@ -40,6 +40,22 @@ def box_filter(timing, left=0.2, right=0.8): return np.mean(result) * 1000, np.std(result) * 1000 +def large_scale_measurements(timing): + first_iter = timing[0] * 1000 + mean = np.mean(timing[1:]) * 1000 + stdev = np.std(timing[1:]) * 1000 + timing_sorted = 
np.sort(timing) + Q1, Q3 = np.percentile(timing_sorted, [25, 75]) + IQ = Q3 - Q1 + lower, upper = Q1 - 1.5 * IQ, Q3 + 1.5 * IQ + + filtered_times = timing_sorted[(timing_sorted >= lower) & (timing_sorted <= upper)] + + box_filter_mean = np.mean(filtered_times) * 1000 if filtered_times.size > 0 else 0 + box_filter_stdev = np.std(filtered_times) * 1000 if filtered_times.size > 0 else 0 + return mean, stdev, first_iter, box_filter_mean, box_filter_stdev + + def measure_time( func, *args, @@ -72,13 +88,14 @@ def measure_time( f"exceeded time limit ({time_limit} seconds)" ) break - mean, std = box_filter(times) - if std / mean > std_mean_ratio: - logger.warning( - f'Measured "std / mean" time ratio of "{str(func)}" function is higher ' - f"than threshold ({round(std / mean, 3)} vs. {std_mean_ratio})" - ) - return mean, std, func_return_value + logger.debug(times) + #mean, std = box_filter(times) + #if std / mean > std_mean_ratio: + # logger.warning( + # f'Measured "std / mean" time ratio of "{str(func)}" function is higher ' + # f"than threshold ({round(std / mean, 3)} vs. {std_mean_ratio})" + # ) + return large_scale_measurements(times) # wrapper to get measurement params from benchmarking case From 3cd955c3eec84fe07654e71364d1dd4cc354cbdc Mon Sep 17 00:00:00 2001 From: ethanglaser Date: Sat, 21 Sep 2024 05:25:44 +0000 Subject: [PATCH 003/110] knn and forest config updates --- configs/spmd/large_scale/forest.json | 26 +++++++++++++++++++++ configs/spmd/large_scale/forest_strong.json | 25 ++++++++++++++++++++ configs/spmd/large_scale/knn.json | 4 ++-- sklbench/benchmarks/sklearn_estimator.py | 4 ++-- sklbench/datasets/transformer.py | 2 +- 5 files changed, 56 insertions(+), 5 deletions(-) create mode 100644 configs/spmd/large_scale/forest.json create mode 100644 configs/spmd/large_scale/forest_strong.json diff --git a/configs/spmd/large_scale/forest.json b/configs/spmd/large_scale/forest.json new file mode 100644 index 00000000..ee614ed3 --- /dev/null +++ b/configs/spmd/large_scale/forest.json @@ -0,0 +1,26 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd forest classification parameters": { + "algorithm": { + "estimator": "RandomForestClassifier" + } + }, + "synthetic data": { + "data": [ + { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 501000, "n_features": 10, "n_classes": 2 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } } }, + { "source": "make_classification", "split_kwargs": { "train_size": 10000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 11000, "n_features": 1000, "n_classes": 2 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } } } + ] + } + }, + "TEMPLATES": { + "basicstats": { + "SETS": [ + "sklearnex spmd implementation", + "large scale 2k parameters", + "synthetic data", + "spmd forest classification parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/forest_strong.json b/configs/spmd/large_scale/forest_strong.json new file mode 100644 index 00000000..121aa916 --- /dev/null +++ b/configs/spmd/large_scale/forest_strong.json @@ -0,0 +1,25 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd forest classification parameters": { + "algorithm": { + "estimator": "RandomForestClassifier" + } + }, + "synthetic data": { + "data": [ + { "source": 
"make_classification", "split_kwargs": { "train_size": 10000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 10001000, "n_features": 100, "n_classes": 2 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } } } + ] + } + }, + "TEMPLATES": { + "basicstats": { + "SETS": [ + "sklearnex spmd implementation", + "large scale strong 2k parameters", + "synthetic data", + "spmd forest classification parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/knn.json b/configs/spmd/large_scale/knn.json index e979e2aa..1ef849f1 100644 --- a/configs/spmd/large_scale/knn.json +++ b/configs/spmd/large_scale/knn.json @@ -22,7 +22,7 @@ { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, - { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 1000000, "n_features": 10, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + { "source": "make_classification", "split_kwargs": { "train_size": 200000, "test_size": 200000 }, "generation_kwargs": { "n_samples": 400000, "n_features": 10, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } ] } }, @@ -32,7 +32,7 @@ "common knn parameters", "synthetic classification data", "sklearnex spmd implementation", - "large scale default parameters", + "large scale 2k parameters", "spmd knn cls parameters" ] } diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index a08a6e9c..a1dc7a2f 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -525,8 +525,8 @@ def main(bench_case: BenchCase, filters: List[BenchCase]): result_template = enrich_result(result_template, bench_case) if "assume_finite" in context_params: result_template["assume_finite"] = context_params["assume_finite"] - if hasattr(estimator_instance, "get_params"): - estimator_params = estimator_instance.get_params() + #if hasattr(estimator_instance, "get_params"): + # estimator_params = estimator_instance.get_params() # note: "handle" is not JSON-serializable if "handle" in estimator_params: del estimator_params["handle"] diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 1ac7d7bc..55cfc245 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -181,7 +181,7 @@ def split_and_transform_data(bench_case, data, data_description): "format": data_format, "order": data_order, "dtype": data_dtype, - "samples": converted_data.shape[0], + "samples (per rank)": converted_data.shape[0], } if len(converted_data.shape) == 2 and converted_data.shape[1] > 1: data_description[subset_name]["features"] = converted_data.shape[1] From e39dc2bed8100aafbb128460154ce6000f630a2e Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 23 Sep 2024 04:53:12 -0700 Subject: [PATCH 004/110] lint --- sklbench/benchmarks/sklearn_estimator.py | 8 ++++---- 
sklbench/datasets/transformer.py | 5 ++++- sklbench/runner/commands_helper.py | 1 + sklbench/utils/measurement.py | 8 ++++---- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index a1dc7a2f..bbfd3e62 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -145,7 +145,7 @@ def get_subset_metrics_of_estimator( "balanced accuracy": float(balanced_accuracy_score(y_compat, y_pred)), } ) - '''if hasattr(estimator_instance, "predict_proba") and not ( + """if hasattr(estimator_instance, "predict_proba") and not ( hasattr(estimator_instance, "probability") and getattr(estimator_instance, "probability") == False ): @@ -165,7 +165,7 @@ def get_subset_metrics_of_estimator( ), "logloss": float(log_loss(y_compat, y_pred_proba)), } - )''' + )""" elif task == "regression": y_pred = convert_to_numpy(estimator_instance.predict(x)) metrics.update( @@ -438,7 +438,7 @@ def measure_sklearn_estimator( metrics[method]["time std[ms]"], metrics[method]["first iter[ms]"], metrics[method]["box filter mean[ms]"], - metrics[method]["box filter std[ms]"] + metrics[method]["box filter std[ms]"], ) = measure_case(bench_case, method_instance, *data_args) if batch_size is not None: metrics[method]["throughput[samples/ms]"] = ( @@ -525,7 +525,7 @@ def main(bench_case: BenchCase, filters: List[BenchCase]): result_template = enrich_result(result_template, bench_case) if "assume_finite" in context_params: result_template["assume_finite"] = context_params["assume_finite"] - #if hasattr(estimator_instance, "get_params"): + # if hasattr(estimator_instance, "get_params"): # estimator_params = estimator_instance.get_params() # note: "handle" is not JSON-serializable if "handle" in estimator_params: diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 55cfc245..86944ead 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -109,7 +109,10 @@ def split_and_transform_data(bench_case, data, data_description): y_train, y_test = None, None distributed_split = get_bench_case_value(bench_case, "data:distributed_split", None) - knn_split_train = "KNeighbors" in get_bench_case_value(bench_case, "algorithm:estimator", "") and int(get_bench_case_value(bench_case, "bench:mpi_params:n", 1)) > 1 + knn_split_train = ( + "KNeighbors" in get_bench_case_value(bench_case, "algorithm:estimator", "") + and int(get_bench_case_value(bench_case, "bench:mpi_params:n", 1)) > 1 + ) if distributed_split == "rank_based" or knn_split_train: from mpi4py import MPI diff --git a/sklbench/runner/commands_helper.py b/sklbench/runner/commands_helper.py index a63686c6..2441085a 100644 --- a/sklbench/runner/commands_helper.py +++ b/sklbench/runner/commands_helper.py @@ -47,6 +47,7 @@ def generate_benchmark_command( mpi_prefix += f" -{mpi_param_name} {mpi_param_value}" if mpi_param_name == "-hostfile": import os + mpi_prefix += os.environ.get("PBS_NODEFILE") command_prefix = f"{mpi_prefix} {command_prefix}" # 3. 
Intel(R) VTune* profiling command prefix diff --git a/sklbench/utils/measurement.py b/sklbench/utils/measurement.py index df74e8da..7495e258 100644 --- a/sklbench/utils/measurement.py +++ b/sklbench/utils/measurement.py @@ -48,9 +48,9 @@ def large_scale_measurements(timing): Q1, Q3 = np.percentile(timing_sorted, [25, 75]) IQ = Q3 - Q1 lower, upper = Q1 - 1.5 * IQ, Q3 + 1.5 * IQ - + filtered_times = timing_sorted[(timing_sorted >= lower) & (timing_sorted <= upper)] - + box_filter_mean = np.mean(filtered_times) * 1000 if filtered_times.size > 0 else 0 box_filter_stdev = np.std(filtered_times) * 1000 if filtered_times.size > 0 else 0 return mean, stdev, first_iter, box_filter_mean, box_filter_stdev @@ -89,8 +89,8 @@ def measure_time( ) break logger.debug(times) - #mean, std = box_filter(times) - #if std / mean > std_mean_ratio: + # mean, std = box_filter(times) + # if std / mean > std_mean_ratio: # logger.warning( # f'Measured "std / mean" time ratio of "{str(func)}" function is higher ' # f"than threshold ({round(std / mean, 3)} vs. {std_mean_ratio})" From 6e0fbf8a1947895169b48731be24b97a9c29db70 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 23 Sep 2024 06:28:03 -0700 Subject: [PATCH 005/110] just gpu for regular --- configs/common/sklearn.json | 5 +++++ configs/regular/dbscan.json | 2 +- configs/regular/ensemble.json | 4 ++-- configs/regular/kmeans.json | 2 +- configs/regular/knn.json | 20 ++------------------ configs/regular/linear_model.json | 24 +----------------------- configs/regular/logreg.json | 2 +- configs/regular/pca.json | 2 +- 8 files changed, 14 insertions(+), 47 deletions(-) diff --git a/configs/common/sklearn.json b/configs/common/sklearn.json index d7b13188..43051093 100644 --- a/configs/common/sklearn.json +++ b/configs/common/sklearn.json @@ -12,6 +12,11 @@ { "library": "sklearnex", "device": ["cpu", "gpu"] } ] }, + "sklearn-ex[gpu] implementations": { + "algorithm": [ + { "library": "sklearnex", "device": ["gpu"] } + ] + }, "sklearn-ex[preview] implementations": { "algorithm": [ { "library": "sklearn", "device": "cpu" }, diff --git a/configs/regular/dbscan.json b/configs/regular/dbscan.json index 71dcdc9b..1d0d732b 100644 --- a/configs/regular/dbscan.json +++ b/configs/regular/dbscan.json @@ -58,7 +58,7 @@ "TEMPLATES": { "sklearn dbscan": { "SETS": [ - "sklearn-ex[cpu,gpu] implementations", + "sklearn-ex[gpu] implementations", "common dbscan parameters", "sklearn dbscan parameters", "dbscan datasets" diff --git a/configs/regular/ensemble.json b/configs/regular/ensemble.json index 56e37e77..164cb236 100644 --- a/configs/regular/ensemble.json +++ b/configs/regular/ensemble.json @@ -90,7 +90,7 @@ "TEMPLATES": { "sklearn ensemble classification": { "SETS": [ - "sklearn-ex[cpu,gpu] implementations", + "sklearn-ex[gpu] implementations", "common ensemble params", "sklearn ensemble classifier params", "ensemble classification data" @@ -98,7 +98,7 @@ }, "sklearn ensemble regression": { "SETS": [ - "sklearn-ex[cpu,gpu] implementations", + "sklearn-ex[gpu] implementations", "common ensemble params", "sklearn ensemble regressor params", "ensemble regression data" diff --git a/configs/regular/kmeans.json b/configs/regular/kmeans.json index d4953615..8aba9055 100644 --- a/configs/regular/kmeans.json +++ b/configs/regular/kmeans.json @@ -70,7 +70,7 @@ "TEMPLATES": { "sklearn kmeans": { "SETS": [ - "sklearn-ex[preview] implementations", + "sklearn-ex[gpu] implementations", "common kmeans parameters", "sklearn kmeans parameters", "kmeans datasets" diff --git 
a/configs/regular/knn.json b/configs/regular/knn.json index e1cd8a75..bcbed117 100644 --- a/configs/regular/knn.json +++ b/configs/regular/knn.json @@ -74,36 +74,20 @@ "TEMPLATES": { "sklearn brute knn clsf": { "SETS": [ - "sklearn-ex[cpu,gpu] implementations", + "sklearn-ex[gpu] implementations", "common knn parameters", "sklearn knn parameters", "brute knn algorithm - classification data" ] }, - "sklearn kd_tree knn clsf": { - "SETS": [ - "sklearn-ex[cpu] implementations", - "common knn parameters", - "sklearn knn parameters", - "kd_tree knn algorithm - classification data" - ] - }, "sklearn brute knn regr": { "SETS": [ - "sklearn-ex[cpu,gpu] implementations", + "sklearn-ex[gpu] implementations", "common knn parameters", "sklearn knn parameters", "brute knn algorithm - regression data" ] }, - "sklearn kd_tree knn regr": { - "SETS": [ - "sklearn-ex[cpu] implementations", - "common knn parameters", - "sklearn knn parameters", - "kd_tree knn algorithm - regression data" - ] - }, "cuml brute knn clsf": { "SETS": [ "cuml implementation", diff --git a/configs/regular/linear_model.json b/configs/regular/linear_model.json index eb1b79ba..66667343 100644 --- a/configs/regular/linear_model.json +++ b/configs/regular/linear_model.json @@ -85,34 +85,12 @@ "TEMPLATES": { "sklearn linear": { "SETS": [ - "sklearn-ex[cpu,gpu] implementations", + "sklearn-ex[gpu] implementations", "common linear parameters", "sklearn linear parameters", "regression datasets" ] }, - "sklearn ridge": { - "SETS": [ - "sklearn-ex[cpu] implementations", - "common ridge parameters", - "sklearn ridge parameters", - "regression datasets" - ] - }, - "sklearn lasso": { - "SETS": [ - "sklearn-ex[cpu] implementations", - "common lasso parameters", - "regression datasets" - ] - }, - "sklearn elasticnet": { - "SETS": [ - "sklearn-ex[cpu] implementations", - "common elasticnet parameters", - "regression datasets" - ] - }, "cuml linear": { "SETS": [ "cuml implementation", diff --git a/configs/regular/logreg.json b/configs/regular/logreg.json index a94a7fcf..172ceb48 100644 --- a/configs/regular/logreg.json +++ b/configs/regular/logreg.json @@ -54,7 +54,7 @@ "TEMPLATES": { "sklearn logreg": { "SETS": [ - "sklearn-ex[cpu,gpu] implementations", + "sklearn-ex[gpu] implementations", "common logreg parameters", "sklearn logreg parameters", "logreg datasets" diff --git a/configs/regular/pca.json b/configs/regular/pca.json index 582acc9e..2300454d 100644 --- a/configs/regular/pca.json +++ b/configs/regular/pca.json @@ -46,7 +46,7 @@ "TEMPLATES": { "sklearn pca": { "SETS": [ - "sklearn-ex[cpu,gpu] implementations", + "sklearn-ex[gpu] implementations", "pca parameters", "pca datasets" ] From 7bb8fb486724192fc6410ccbd731ad16650563ad Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 23 Sep 2024 07:10:55 -0700 Subject: [PATCH 006/110] remove cuml --- configs/regular/dbscan.json | 8 -------- configs/regular/ensemble.json | 16 ---------------- configs/regular/kmeans.json | 8 -------- configs/regular/knn.json | 14 -------------- configs/regular/linear_model.json | 24 ------------------------ configs/regular/logreg.json | 8 -------- configs/regular/pca.json | 7 ------- 7 files changed, 85 deletions(-) diff --git a/configs/regular/dbscan.json b/configs/regular/dbscan.json index 1d0d732b..711c15cd 100644 --- a/configs/regular/dbscan.json +++ b/configs/regular/dbscan.json @@ -63,14 +63,6 @@ "sklearn dbscan parameters", "dbscan datasets" ] - }, - "cuml dbscan": { - "SETS": [ - "cuml implementation", - "common dbscan
parameters", - "dbscan datasets" - ] } } } diff --git a/configs/regular/ensemble.json b/configs/regular/ensemble.json index 164cb236..f01c1383 100644 --- a/configs/regular/ensemble.json +++ b/configs/regular/ensemble.json @@ -103,22 +103,6 @@ "sklearn ensemble regressor params", "ensemble regression data" ] - }, - "cuml ensemble classification": { - "SETS": [ - "cuml implementation", - "common ensemble params", - "cuml ensemble classifier params", - "ensemble classification data" - ] - }, - "cuml ensemble regression": { - "SETS": [ - "cuml implementation", - "common ensemble params", - "cuml ensemble regressor params", - "ensemble regression data" - ] } } } diff --git a/configs/regular/kmeans.json b/configs/regular/kmeans.json index 8aba9055..756e2bab 100644 --- a/configs/regular/kmeans.json +++ b/configs/regular/kmeans.json @@ -75,14 +75,6 @@ "sklearn kmeans parameters", "kmeans datasets" ] - }, - "cuml kmeans": { - "SETS": [ - "cuml implementation", - "common kmeans parameters", - "cuml kmeans parameters", - "kmeans datasets" - ] } } } diff --git a/configs/regular/knn.json b/configs/regular/knn.json index bcbed117..a69c6864 100644 --- a/configs/regular/knn.json +++ b/configs/regular/knn.json @@ -87,20 +87,6 @@ "sklearn knn parameters", "brute knn algorithm - regression data" ] - }, - "cuml brute knn clsf": { - "SETS": [ - "cuml implementation", - "common knn parameters", - "brute knn algorithm - classification data" - ] - }, - "cuml brute knn regr": { - "SETS": [ - "cuml implementation", - "common knn parameters", - "brute knn algorithm - regression data" - ] } } } diff --git a/configs/regular/linear_model.json b/configs/regular/linear_model.json index 66667343..3040c82d 100644 --- a/configs/regular/linear_model.json +++ b/configs/regular/linear_model.json @@ -98,30 +98,6 @@ "cuml L2 parameters", "regression datasets" ] - }, - "cuml ridge": { - "SETS": [ - "cuml implementation", - "common ridge parameters", - "cuml L2 parameters", - "regression datasets" - ] - }, - "cuml lasso": { - "SETS": [ - "cuml implementation", - "common lasso parameters", - "cuml L1 parameters", - "regression datasets" - ] - }, - "cuml elasticnet": { - "SETS": [ - "cuml implementation", - "common elasticnet parameters", - "cuml L1 parameters", - "regression datasets" - ] } } } diff --git a/configs/regular/logreg.json b/configs/regular/logreg.json index 172ceb48..a8323b02 100644 --- a/configs/regular/logreg.json +++ b/configs/regular/logreg.json @@ -59,14 +59,6 @@ "sklearn logreg parameters", "logreg datasets" ] - }, - "cuml logreg": { - "SETS": [ - "cuml implementation", - "common logreg parameters", - "cuml logreg parameters", - "logreg datasets" - ] } } } diff --git a/configs/regular/pca.json b/configs/regular/pca.json index 2300454d..e26d3f44 100644 --- a/configs/regular/pca.json +++ b/configs/regular/pca.json @@ -50,13 +50,6 @@ "pca parameters", "pca datasets" ] - }, - "cuml pca": { - "SETS": [ - "cuml implementation", - "pca parameters", - "pca datasets" - ] } } } From 535c1e49171eea712d04f28769c7ebf697e675f9 Mon Sep 17 00:00:00 2001 From: "Kruglov, Oleg" Date: Mon, 23 Sep 2024 10:57:42 -0700 Subject: [PATCH 007/110] Add incremental algorithms support --- configs/incremental.json | 99 ++++++++++++++++++++++++ sklbench/benchmarks/sklearn_estimator.py | 36 +++++++-- sklbench/report/implementation.py | 10 ++- test-configuration-linux.yml | 5 ++ test-configuration-win.yml | 4 + 5 files changed, 144 insertions(+), 10 deletions(-) create mode 100644 configs/incremental.json diff --git a/configs/incremental.json 
b/configs/incremental.json new file mode 100644 index 00000000..5f7a5477 --- /dev/null +++ b/configs/incremental.json @@ -0,0 +1,99 @@ +{ + "PARAMETERS_SETS": { + "common": {"bench": {"n_runs": 10, "time_limit": 60}}, + "covariance data": { + "data": [ + { + "source": "make_blobs", + "generation_kwargs": { + "centers": 1, + "n_samples": 1000, + "n_features": [16, 64] + }, + "split_kwargs": {"ignore": true} + } + ] + }, + "basic_statistics data": { + "data": { + "source": "make_blobs", + "generation_kwargs": { + "centers": 1, + "n_samples": 10000, + "n_features": [16, 64] + }, + "split_kwargs": {"ignore": true} + } + }, + "linear_regression data": { + "data": { + "source": "make_regression", + "split_kwargs": {"train_size": 0.2, "test_size": 0.8}, + "generation_kwargs": { + "n_samples": 5000, + "n_features": [40, 100], + "n_informative": 5, + "noise": 2.0 + } + } + }, + "pca data": { + "data": { + "source": "make_blobs", + "generation_kwargs": { + "centers": 1, + "n_samples": 1000, + "n_features": [16, 64] + }, + "split_kwargs": {"ignore": true} + } + }, + "covariance": { + "algorithm": [ + { + "estimator": "IncrementalEmpiricalCovariance", + "library": "sklearnex.covariance", + "estimator_methods": {"training": "partial_fit"}, + "num_batches": {"training": 2} + } + ] + }, + "basic_statistics": { + "algorithm": [ + { + "estimator": "IncrementalBasicStatistics", + "library": "sklearnex.basic_statistics", + "num_batches": {"training": 2} + } + ] + }, + "linear_regression": { + "algorithm": [ + { + "estimator": "IncrementalLinearRegression", + "library": "sklearnex.linear_model", + "num_batches": {"training": 2} + } + ] + }, + "pca": { + "algorithm": [ + { + "estimator": "IncrementalPCA", + "library": "sklearnex.preview.decomposition", + "num_batches": {"training": 2} + } + ] + } + }, + "TEMPLATES": { + "covariance": {"SETS": ["common", "covariance", "covariance data"]}, + "basic_statistics": { + "SETS": ["common", "basic_statistics", "basic_statistics data"] + }, + "linear_regression": { + "SETS": ["common", "linear_regression", "linear_regression data"] + }, + "pca": {"SETS": ["common", "pca", "pca data"]} + } +} diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index f9c0a75e..4cdde86d 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -74,7 +74,7 @@ def get_estimator(library_name: str, estimator_name: str): def get_estimator_methods(bench_case: BenchCase) -> Dict[str, List[str]]: # default estimator methods estimator_methods = { - "training": ["fit"], + "training": ["partial_fit", "fit"], "inference": ["predict", "predict_proba", "transform"], } for stage in estimator_methods.keys(): @@ -334,7 +334,9 @@ def verify_patching(stream: io.StringIO, function_name) -> bool: return acceleration_lines > 0 and fallback_lines == 0 -def create_online_function(method_instance, data_args, batch_size): +def create_online_function( + estimator_instance, method_instance, data_args, num_batches, batch_size +): n_batches = data_args[0].shape[0] // batch_size if "y" in list(inspect.signature(method_instance).parameters): @@ -345,6 +347,7 @@ def ndarray_function(x, y): x[i * batch_size : (i + 1) * batch_size], y[i * batch_size : (i + 1) * batch_size], ) + estimator_instance._onedal_finalize_fit() def dataframe_function(x, y): for i in range(n_batches): @@ -352,16 +355,19 @@ def dataframe_function(x, y): x.iloc[i * batch_size : (i + 1) * batch_size], y.iloc[i * batch_size : (i + 1) * batch_size], ) + 
estimator_instance._onedal_finalize_fit() else: def ndarray_function(x): for i in range(n_batches): method_instance(x[i * batch_size : (i + 1) * batch_size]) + estimator_instance._onedal_finalize_fit() def dataframe_function(x): for i in range(n_batches): method_instance(x.iloc[i * batch_size : (i + 1) * batch_size]) + estimator_instance._onedal_finalize_fit() if "ndarray" in str(type(data_args[0])): return ndarray_function @@ -414,12 +420,28 @@ def measure_sklearn_estimator( data_args = (x_train,) else: data_args = (x_test,) - batch_size = get_bench_case_value( - bench_case, f"algorithm:batch_size:{stage}" - ) - if batch_size is not None: + + if method == "partial_fit": + num_batches = get_bench_case_value(bench_case, "data:num_batches") + batch_size = get_bench_case_value(bench_case, "data:batch_size") + + if batch_size is None: + if num_batches is None: + num_batches = 5 + batch_size = ( + data_args[0].shape[0] + num_batches - 1 + ) // num_batches + if num_batches is None: + num_batches = ( + data_args[0].shape[0] + batch_size - 1 + ) // batch_size + method_instance = create_online_function( - method_instance, data_args, batch_size + estimator_instance, + method_instance, + data_args, + num_batches, + batch_size, ) # daal4py model builders enabling branch if enable_modelbuilders and stage == "inference": diff --git a/sklbench/report/implementation.py b/sklbench/report/implementation.py index b577ab55..df15b5eb 100644 --- a/sklbench/report/implementation.py +++ b/sklbench/report/implementation.py @@ -16,7 +16,7 @@ import argparse import json -from typing import Dict, List +from typing import Dict, Hashable, List import openpyxl as xl import pandas as pd @@ -239,6 +239,7 @@ def get_result_tables_as_df( bench_cases = pd.DataFrame( [flatten_dict(bench_case) for bench_case in results["bench_cases"]] ) + bench_cases = bench_cases.map(lambda x: str(x) if not isinstance(x, Hashable) else x) if compatibility_mode: bench_cases = transform_results_to_compatible(bench_cases) @@ -248,7 +249,7 @@ def get_result_tables_as_df( bench_cases.drop(columns=[column], inplace=True) diffby_columns.remove(column) - return split_df_by_columns(bench_cases, splitby_columns) + return split_df_by_columns(bench_cases, splitby_columns, False) def get_summary_from_df(df: pd.DataFrame, df_name: str) -> pd.DataFrame: @@ -258,7 +259,10 @@ def get_summary_from_df(df: pd.DataFrame, df_name: str) -> pd.DataFrame: # only relative improvements are included in summary currently if len(column) > 1 and column[1] == f"{metric_name} relative improvement": metric_columns.append(column) - summary = df[metric_columns].aggregate(geomean_wrapper, axis=0).to_frame().T + if metric_columns: + summary = df[metric_columns].aggregate(geomean_wrapper, axis=0).to_frame().T + else: + summary = pd.DataFrame() summary.index = pd.Index([df_name]) return summary diff --git a/test-configuration-linux.yml b/test-configuration-linux.yml index a37769ce..722d1008 100644 --- a/test-configuration-linux.yml +++ b/test-configuration-linux.yml @@ -45,6 +45,11 @@ steps: conda activate bench-env python -m sklbench --report -l DEBUG --report -c configs/sklearn_example.json displayName: Sklearn example run + - script: | + source /usr/share/miniconda/etc/profile.d/conda.sh + conda activate bench-env + python -m sklbench --report -l DEBUG --report -c configs/incremental.json + displayName: Incremental algorithms example run - script: | source /usr/share/miniconda/etc/profile.d/conda.sh conda activate bench-env diff --git a/test-configuration-win.yml 
b/test-configuration-win.yml index a1eddaeb..82c3152a 100644 --- a/test-configuration-win.yml +++ b/test-configuration-win.yml @@ -43,6 +43,10 @@ steps: call activate bench-env python -m sklbench --report -l DEBUG --report -c configs/sklearn_example.json displayName: Sklearn example run + - script: | + call activate bench-env + python -m sklbench --report -l DEBUG --report -c configs/incremental.json + displayName: Incremental algorithms example run - script: | call activate bench-env python -m sklbench --report -l DEBUG --report -c configs/xgboost_example.json From d6952ac74715dcb0910626f9e5dce1c2eb1a3827 Mon Sep 17 00:00:00 2001 From: "Kruglov, Oleg" Date: Mon, 23 Sep 2024 11:49:37 -0700 Subject: [PATCH 008/110] Fix win yml --- test-configuration-win.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test-configuration-win.yml b/test-configuration-win.yml index 82c3152a..f3ac1595 100644 --- a/test-configuration-win.yml +++ b/test-configuration-win.yml @@ -43,7 +43,7 @@ steps: call activate bench-env python -m sklbench --report -l DEBUG --report -c configs/sklearn_example.json displayName: Sklearn example run - - script: | + - script: | call activate bench-env python -m sklbench --report -l DEBUG --report -c configs/incremental.json displayName: Incremental algorithms example run From 9cf382ee7b58bd68d6d826f17d3cf8adb7e493eb Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 23 Sep 2024 11:52:03 -0700 Subject: [PATCH 009/110] refactor and kmeans strong --- configs/spmd/large_scale/basic_stats.json | 2 +- .../spmd/large_scale/basic_stats_strong.json | 2 +- configs/spmd/large_scale/covariance.json | 2 +- .../spmd/large_scale/covariance_strong.json | 2 +- configs/spmd/large_scale/dbscan.json | 2 +- configs/spmd/large_scale/forest.json | 2 +- configs/spmd/large_scale/forest_strong.json | 2 +- configs/spmd/large_scale/kmeans_strong.json | 31 +++++++++++++++++++ configs/spmd/large_scale/knn.json | 2 +- configs/spmd/large_scale/linear_model.json | 2 +- .../spmd/large_scale/linear_model_strong.json | 2 +- configs/spmd/large_scale/logreg.json | 4 +-- configs/spmd/large_scale/logreg_strong.json | 4 +-- configs/spmd/large_scale/pca.json | 4 +-- configs/spmd/large_scale/pca_strong.json | 4 +-- 15 files changed, 49 insertions(+), 18 deletions(-) create mode 100644 configs/spmd/large_scale/kmeans_strong.json diff --git a/configs/spmd/large_scale/basic_stats.json b/configs/spmd/large_scale/basic_stats.json index 9ac4725f..b484b647 100644 --- a/configs/spmd/large_scale/basic_stats.json +++ b/configs/spmd/large_scale/basic_stats.json @@ -22,7 +22,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale 2k parameters", - "synthetic data", + "synthetic data", "spmd basicstats parameters" ] } diff --git a/configs/spmd/large_scale/basic_stats_strong.json b/configs/spmd/large_scale/basic_stats_strong.json index b7aa22cb..6527d8e5 100644 --- a/configs/spmd/large_scale/basic_stats_strong.json +++ b/configs/spmd/large_scale/basic_stats_strong.json @@ -21,7 +21,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale strong 2k parameters", - "synthetic data", + "synthetic data", "spmd basicstats parameters" ] } diff --git a/configs/spmd/large_scale/covariance.json b/configs/spmd/large_scale/covariance.json index 260befd0..e4d0477a 100644 --- a/configs/spmd/large_scale/covariance.json +++ b/configs/spmd/large_scale/covariance.json @@ -22,7 +22,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale 2k parameters", - "synthetic data", + "synthetic data", "spmd basicstats 
parameters" ] } diff --git a/configs/spmd/large_scale/covariance_strong.json b/configs/spmd/large_scale/covariance_strong.json index 568b4a8f..2b9c5dd0 100644 --- a/configs/spmd/large_scale/covariance_strong.json +++ b/configs/spmd/large_scale/covariance_strong.json @@ -21,7 +21,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale strong 2k parameters", - "synthetic data", + "synthetic data", "spmd basicstats parameters" ] } diff --git a/configs/spmd/large_scale/dbscan.json b/configs/spmd/large_scale/dbscan.json index c46287d8..b17e2cd8 100644 --- a/configs/spmd/large_scale/dbscan.json +++ b/configs/spmd/large_scale/dbscan.json @@ -24,7 +24,7 @@ "common dbscan parameters", "synthetic dataset", "sklearnex spmd implementation", - "large scale default parameters", + "large scale default parameters", "spmd dbscan parameters" ] } diff --git a/configs/spmd/large_scale/forest.json b/configs/spmd/large_scale/forest.json index ee614ed3..ea6f3ef7 100644 --- a/configs/spmd/large_scale/forest.json +++ b/configs/spmd/large_scale/forest.json @@ -18,7 +18,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale 2k parameters", - "synthetic data", + "synthetic data", "spmd forest classification parameters" ] } diff --git a/configs/spmd/large_scale/forest_strong.json b/configs/spmd/large_scale/forest_strong.json index 121aa916..0f1ef40e 100644 --- a/configs/spmd/large_scale/forest_strong.json +++ b/configs/spmd/large_scale/forest_strong.json @@ -17,7 +17,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale strong 2k parameters", - "synthetic data", + "synthetic data", "spmd forest classification parameters" ] } diff --git a/configs/spmd/large_scale/kmeans_strong.json b/configs/spmd/large_scale/kmeans_strong.json new file mode 100644 index 00000000..29cfc2e7 --- /dev/null +++ b/configs/spmd/large_scale/kmeans_strong.json @@ -0,0 +1,31 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/kmeans.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd kmeans parameters": { + "algorithm": { + "estimator": "KMeans", + "estimator_params": { + "algorithm": "lloyd" + }, + "estimator_methods": { "training": "fit", "inference": "predict" } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 30000, "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 100, "centers": 100 }, "algorithm": { "n_clusters": 100, "max_iter": 100 } } + ] + } + }, + "TEMPLATES": { + "kmeans": { + "SETS": [ + "synthetic data", + "sklearnex spmd implementation", + "large scale strong 2k parameters", + "spmd kmeans parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/knn.json b/configs/spmd/large_scale/knn.json index 1ef849f1..8dd39f61 100644 --- a/configs/spmd/large_scale/knn.json +++ b/configs/spmd/large_scale/knn.json @@ -32,7 +32,7 @@ "common knn parameters", "synthetic classification data", "sklearnex spmd implementation", - "large scale 2k parameters", + "large scale 2k parameters", "spmd knn cls parameters" ] } diff --git a/configs/spmd/large_scale/linear_model.json b/configs/spmd/large_scale/linear_model.json index aeda4441..e4bb14a1 100644 --- a/configs/spmd/large_scale/linear_model.json +++ b/configs/spmd/large_scale/linear_model.json @@ -19,7 +19,7 @@ 
"SETS": [ "sklearnex spmd implementation", "large scale 2k parameters", - "synthetic data", + "synthetic data", "spmd linear parameters" ] } diff --git a/configs/spmd/large_scale/linear_model_strong.json b/configs/spmd/large_scale/linear_model_strong.json index 77a9c79e..9d8c3533 100644 --- a/configs/spmd/large_scale/linear_model_strong.json +++ b/configs/spmd/large_scale/linear_model_strong.json @@ -18,7 +18,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale strong 2k parameters", - "synthetic data", + "synthetic data", "spmd linear parameters" ] } diff --git a/configs/spmd/large_scale/logreg.json b/configs/spmd/large_scale/logreg.json index c5ef6203..ccef906b 100644 --- a/configs/spmd/large_scale/logreg.json +++ b/configs/spmd/large_scale/logreg.json @@ -21,8 +21,8 @@ "sklearnex spmd implementation", "large scale 2k parameters", "spmd logreg parameters", - "synthetic data", - "spmd logreg2 parameters" + "synthetic data", + "spmd logreg2 parameters" ] } } diff --git a/configs/spmd/large_scale/logreg_strong.json b/configs/spmd/large_scale/logreg_strong.json index 2bf1c0f9..a6efd969 100644 --- a/configs/spmd/large_scale/logreg_strong.json +++ b/configs/spmd/large_scale/logreg_strong.json @@ -20,8 +20,8 @@ "sklearnex spmd implementation", "large scale strong 2k parameters", "spmd logreg parameters", - "synthetic data", - "spmd logreg2 parameters" + "synthetic data", + "spmd logreg2 parameters" ] } } diff --git a/configs/spmd/large_scale/pca.json b/configs/spmd/large_scale/pca.json index 9a6a6b02..3b9da126 100644 --- a/configs/spmd/large_scale/pca.json +++ b/configs/spmd/large_scale/pca.json @@ -20,10 +20,10 @@ "TEMPLATES": { "linreg": { "SETS": [ - "sklearnex spmd implementation", + "sklearnex spmd implementation", "large scale 2k parameters", "synthetic data", - "spmd pca parameters" + "spmd pca parameters" ] } } diff --git a/configs/spmd/large_scale/pca_strong.json b/configs/spmd/large_scale/pca_strong.json index adee3c79..2d302340 100644 --- a/configs/spmd/large_scale/pca_strong.json +++ b/configs/spmd/large_scale/pca_strong.json @@ -19,10 +19,10 @@ "TEMPLATES": { "linreg": { "SETS": [ - "sklearnex spmd implementation", + "sklearnex spmd implementation", "large scale strong 2k parameters", "synthetic data", - "spmd pca parameters" + "spmd pca parameters" ] } } From 6c8f529fd74f6f0bad1c36d6d7a8878e168624cb Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 23 Sep 2024 12:07:49 -0700 Subject: [PATCH 010/110] refactor and add config --- configs/spmd/large_scale/kmeans_strong_2.json | 31 ++++++++++++++++ configs/spmd/large_scale/large_scale.json | 36 ++++++++++++++----- configs/spmd/large_scale/logreg_2.json | 29 +++++++++++++++ configs/spmd/large_scale/logreg_strong_2.json | 28 +++++++++++++++ 4 files changed, 115 insertions(+), 9 deletions(-) create mode 100644 configs/spmd/large_scale/kmeans_strong_2.json create mode 100644 configs/spmd/large_scale/logreg_2.json create mode 100644 configs/spmd/large_scale/logreg_strong_2.json diff --git a/configs/spmd/large_scale/kmeans_strong_2.json b/configs/spmd/large_scale/kmeans_strong_2.json new file mode 100644 index 00000000..03f2bc59 --- /dev/null +++ b/configs/spmd/large_scale/kmeans_strong_2.json @@ -0,0 +1,31 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/kmeans.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd kmeans parameters": { + "algorithm": { + "estimator": "KMeans", + "estimator_params": { + "algorithm": "lloyd" + }, + "estimator_methods": { "training": "fit", "inference": "predict" } + } + }, 
+ "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 30000, "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 100, "centers": 100 }, "algorithm": { "n_clusters": 100, "max_iter": 100 } } + ] + } + }, + "TEMPLATES": { + "kmeans": { + "SETS": [ + "synthetic data", + "sklearnex spmd implementation", + "large scale strong two nodes parameters", + "spmd kmeans parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json index 72b808fe..1cde18f6 100644 --- a/configs/spmd/large_scale/large_scale.json +++ b/configs/spmd/large_scale/large_scale.json @@ -3,13 +3,13 @@ "large scale default parameters": { "data": { "dtype": "float64", - "distributed_split": "None" + "distributed_split": "None" }, "bench": { "mpi_params": {"n": [1,2,6,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, - "large scale strong parameters": { + "large scale strong parameters": { "data": { "dtype": "float64", "distributed_split": "rank_based" @@ -18,7 +18,7 @@ "mpi_params": {"n": [1,2,6,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, - "large scale 2k parameters": { + "large scale 2k parameters": { "data": { "dtype": "float64", "distributed_split": "None" @@ -27,6 +27,15 @@ "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, + "large scale two nodes parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [24], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, "large scale strong 2k parameters": { "data": { "dtype": "float64", @@ -36,14 +45,23 @@ "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, - "large scale impi parameters": { - "data": { - "dtype": "float64", + "large scale strong two nodes parameters": { + "data": { + "dtype": "float64", + "distributed_split": "rank_based" + }, + "bench": { + "mpi_params": {"n": [24], "ppn": 12, "-hostfile": "", 
"-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale impi parameters": { + "data": { + "dtype": "float64", "distributed_split": "no" }, "bench": { - "mpi_params": {"n": [1,2,4,6,9,12], "ppn": 12} - } - } + "mpi_params": {"n": [1,2,4,6,9,12], "ppn": 12} + } + } } } diff --git a/configs/spmd/large_scale/logreg_2.json b/configs/spmd/large_scale/logreg_2.json new file mode 100644 index 00000000..d18b2293 --- /dev/null +++ b/configs/spmd/large_scale/logreg_2.json @@ -0,0 +1,29 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/logreg.json", "../logreg.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd logreg2 parameters": { + "algorithm":{ + "estimator": "LogisticRegression", + "estimator_methods": { "inference": "predict" }, + "estimator_params": { "max_iter": 20 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 10, "n_classes": 2 } }, + { "source": "make_classification", "split_kwargs": { "train_size": 100000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 101000, "n_features": 1000, "n_classes": 2 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale two nodes parameters", + "spmd logreg parameters", + "synthetic data", + "spmd logreg2 parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/logreg_strong_2.json b/configs/spmd/large_scale/logreg_strong_2.json new file mode 100644 index 00000000..1a940d90 --- /dev/null +++ b/configs/spmd/large_scale/logreg_strong_2.json @@ -0,0 +1,28 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/logreg.json", "../logreg.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd logreg2 parameters": { + "algorithm":{ + "estimator": "LogisticRegression", + "estimator_methods": { "inference": "predict" }, + "estimator_params": { "max_iter": 30 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_classification", "split_kwargs": { "train_size": 10000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 10001000, "n_features": 100, "n_classes": 2 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale strong two nodes parameters", + "spmd logreg parameters", + "synthetic data", + "spmd logreg2 parameters" + ] + } + } +} From 3867a8607e33ed6378055860bb986a09b123638f Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 23 Sep 2024 12:48:53 -0700 Subject: [PATCH 011/110] strong reduce nodes --- configs/spmd/large_scale/large_scale.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json index 1cde18f6..bf99dd5c 100644 --- a/configs/spmd/large_scale/large_scale.json +++ b/configs/spmd/large_scale/large_scale.json @@ -42,7 +42,7 @@ "distributed_split": "rank_based" }, "bench": { - "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + "mpi_params": {"n": 
[1,2,6,12,24,48,96,192,384,768], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, "large scale strong two nodes parameters": { From ed875b4f68207e0c745751ef75853a4e1b6a60bf Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 23 Sep 2024 14:00:02 -0700 Subject: [PATCH 012/110] forest reg config --- configs/spmd/large_scale/forest.json | 2 +- configs/spmd/large_scale/forest_strong.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/spmd/large_scale/forest.json b/configs/spmd/large_scale/forest.json index ea6f3ef7..5aa3d36f 100644 --- a/configs/spmd/large_scale/forest.json +++ b/configs/spmd/large_scale/forest.json @@ -14,7 +14,7 @@ } }, "TEMPLATES": { - "basicstats": { + "forestCls": { "SETS": [ "sklearnex spmd implementation", "large scale 2k parameters", diff --git a/configs/spmd/large_scale/forest_strong.json b/configs/spmd/large_scale/forest_strong.json index 0f1ef40e..14690846 100644 --- a/configs/spmd/large_scale/forest_strong.json +++ b/configs/spmd/large_scale/forest_strong.json @@ -13,7 +13,7 @@ } }, "TEMPLATES": { - "basicstats": { + "forestCls": { "SETS": [ "sklearnex spmd implementation", "large scale strong 2k parameters", From c596a56cc5ba48557368610e74481bf1b8e00b96 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 23 Sep 2024 14:00:12 -0700 Subject: [PATCH 013/110] forest reg config --- configs/spmd/large_scale/forest_reg.json | 25 +++++++++++++++++++ .../spmd/large_scale/forest_strong_reg.json | 25 +++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 configs/spmd/large_scale/forest_reg.json create mode 100644 configs/spmd/large_scale/forest_strong_reg.json diff --git a/configs/spmd/large_scale/forest_reg.json b/configs/spmd/large_scale/forest_reg.json new file mode 100644 index 00000000..ab2a6920 --- /dev/null +++ b/configs/spmd/large_scale/forest_reg.json @@ -0,0 +1,25 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd forest regression parameters": { + "algorithm": { + "estimator": "RandomForestRegressor" + } + }, + "synthetic data": { + "data": [ + { "source": "make_regression", "generation_kwargs": { "n_samples": 10000000, "n_features": 100, "noise": 1.25 }, "split_kwargs": { "train_size": 10000000, "test_size": 5000 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } }} + ] + } + }, + "TEMPLATES": { + "forestReg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale 2k parameters", + "synthetic data", + "spmd forest regression parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/forest_strong_reg.json b/configs/spmd/large_scale/forest_strong_reg.json new file mode 100644 index 00000000..71afeee6 --- /dev/null +++ b/configs/spmd/large_scale/forest_strong_reg.json @@ -0,0 +1,25 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd forest regression parameters": { + "algorithm": { + "estimator": "RandomForestRegressor" + } + }, + "synthetic data": { + "data": [ + { "source": "make_regression", "generation_kwargs": { "n_samples": 10000000, "n_features": 100, "noise": 1.25 }, "split_kwargs": { "train_size": 10000000, "test_size": 5000 }, "algorithm": { "estimator_params": { "n_estimators": 20, 
"max_depth": 4 } }} + ] + } + }, + "TEMPLATES": { + "forestReg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale strong 2k parameters", + "synthetic data", + "spmd forest regression parameters" + ] + } + } +} From 4fee9911538d6a7794d89ece46bc0bea6b5bdf44 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 23 Sep 2024 14:11:50 -0700 Subject: [PATCH 014/110] KNN weak --- configs/spmd/large_scale/knn.json | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/configs/spmd/large_scale/knn.json b/configs/spmd/large_scale/knn.json index 8dd39f61..a7672ef5 100644 --- a/configs/spmd/large_scale/knn.json +++ b/configs/spmd/large_scale/knn.json @@ -19,10 +19,8 @@ }, "synthetic classification data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, - { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, - { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, - { "source": "make_classification", "split_kwargs": { "train_size": 200000, "test_size": 200000 }, "generation_kwargs": { "n_samples": 400000, "n_features": 10, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, + { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } ] } }, From fce0651d81adab57fe2f1b95a85fb0f9e37d246d Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 23 Sep 2024 14:15:00 -0700 Subject: [PATCH 015/110] KNN strong --- configs/spmd/large_scale/knn_strong.json | 38 ++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 configs/spmd/large_scale/knn_strong.json diff --git a/configs/spmd/large_scale/knn_strong.json b/configs/spmd/large_scale/knn_strong.json new file mode 100644 index 00000000..15cc0226 --- /dev/null +++ b/configs/spmd/large_scale/knn_strong.json @@ -0,0 +1,38 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/knn.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd knn cls parameters": { + "algorithm": { + "estimator": "KNeighborsClassifier", + "estimator_params": { + "algorithm": "brute", + "metric": "minkowski", + "p": 2, + "weights": "uniform", + "n_neighbors": 5 + }, + "estimator_methods": { + "training": "fit", + "inference": "predict" + } + } + }, + "synthetic classification data": { + "data": [ + { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, + { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + ] + } + 
}, + "TEMPLATES": { + "knn classifier": { + "SETS": [ + "common knn parameters", + "synthetic classification data", + "sklearnex spmd implementation", + "large scale strong 2k parameters", + "spmd knn cls parameters" + ] + } + } +} From e1ff9a0ab0e86ed500059e37c7aca6e6ed2d2b29 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 23 Sep 2024 23:07:52 -0700 Subject: [PATCH 016/110] experiment with ppn --- configs/spmd/large_scale/large_scale.json | 18 ++++++++++++++ configs/spmd/large_scale/pca_single.json | 30 +++++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 configs/spmd/large_scale/pca_single.json diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json index bf99dd5c..fcddc722 100644 --- a/configs/spmd/large_scale/large_scale.json +++ b/configs/spmd/large_scale/large_scale.json @@ -18,6 +18,24 @@ "mpi_params": {"n": [1,2,6,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, + "large scale one node parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12], "ppn": [1,2,6,12], "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale strong one node parameters": { + "data": { + "dtype": "float64", + "distributed_split": "rank_based" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12], "ppn": [1,2,6,12], "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, "large scale 2k parameters": { "data": { "dtype": "float64", diff --git a/configs/spmd/large_scale/pca_single.json b/configs/spmd/large_scale/pca_single.json new file mode 100644 index 00000000..61b2cf15 --- /dev/null +++ b/configs/spmd/large_scale/pca_single.json @@ -0,0 +1,30 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/pca.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd pca parameters": { + "algorithm": { + "estimator": "PCA", + "estimator_methods": { "training": "fit", "inference": "" } + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale one node parameters", + "synthetic data", + "spmd pca parameters" + ] + } + } +} From e3d9a35a79869f6f6efa074c0a16f8b0948e85cc Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 23 Sep 2024 23:20:26 -0700 Subject: [PATCH 017/110] experiment with ppn --- configs/spmd/large_scale/large_scale.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json index fcddc722..cf81cbf0 100644 --- a/configs/spmd/large_scale/large_scale.json +++ b/configs/spmd/large_scale/large_scale.json @@ 
-24,7 +24,7 @@ "distributed_split": "None" }, "bench": { - "mpi_params": {"n": [1,2,6,12], "ppn": [1,2,6,12], "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + "mpi_params": {"n": [1,2,3,4,5,6,7,8,9,10,11,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, "large scale strong one node parameters": { @@ -33,7 +33,7 @@ "distributed_split": "rank_based" }, "bench": { - "mpi_params": {"n": [1,2,6,12], "ppn": [1,2,6,12], "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + "mpi_params": {"n": [1,2,3,4,5,6,7,8,9,10,11,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, "large scale 2k parameters": { From 817710b83d09d1c44b523e10e4365144cbf14113 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 24 Sep 2024 00:05:42 -0700 Subject: [PATCH 018/110] bf16 --- configs/regular/bf16/dbscan.json | 41 ++++++++++++++++++++ configs/regular/bf16/ensemble.json | 45 ++++++++++++++++++++++ configs/regular/bf16/kmeans.json | 40 +++++++++++++++++++ configs/regular/bf16/knn.json | 34 ++++++++++++++++ configs/regular/bf16/linear_model.json | 29 ++++++++++++++ configs/regular/bf16/logreg.json | 42 ++++++++++++++++++++ configs/regular/bf16/pca.json | 33 ++++++++++++++++ configs/spmd/large_scale/linear_model.json | 2 +- configs/spmd/large_scale/logreg.json | 2 +- 9 files changed, 266 insertions(+), 2 deletions(-) create mode 100644 configs/regular/bf16/dbscan.json create mode 100644 configs/regular/bf16/ensemble.json create mode 100644 configs/regular/bf16/kmeans.json create mode 100644 configs/regular/bf16/knn.json create mode 100644 configs/regular/bf16/linear_model.json create mode 100644 configs/regular/bf16/logreg.json create mode 100644 configs/regular/bf16/pca.json diff --git a/configs/regular/bf16/dbscan.json b/configs/regular/bf16/dbscan.json new file mode 100644 index 00000000..26e87ad6 --- /dev/null +++ b/configs/regular/bf16/dbscan.json @@ -0,0 +1,41 @@ +{ + "INCLUDE": ["../common/sklearn.json"], + "PARAMETERS_SETS": { + "common dbscan parameters": { + "algorithm": { + "estimator": "DBSCAN", + "estimator_params": { + "eps": "[SPECIAL_VALUE]distances_quantile:0.01", + "min_samples": 5, + "metric": "euclidean" + } + }, + "data": { + "dtype": ["float32"] + } + }, + "sklearn dbscan parameters": { + "algorithm": { + "estimator_params": { + "algorithm": "brute", + "n_jobs": "[SPECIAL_VALUE]physical_cpus" + } + } + }, + "synthetic dataset": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 10, "centers": 10 }, "algorithm": { "eps": 5, "min_samples": 5 } } + ] + } + }, + "TEMPLATES": { + "sklearn dbscan": { + "SETS": [ + "sklearn-ex[gpu] implementations", + "common dbscan parameters", + "sklearn dbscan parameters", + "synthetic dataset" + ] + } + } +} diff --git a/configs/regular/bf16/ensemble.json 
b/configs/regular/bf16/ensemble.json new file mode 100644 index 00000000..f883a7af --- /dev/null +++ b/configs/regular/bf16/ensemble.json @@ -0,0 +1,45 @@ +{ + "INCLUDE": ["../common/sklearn.json"], + "PARAMETERS_SETS": { + "common ensemble params": { + "algorithm": { + "estimator_params": { + "n_estimators": 200, + "max_depth": 16, + "max_samples": 1.0, + "min_samples_split": 5, + "min_samples_leaf": 2, + "min_impurity_decrease": 0.0, + "bootstrap": true, + "random_state": 42 + } + } + }, + "sklearn ensemble classifier params": { + "algorithm": { + "estimator": ["RandomForestClassifier", "ExtraTreesClassifier"], + "estimator_params": { + "criterion": "gini", + "max_features": "sqrt", + "max_leaf_nodes": null, + "n_jobs": "[SPECIAL_VALUE]physical_cpus" + } + } + }, + "synthetic data": { + "data": [ + { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 501000, "n_features": 10, "n_classes": 2 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } } } + ] + } + }, + "TEMPLATES": { + "sklearn ensemble classification": { + "SETS": [ + "sklearn-ex[gpu] implementations", + "common ensemble params", + "sklearn ensemble classifier params", + "synthetic data" + ] + } + } +} diff --git a/configs/regular/bf16/kmeans.json b/configs/regular/bf16/kmeans.json new file mode 100644 index 00000000..1141e641 --- /dev/null +++ b/configs/regular/bf16/kmeans.json @@ -0,0 +1,40 @@ +{ + "INCLUDE": ["../common/sklearn.json"], + "PARAMETERS_SETS": { + "common kmeans parameters": { + "algorithm": { + "estimator": "KMeans", + "estimator_params": { + "n_clusters": "[SPECIAL_VALUE]auto", + "n_init": 1, + "max_iter": 30, + "tol": 1e-3, + "random_state": 42 + }, + "estimator_methods": { "inference": "predict" } + }, + "data": { + "dtype": ["float32", "float64"], + "preprocessing_kwargs": { "normalize": true } + } + }, + "sklearn kmeans parameters": { + "algorithm": { "estimator_params": { "init": "k-means++", "algorithm": "lloyd" } } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 100, "centers": 100 }, "algorithm": { "n_clusters": 100, "max_iter": 100 } } + ] + } + }, + "TEMPLATES": { + "sklearn kmeans": { + "SETS": [ + "sklearn-ex[gpu] implementations", + "common kmeans parameters", + "sklearn kmeans parameters", + "synthetic data" + ] + } + } +} diff --git a/configs/regular/bf16/knn.json b/configs/regular/bf16/knn.json new file mode 100644 index 00000000..e6bdcf4e --- /dev/null +++ b/configs/regular/bf16/knn.json @@ -0,0 +1,34 @@ +{ + "INCLUDE": ["../common/sklearn.json"], + "PARAMETERS_SETS": { + "common knn parameters": { + "algorithm": { + "estimator_params": { + "n_neighbors": [10, 100], + "weights": "uniform" + } + }, + "data": { + "preprocessing_kwargs": { "normalize": true } + } + }, + "sklearn knn parameters": { + "algorithm": { "estimator_params": { "n_jobs": "[SPECIAL_VALUE]physical_cpus" } } + }, + "synthetic classification data": { + "data": [ + { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + ] + } + }, + "TEMPLATES": { + "sklearn brute knn clsf": { + "SETS": [ + "sklearn-ex[gpu] implementations", + "common knn parameters", + "sklearn knn parameters", + "synthetic classification data" + ] + } + } +} diff --git a/configs/regular/bf16/linear_model.json 
b/configs/regular/bf16/linear_model.json new file mode 100644 index 00000000..528f8cca --- /dev/null +++ b/configs/regular/bf16/linear_model.json @@ -0,0 +1,29 @@ +{ + "INCLUDE": ["../common/sklearn.json"], + "PARAMETERS_SETS": { + "synthetic data": { + "data": [ + { "source": "make_regression", "generation_kwargs": { "n_samples": 3005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 3000000, "test_size": 5000 } } + ] + }, + "common linear parameters": { + "algorithm": { + "estimator": "LinearRegression", + "estimator_params": { "fit_intercept": true, "copy_X": true } + } + }, + "sklearn linear parameters": { + "estimator_params": { "n_jobs": "[SPECIAL_VALUE]physical_cpus" } + } + }, + "TEMPLATES": { + "sklearn linear": { + "SETS": [ + "sklearn-ex[gpu] implementations", + "common linear parameters", + "sklearn linear parameters", + "synthetic data" + ] + } + } +} diff --git a/configs/regular/bf16/logreg.json b/configs/regular/bf16/logreg.json new file mode 100644 index 00000000..0dd26e40 --- /dev/null +++ b/configs/regular/bf16/logreg.json @@ -0,0 +1,42 @@ +{ + "INCLUDE": ["../common/sklearn.json"], + "PARAMETERS_SETS": { + "common logreg parameters": { + "algorithm": { + "estimator": "LogisticRegression", + "estimator_methods": { "inference": "predict" }, + "estimator_params": { + "penalty": "l2", + "tol": 1e-4, + "C": 1.0, + "l1_ratio": null, + "max_iter": 200 + } + } + }, + "sklearn logreg parameters": { + "algorithm": { + "estimator_params": { + "solver": "lbfgs", + "n_jobs": "[SPECIAL_VALUE]physical_cpus", + "random_state": 42 + } + } + }, + "synthetic data": { + "data": [ + { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 10, "n_classes": 2 } } + ] + } + }, + "TEMPLATES": { + "sklearn logreg": { + "SETS": [ + "sklearn-ex[gpu] implementations", + "common logreg parameters", + "sklearn logreg parameters", + "synthetic data" + ] + } + } +} diff --git a/configs/regular/bf16/pca.json b/configs/regular/bf16/pca.json new file mode 100644 index 00000000..9295aea5 --- /dev/null +++ b/configs/regular/bf16/pca.json @@ -0,0 +1,33 @@ +{ + "INCLUDE": ["../common/sklearn.json"], + "PARAMETERS_SETS": { + "pca parameters": { + "algorithm": { + "estimator": "PCA", + "estimator_params": { + "n_components": 3, + "copy": true, + "whiten": false, + "svd_solver": "covariance_eigh", + "tol": 0.0, + "iterated_power": 15, + "random_state": 42 + } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "sklearn pca": { + "SETS": [ + "sklearn-ex[gpu] implementations", + "pca parameters", + "synthetic data" + ] + } + } +} diff --git a/configs/spmd/large_scale/linear_model.json b/configs/spmd/large_scale/linear_model.json index e4bb14a1..e208da7d 100644 --- a/configs/spmd/large_scale/linear_model.json +++ b/configs/spmd/large_scale/linear_model.json @@ -10,7 +10,7 @@ "synthetic data": { "data": [ { "source": "make_regression", "generation_kwargs": { "n_samples": 30005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 30000000, "test_size": 5000 } }, - { "source": "make_regression", "generation_kwargs": { "n_samples": 305000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 300000, "test_size": 5000 } } + { "source": "make_regression", "generation_kwargs": { "n_samples": 305000, "n_features": 1000, "noise": 1.25 }, 
"split_kwargs": { "train_size": 300000, "test_size": 5000 } } ] } }, diff --git a/configs/spmd/large_scale/logreg.json b/configs/spmd/large_scale/logreg.json index ccef906b..bbd18f3b 100644 --- a/configs/spmd/large_scale/logreg.json +++ b/configs/spmd/large_scale/logreg.json @@ -10,7 +10,7 @@ }, "synthetic data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 10, "n_classes": 2 } }, + { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 10, "n_classes": 2 } }, { "source": "make_classification", "split_kwargs": { "train_size": 100000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 101000, "n_features": 1000, "n_classes": 2 } } ] } From aaa00392bfb3e54b7d4fa4f14a12bf8ef8f5fd66 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 24 Sep 2024 00:21:53 -0700 Subject: [PATCH 019/110] bf16 --- configs/regular/bf16/dbscan.json | 2 +- configs/regular/bf16/ensemble.json | 2 +- configs/regular/bf16/kmeans.json | 2 +- configs/regular/bf16/knn.json | 2 +- configs/regular/bf16/linear_model.json | 2 +- configs/regular/bf16/logreg.json | 2 +- configs/regular/bf16/pca.json | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/configs/regular/bf16/dbscan.json b/configs/regular/bf16/dbscan.json index 26e87ad6..b91120e8 100644 --- a/configs/regular/bf16/dbscan.json +++ b/configs/regular/bf16/dbscan.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../common/sklearn.json"], + "INCLUDE": ["../../common/sklearn.json"], "PARAMETERS_SETS": { "common dbscan parameters": { "algorithm": { diff --git a/configs/regular/bf16/ensemble.json b/configs/regular/bf16/ensemble.json index f883a7af..d383bcac 100644 --- a/configs/regular/bf16/ensemble.json +++ b/configs/regular/bf16/ensemble.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../common/sklearn.json"], + "INCLUDE": ["../../common/sklearn.json"], "PARAMETERS_SETS": { "common ensemble params": { "algorithm": { diff --git a/configs/regular/bf16/kmeans.json b/configs/regular/bf16/kmeans.json index 1141e641..084ae8f4 100644 --- a/configs/regular/bf16/kmeans.json +++ b/configs/regular/bf16/kmeans.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../common/sklearn.json"], + "INCLUDE": ["../../common/sklearn.json"], "PARAMETERS_SETS": { "common kmeans parameters": { "algorithm": { diff --git a/configs/regular/bf16/knn.json b/configs/regular/bf16/knn.json index e6bdcf4e..1a62ef89 100644 --- a/configs/regular/bf16/knn.json +++ b/configs/regular/bf16/knn.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../common/sklearn.json"], + "INCLUDE": ["../../common/sklearn.json"], "PARAMETERS_SETS": { "common knn parameters": { "algorithm": { diff --git a/configs/regular/bf16/linear_model.json b/configs/regular/bf16/linear_model.json index 528f8cca..7149e490 100644 --- a/configs/regular/bf16/linear_model.json +++ b/configs/regular/bf16/linear_model.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../common/sklearn.json"], + "INCLUDE": ["../../common/sklearn.json"], "PARAMETERS_SETS": { "synthetic data": { "data": [ diff --git a/configs/regular/bf16/logreg.json b/configs/regular/bf16/logreg.json index 0dd26e40..cde74c25 100644 --- a/configs/regular/bf16/logreg.json +++ b/configs/regular/bf16/logreg.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../common/sklearn.json"], + "INCLUDE": ["../../common/sklearn.json"], "PARAMETERS_SETS": { "common logreg parameters": { "algorithm": { diff --git 
a/configs/regular/bf16/pca.json b/configs/regular/bf16/pca.json index 9295aea5..945c2939 100644 --- a/configs/regular/bf16/pca.json +++ b/configs/regular/bf16/pca.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../common/sklearn.json"], + "INCLUDE": ["../../common/sklearn.json"], "PARAMETERS_SETS": { "pca parameters": { "algorithm": { From 03a152a13c62eef3fa66b61109b76874d4e9b2b1 Mon Sep 17 00:00:00 2001 From: "Kruglov, Oleg" Date: Tue, 24 Sep 2024 02:46:36 -0700 Subject: [PATCH 020/110] Remove samples/ms info --- sklbench/benchmarks/sklearn_estimator.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index 4cdde86d..7e616273 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -458,10 +458,6 @@ def measure_sklearn_estimator( metrics[method]["time std[ms]"], _, ) = measure_case(bench_case, method_instance, *data_args) - if batch_size is not None: - metrics[method]["throughput[samples/ms]"] = ( - (data_args[0].shape[0] // batch_size) * batch_size - ) / metrics[method]["time[ms]"] if ensure_sklearnex_patching: full_method_name = f"{estimator_class.__name__}.{method}" sklearnex_logging_stream.seek(0) From b7d962e0c51d815a5f93da4c792ec8a8f1f9e4ce Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 24 Sep 2024 06:32:47 -0700 Subject: [PATCH 021/110] knn --- configs/spmd/large_scale/knn.json | 4 ++-- configs/spmd/large_scale/knn_strong.json | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/spmd/large_scale/knn.json b/configs/spmd/large_scale/knn.json index a7672ef5..cfd096cf 100644 --- a/configs/spmd/large_scale/knn.json +++ b/configs/spmd/large_scale/knn.json @@ -19,8 +19,8 @@ }, "synthetic classification data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, - { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 5000 }, "generation_kwargs": { "n_samples": 55000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, + { "source": "make_classification", "split_kwargs": { "train_size": 5000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 55000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } ] } }, diff --git a/configs/spmd/large_scale/knn_strong.json b/configs/spmd/large_scale/knn_strong.json index 15cc0226..7682dc5e 100644 --- a/configs/spmd/large_scale/knn_strong.json +++ b/configs/spmd/large_scale/knn_strong.json @@ -19,8 +19,8 @@ }, "synthetic classification data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, - { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 5000 }, "generation_kwargs": { 
"n_samples": 505000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, + { "source": "make_classification", "split_kwargs": { "train_size": 5000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 500500, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } ] } }, From 3ac5c236eb6255892e607a6122d4d2187e4c5451 Mon Sep 17 00:00:00 2001 From: "Kruglov, Oleg" Date: Tue, 24 Sep 2024 06:45:42 -0700 Subject: [PATCH 022/110] Remove BS from config (need to add after pip version update) --- configs/incremental.json | 3 --- 1 file changed, 3 deletions(-) diff --git a/configs/incremental.json b/configs/incremental.json index 5f7a5477..c9ffb19c 100644 --- a/configs/incremental.json +++ b/configs/incremental.json @@ -88,9 +88,6 @@ }, "TEMPLATES": { "covariance": {"SETS": ["common", "covariance", "covariance data"]}, - "basic_statistics": { - "SETS": ["common", "basic_statistics", "basic_statistics data"] - }, "linear_regression": { "SETS": ["common", "linear_regression", "linear_regression data"] }, From 87b6fa6674f4b2e222c765e77a9f74b2bc786959 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 24 Sep 2024 22:32:38 -0700 Subject: [PATCH 023/110] basic stat single --- .../spmd/large_scale/basic_stats_single.json | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 configs/spmd/large_scale/basic_stats_single.json diff --git a/configs/spmd/large_scale/basic_stats_single.json b/configs/spmd/large_scale/basic_stats_single.json new file mode 100644 index 00000000..e106b2a9 --- /dev/null +++ b/configs/spmd/large_scale/basic_stats_single.json @@ -0,0 +1,30 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd basicstats parameters": { + "algorithm": { + "estimator": "BasicStatistics", + "estimator_methods": { "training": "fit" } + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "basicstats": { + "SETS": [ + "sklearnex spmd implementation", + "large scale one node parameters", + "synthetic data", + "spmd basicstats parameters" + ] + } + } +} From 9461fad69a00ecbf69a3e5fcef662fb1bafd4253 Mon Sep 17 00:00:00 2001 From: "Kruglov, Oleg" Date: Wed, 25 Sep 2024 02:00:29 -0700 Subject: [PATCH 024/110] Add condition for finalize --- sklbench/benchmarks/sklearn_estimator.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index 7e616273..52f5bf4e 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -347,7 +347,8 @@ def ndarray_function(x, y): x[i * batch_size : (i + 1) * batch_size], y[i * batch_size : (i + 1) * batch_size], ) - estimator_instance._onedal_finalize_fit() + if hasattr(estimator_instance, "_onedal_finalize_fit"): + estimator_instance._onedal_finalize_fit() def dataframe_function(x, y): for i in range(n_batches): @@ -355,19 +356,22 @@ def dataframe_function(x, y): x.iloc[i * batch_size : (i + 1) * batch_size], y.iloc[i * batch_size : (i + 1) * batch_size], ) - estimator_instance._onedal_finalize_fit() + if hasattr(estimator_instance, "_onedal_finalize_fit"): + 
estimator_instance._onedal_finalize_fit() else: def ndarray_function(x): for i in range(n_batches): method_instance(x[i * batch_size : (i + 1) * batch_size]) - estimator_instance._onedal_finalize_fit() + if hasattr(estimator_instance, "_onedal_finalize_fit"): + estimator_instance._onedal_finalize_fit() def dataframe_function(x): for i in range(n_batches): method_instance(x.iloc[i * batch_size : (i + 1) * batch_size]) - estimator_instance._onedal_finalize_fit() + if hasattr(estimator_instance, "_onedal_finalize_fit"): + estimator_instance._onedal_finalize_fit() if "ndarray" in str(type(data_args[0])): return ndarray_function From b82d772f26c1af7d261b78bf94ae97280c23c9e2 Mon Sep 17 00:00:00 2001 From: "Kruglov, Oleg" Date: Wed, 25 Sep 2024 09:51:39 -0700 Subject: [PATCH 025/110] Fix num_batches usage --- sklbench/benchmarks/sklearn_estimator.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index 52f5bf4e..3f8b1641 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -337,12 +337,11 @@ def verify_patching(stream: io.StringIO, function_name) -> bool: def create_online_function( estimator_instance, method_instance, data_args, num_batches, batch_size ): - n_batches = data_args[0].shape[0] // batch_size if "y" in list(inspect.signature(method_instance).parameters): def ndarray_function(x, y): - for i in range(n_batches): + for i in range(num_batches): method_instance( x[i * batch_size : (i + 1) * batch_size], y[i * batch_size : (i + 1) * batch_size], @@ -351,7 +350,7 @@ def ndarray_function(x, y): estimator_instance._onedal_finalize_fit() def dataframe_function(x, y): - for i in range(n_batches): + for i in range(num_batches): method_instance( x.iloc[i * batch_size : (i + 1) * batch_size], y.iloc[i * batch_size : (i + 1) * batch_size], @@ -362,13 +361,13 @@ def dataframe_function(x, y): else: def ndarray_function(x): - for i in range(n_batches): + for i in range(num_batches): method_instance(x[i * batch_size : (i + 1) * batch_size]) if hasattr(estimator_instance, "_onedal_finalize_fit"): estimator_instance._onedal_finalize_fit() def dataframe_function(x): - for i in range(n_batches): + for i in range(num_batches): method_instance(x.iloc[i * batch_size : (i + 1) * batch_size]) if hasattr(estimator_instance, "_onedal_finalize_fit"): estimator_instance._onedal_finalize_fit() From c70e1222a94d25c51e2239dff8430545383b7f56 Mon Sep 17 00:00:00 2001 From: ethanglaser Date: Fri, 30 Aug 2024 23:27:21 +0000 Subject: [PATCH 026/110] Creating branch for large scale measurements --- configs/spmd/large_scale/basic_stats.json | 30 +++++++++++++++ configs/spmd/large_scale/covariance.json | 30 +++++++++++++++ configs/spmd/large_scale/dbscan.json | 32 ++++++++++++++++ configs/spmd/large_scale/kmeans.json | 32 ++++++++++++++++ configs/spmd/large_scale/knn.json | 43 ++++++++++++++++++++++ configs/spmd/large_scale/large_scale.json | 31 ++++++++++++++++ configs/spmd/large_scale/linear_model.json | 27 ++++++++++++++ configs/spmd/large_scale/logreg.json | 29 +++++++++++++++ configs/spmd/large_scale/pca.json | 30 +++++++++++++++ sklbench/benchmarks/sklearn_estimator.py | 8 ++-- sklbench/datasets/transformer.py | 9 +++-- sklbench/runner/commands_helper.py | 3 ++ 12 files changed, 298 insertions(+), 6 deletions(-) create mode 100644 configs/spmd/large_scale/basic_stats.json create mode 100644 configs/spmd/large_scale/covariance.json create mode 100644 
configs/spmd/large_scale/dbscan.json create mode 100644 configs/spmd/large_scale/kmeans.json create mode 100644 configs/spmd/large_scale/knn.json create mode 100644 configs/spmd/large_scale/large_scale.json create mode 100644 configs/spmd/large_scale/linear_model.json create mode 100644 configs/spmd/large_scale/logreg.json create mode 100644 configs/spmd/large_scale/pca.json diff --git a/configs/spmd/large_scale/basic_stats.json b/configs/spmd/large_scale/basic_stats.json new file mode 100644 index 00000000..a9542017 --- /dev/null +++ b/configs/spmd/large_scale/basic_stats.json @@ -0,0 +1,30 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd basicstats parameters": { + "algorithm": { + "estimator": "BasicStatistics", + "estimator_methods": { "training": "compute" } + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "basicstats": { + "SETS": [ + "sklearnex spmd implementation", + "large scale default parameters", + "synthetic data", + "spmd basicstats parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/covariance.json b/configs/spmd/large_scale/covariance.json new file mode 100644 index 00000000..3280bf77 --- /dev/null +++ b/configs/spmd/large_scale/covariance.json @@ -0,0 +1,30 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd basicstats parameters": { + "algorithm": { + "estimator": "EmpiricalCovariance", + "estimator_methods": { "training": "fit" } + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "covariance": { + "SETS": [ + "sklearnex spmd implementation", + "large scale default parameters", + "synthetic data", + "spmd basicstats parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/dbscan.json b/configs/spmd/large_scale/dbscan.json new file mode 100644 index 00000000..c46287d8 --- /dev/null +++ b/configs/spmd/large_scale/dbscan.json @@ -0,0 +1,32 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/dbscan.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd dbscan parameters": { + "algorithm": { + "estimator": "DBSCAN", + "estimator_methods": { + "training": "fit" + } + }, + "data": { + "dtype": "float64" + } + }, + "synthetic dataset": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 10, "centers": 10 }, "algorithm": { "eps": 5, "min_samples": 5 } } + ] + } + }, + "TEMPLATES": { + "dbscan": { + "SETS": [ + "common dbscan parameters", + "synthetic dataset", + "sklearnex spmd implementation", + "large scale default parameters", + "spmd dbscan parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/kmeans.json b/configs/spmd/large_scale/kmeans.json new file mode 100644 index 00000000..3b490f14 --- /dev/null +++ b/configs/spmd/large_scale/kmeans.json @@ -0,0 +1,32 @@ +{ + "INCLUDE": ["../../common/sklearn.json", 
"../../regular/kmeans.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd kmeans parameters": { + "algorithm": { + "estimator": "KMeans", + "estimator_params": { + "algorithm": "lloyd" + }, + "estimator_methods": { "training": "fit" } + }, + "bench": { + "mpi_params": {"n": 48} + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } } + ] + } + }, + "TEMPLATES": { + "kmeans": { + "SETS": [ + "synthetic data", + "sklearnex spmd implementation", + "large scale default parameters", + "spmd kmeans parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/knn.json b/configs/spmd/large_scale/knn.json new file mode 100644 index 00000000..8b82094d --- /dev/null +++ b/configs/spmd/large_scale/knn.json @@ -0,0 +1,43 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/knn.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd knn cls parameters": { + "algorithm": { + "estimator": "KNeighborsClassifier", + "estimator_params": { + "algorithm": "brute", + "metric": "minkowski", + "p": 2, + "weights": "uniform", + "n_neighbors": 5 + }, + "estimator_methods": { + "training": "fit", + "inference": "predict" + } + }, + "bench": { + "mpi_params": {} + } + }, + "synthetic classification data": { + "data": [ + { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, + { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, + { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, + { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 1000000, "n_features": 10, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + ] + } + }, + "TEMPLATES": { + "knn classifier": { + "SETS": [ + "common knn parameters", + "synthetic classification data", + "sklearnex spmd implementation", + "large scale 2k parameters", + "spmd knn cls parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json new file mode 100644 index 00000000..4b39d5e2 --- /dev/null +++ b/configs/spmd/large_scale/large_scale.json @@ -0,0 +1,31 @@ +{ + "PARAMETERS_SETS": { + "large scale default parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [1,2,3,4,5,6,7,8,9,10,11,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale 2k parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", 
"-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale impi parameters": { + "data": { + "dtype": "float64", + "distributed_split": "no" + }, + "bench": { + "mpi_params": {"n": [1,2,4,6,12,24], "ppn": 12} + } + } + } +} diff --git a/configs/spmd/large_scale/linear_model.json b/configs/spmd/large_scale/linear_model.json new file mode 100644 index 00000000..4c861caa --- /dev/null +++ b/configs/spmd/large_scale/linear_model.json @@ -0,0 +1,27 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/linear_model.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd linear parameters": { + "algorithm": { + "estimator": "LinearRegression", + "estimator_methods": { "training": "fit" } + } + }, + "synthetic data": { + "data": [ + { "source": "make_regression", "generation_kwargs": { "n_samples": 30005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 30000000, "test_size": 5000 } }, + { "source": "make_regression", "generation_kwargs": { "n_samples": 305000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 300000, "test_size": 5000 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale default parameters", + "synthetic data", + "spmd linear parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/logreg.json b/configs/spmd/large_scale/logreg.json new file mode 100644 index 00000000..c5ef6203 --- /dev/null +++ b/configs/spmd/large_scale/logreg.json @@ -0,0 +1,29 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/logreg.json", "../logreg.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd logreg2 parameters": { + "algorithm":{ + "estimator": "LogisticRegression", + "estimator_methods": { "inference": "predict" }, + "estimator_params": { "max_iter": 20 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 10, "n_classes": 2 } }, + { "source": "make_classification", "split_kwargs": { "train_size": 100000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 101000, "n_features": 1000, "n_classes": 2 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale 2k parameters", + "spmd logreg parameters", + "synthetic data", + "spmd logreg2 parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/pca.json b/configs/spmd/large_scale/pca.json new file mode 100644 index 00000000..35c1942a --- /dev/null +++ b/configs/spmd/large_scale/pca.json @@ -0,0 +1,30 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/pca.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd pca parameters": { + "algorithm": { + "estimator": "PCA", + "estimator_methods": { "training": "fit", "inference": "" } + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale default parameters", + "synthetic data", + "spmd pca parameters" + ] + 
} + } +} diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index 3f8b1641..cf977ad8 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -134,6 +134,9 @@ def get_subset_metrics_of_estimator( and isinstance(iterations[0], Union[Numeric, NumpyNumeric].__args__) ): metrics.update({"iterations": int(iterations[0])}) + if hasattr(estimator_instance, "_n_inner_iter"): + inner_iters = estimator_instance._n_inner_iter + metrics.update({"inner_iters": int(inner_iters)}) if task == "classification": y_pred = convert_to_numpy(estimator_instance.predict(x)) metrics.update( @@ -142,7 +145,7 @@ def get_subset_metrics_of_estimator( "balanced accuracy": float(balanced_accuracy_score(y_compat, y_pred)), } ) - if hasattr(estimator_instance, "predict_proba") and not ( + '''if hasattr(estimator_instance, "predict_proba") and not ( hasattr(estimator_instance, "probability") and getattr(estimator_instance, "probability") == False ): @@ -162,7 +165,7 @@ def get_subset_metrics_of_estimator( ), "logloss": float(log_loss(y_compat, y_pred_proba)), } - ) + )''' elif task == "regression": y_pred = convert_to_numpy(estimator_instance.predict(x)) metrics.update( @@ -454,7 +457,6 @@ def measure_sklearn_estimator( estimator_instance.get_booster() ) method_instance = getattr(daal_model, method) - metrics[method] = dict() ( metrics[method]["time[ms]"], diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index d2e63e9e..1ac7d7bc 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -109,7 +109,8 @@ def split_and_transform_data(bench_case, data, data_description): y_train, y_test = None, None distributed_split = get_bench_case_value(bench_case, "data:distributed_split", None) - if distributed_split == "rank_based": + knn_split_train = "KNeighbors" in get_bench_case_value(bench_case, "algorithm:estimator", "") and int(get_bench_case_value(bench_case, "bench:mpi_params:n", 1)) > 1 + if distributed_split == "rank_based" or knn_split_train: from mpi4py import MPI comm = MPI.COMM_WORLD @@ -129,10 +130,12 @@ def split_and_transform_data(bench_case, data, data_description): x_train[train_start:train_end], y_train[train_start:train_end], ) - x_test, y_test = x_test[test_start:test_end], y_test[test_start:test_end] + if distributed_split == "rank_based": + x_test, y_test = x_test[test_start:test_end], y_test[test_start:test_end] else: x_train = x_train[train_start:train_end] - x_test = x_test[test_start:test_end] + if distributed_split == "rank_based": + x_test = x_test[test_start:test_end] device = get_bench_case_value(bench_case, "algorithm:device", None) common_data_format = get_bench_case_value(bench_case, "data:format", "pandas") diff --git a/sklbench/runner/commands_helper.py b/sklbench/runner/commands_helper.py index b66da011..a63686c6 100644 --- a/sklbench/runner/commands_helper.py +++ b/sklbench/runner/commands_helper.py @@ -45,6 +45,9 @@ def generate_benchmark_command( mpi_prefix = "mpirun" for mpi_param_name, mpi_param_value in mpi_params.items(): mpi_prefix += f" -{mpi_param_name} {mpi_param_value}" + if mpi_param_name == "-hostfile": + import os + mpi_prefix += os.environ.get("PBS_NODEFILE") command_prefix = f"{mpi_prefix} {command_prefix}" # 3. 
Intel(R) VTune* profiling command prefix vtune_profiling = get_bench_case_value(bench_case, "bench:vtune_profiling") From 8d74f6d3b6f4514e1a3ecc165d900b4c7928f70e Mon Sep 17 00:00:00 2001 From: ethanglaser Date: Wed, 18 Sep 2024 18:01:07 +0000 Subject: [PATCH 027/110] strong scaling, config updates, minor revisions --- configs/spmd/large_scale/basic_stats.json | 4 +-- .../spmd/large_scale/basic_stats_strong.json | 29 +++++++++++++++++ configs/spmd/large_scale/covariance.json | 2 +- .../spmd/large_scale/covariance_strong.json | 29 +++++++++++++++++ configs/spmd/large_scale/kmeans.json | 11 +++---- configs/spmd/large_scale/knn.json | 5 +-- configs/spmd/large_scale/large_scale.json | 24 ++++++++++++-- configs/spmd/large_scale/linear_model.json | 2 +- .../spmd/large_scale/linear_model_strong.json | 26 ++++++++++++++++ configs/spmd/large_scale/logreg_strong.json | 28 +++++++++++++++++ configs/spmd/large_scale/pca.json | 2 +- configs/spmd/large_scale/pca_strong.json | 29 +++++++++++++++++ sklbench/benchmarks/sklearn_estimator.py | 4 ++- sklbench/utils/measurement.py | 31 ++++++++++++++----- 14 files changed, 200 insertions(+), 26 deletions(-) create mode 100644 configs/spmd/large_scale/basic_stats_strong.json create mode 100644 configs/spmd/large_scale/covariance_strong.json create mode 100644 configs/spmd/large_scale/linear_model_strong.json create mode 100644 configs/spmd/large_scale/logreg_strong.json create mode 100644 configs/spmd/large_scale/pca_strong.json diff --git a/configs/spmd/large_scale/basic_stats.json b/configs/spmd/large_scale/basic_stats.json index a9542017..9ac4725f 100644 --- a/configs/spmd/large_scale/basic_stats.json +++ b/configs/spmd/large_scale/basic_stats.json @@ -4,7 +4,7 @@ "spmd basicstats parameters": { "algorithm": { "estimator": "BasicStatistics", - "estimator_methods": { "training": "compute" } + "estimator_methods": { "training": "fit" } }, "data": { "split_kwargs": { "test_size": 0.0001 } @@ -21,7 +21,7 @@ "basicstats": { "SETS": [ "sklearnex spmd implementation", - "large scale default parameters", + "large scale 2k parameters", "synthetic data", "spmd basicstats parameters" ] diff --git a/configs/spmd/large_scale/basic_stats_strong.json b/configs/spmd/large_scale/basic_stats_strong.json new file mode 100644 index 00000000..b7aa22cb --- /dev/null +++ b/configs/spmd/large_scale/basic_stats_strong.json @@ -0,0 +1,29 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd basicstats parameters": { + "algorithm": { + "estimator": "BasicStatistics", + "estimator_methods": { "training": "fit" } + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 25000000, "n_features": 100, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "basicstats": { + "SETS": [ + "sklearnex spmd implementation", + "large scale strong 2k parameters", + "synthetic data", + "spmd basicstats parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/covariance.json b/configs/spmd/large_scale/covariance.json index 3280bf77..260befd0 100644 --- a/configs/spmd/large_scale/covariance.json +++ b/configs/spmd/large_scale/covariance.json @@ -21,7 +21,7 @@ "covariance": { "SETS": [ "sklearnex spmd implementation", - "large scale default parameters", + "large scale 2k parameters", "synthetic data", "spmd basicstats parameters" ] diff --git a/configs/spmd/large_scale/covariance_strong.json 
b/configs/spmd/large_scale/covariance_strong.json new file mode 100644 index 00000000..568b4a8f --- /dev/null +++ b/configs/spmd/large_scale/covariance_strong.json @@ -0,0 +1,29 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd basicstats parameters": { + "algorithm": { + "estimator": "EmpiricalCovariance", + "estimator_methods": { "training": "fit" } + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 25000000, "n_features": 100, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "covariance": { + "SETS": [ + "sklearnex spmd implementation", + "large scale strong 2k parameters", + "synthetic data", + "spmd basicstats parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/kmeans.json b/configs/spmd/large_scale/kmeans.json index 3b490f14..89524965 100644 --- a/configs/spmd/large_scale/kmeans.json +++ b/configs/spmd/large_scale/kmeans.json @@ -7,15 +7,14 @@ "estimator_params": { "algorithm": "lloyd" }, - "estimator_methods": { "training": "fit" } - }, - "bench": { - "mpi_params": {"n": 48} + "estimator_methods": { "training": "fit", "inference": "predict" } } }, "synthetic data": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } } + { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 30000, "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 100, "centers": 100 }, "algorithm": { "n_clusters": 100, "max_iter": 100 } } ] } }, @@ -24,7 +23,7 @@ "SETS": [ "synthetic data", "sklearnex spmd implementation", - "large scale default parameters", + "large scale 2k parameters", "spmd kmeans parameters" ] } diff --git a/configs/spmd/large_scale/knn.json b/configs/spmd/large_scale/knn.json index 8b82094d..e979e2aa 100644 --- a/configs/spmd/large_scale/knn.json +++ b/configs/spmd/large_scale/knn.json @@ -15,9 +15,6 @@ "training": "fit", "inference": "predict" } - }, - "bench": { - "mpi_params": {} } }, "synthetic classification data": { @@ -35,7 +32,7 @@ "common knn parameters", "synthetic classification data", "sklearnex spmd implementation", - "large scale 2k parameters", + "large scale default parameters", "spmd knn cls parameters" ] } diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json index 4b39d5e2..72b808fe 100644 --- a/configs/spmd/large_scale/large_scale.json +++ b/configs/spmd/large_scale/large_scale.json @@ -6,7 +6,16 @@ "distributed_split": "None" }, "bench": { - "mpi_params": {"n": [1,2,3,4,5,6,7,8,9,10,11,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + "mpi_params": {"n": [1,2,6,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + 
"large scale strong parameters": { + "data": { + "dtype": "float64", + "distributed_split": "rank_based" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, "large scale 2k parameters": { @@ -15,7 +24,16 @@ "distributed_split": "None" }, "bench": { - "mpi_params": {"n": [192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale strong 2k parameters": { + "data": { + "dtype": "float64", + "distributed_split": "rank_based" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, "large scale impi parameters": { @@ -24,7 +42,7 @@ "distributed_split": "no" }, "bench": { - "mpi_params": {"n": [1,2,4,6,12,24], "ppn": 12} + "mpi_params": {"n": [1,2,4,6,9,12], "ppn": 12} } } } diff --git a/configs/spmd/large_scale/linear_model.json b/configs/spmd/large_scale/linear_model.json index 4c861caa..aeda4441 100644 --- a/configs/spmd/large_scale/linear_model.json +++ b/configs/spmd/large_scale/linear_model.json @@ -18,7 +18,7 @@ "linreg": { "SETS": [ "sklearnex spmd implementation", - "large scale default parameters", + "large scale 2k parameters", "synthetic data", "spmd linear parameters" ] diff --git a/configs/spmd/large_scale/linear_model_strong.json b/configs/spmd/large_scale/linear_model_strong.json new file mode 100644 index 00000000..77a9c79e --- /dev/null +++ b/configs/spmd/large_scale/linear_model_strong.json @@ -0,0 +1,26 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/linear_model.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd linear parameters": { + "algorithm": { + "estimator": "LinearRegression", + "estimator_methods": { "training": "fit" } + } + }, + "synthetic data": { + "data": [ + { "source": "make_regression", "generation_kwargs": { "n_samples": 25005000, "n_features": 100, "noise": 1.25 }, "split_kwargs": { "train_size": 25000000, "test_size": 5000 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale strong 2k parameters", + "synthetic data", + "spmd linear parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/logreg_strong.json b/configs/spmd/large_scale/logreg_strong.json new file mode 100644 index 00000000..2bf1c0f9 --- /dev/null +++ b/configs/spmd/large_scale/logreg_strong.json @@ -0,0 +1,28 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/logreg.json", "../logreg.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd logreg2 parameters": { + "algorithm":{ + "estimator": 
"LogisticRegression", + "estimator_methods": { "inference": "predict" }, + "estimator_params": { "max_iter": 30 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_classification", "split_kwargs": { "train_size": 10000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 10001000, "n_features": 100, "n_classes": 2 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale strong 2k parameters", + "spmd logreg parameters", + "synthetic data", + "spmd logreg2 parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/pca.json b/configs/spmd/large_scale/pca.json index 35c1942a..9a6a6b02 100644 --- a/configs/spmd/large_scale/pca.json +++ b/configs/spmd/large_scale/pca.json @@ -21,7 +21,7 @@ "linreg": { "SETS": [ "sklearnex spmd implementation", - "large scale default parameters", + "large scale 2k parameters", "synthetic data", "spmd pca parameters" ] diff --git a/configs/spmd/large_scale/pca_strong.json b/configs/spmd/large_scale/pca_strong.json new file mode 100644 index 00000000..adee3c79 --- /dev/null +++ b/configs/spmd/large_scale/pca_strong.json @@ -0,0 +1,29 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/pca.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd pca parameters": { + "algorithm": { + "estimator": "PCA", + "estimator_methods": { "training": "fit", "inference": "" } + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 25000000, "n_features": 100, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale strong 2k parameters", + "synthetic data", + "spmd pca parameters" + ] + } + } +} diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index cf977ad8..0fc4874e 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -461,7 +461,9 @@ def measure_sklearn_estimator( ( metrics[method]["time[ms]"], metrics[method]["time std[ms]"], - _, + metrics[method]["first iter[ms]"], + metrics[method]["box filter mean[ms]"], + metrics[method]["box filter std[ms]"] ) = measure_case(bench_case, method_instance, *data_args) if ensure_sklearnex_patching: full_method_name = f"{estimator_class.__name__}.{method}" diff --git a/sklbench/utils/measurement.py b/sklbench/utils/measurement.py index 989daefd..df74e8da 100644 --- a/sklbench/utils/measurement.py +++ b/sklbench/utils/measurement.py @@ -40,6 +40,22 @@ def box_filter(timing, left=0.2, right=0.8): return np.mean(result) * 1000, np.std(result) * 1000 +def large_scale_measurements(timing): + first_iter = timing[0] * 1000 + mean = np.mean(timing[1:]) * 1000 + stdev = np.std(timing[1:]) * 1000 + timing_sorted = np.sort(timing) + Q1, Q3 = np.percentile(timing_sorted, [25, 75]) + IQ = Q3 - Q1 + lower, upper = Q1 - 1.5 * IQ, Q3 + 1.5 * IQ + + filtered_times = timing_sorted[(timing_sorted >= lower) & (timing_sorted <= upper)] + + box_filter_mean = np.mean(filtered_times) * 1000 if filtered_times.size > 0 else 0 + box_filter_stdev = np.std(filtered_times) * 1000 if filtered_times.size > 0 else 0 + return mean, stdev, first_iter, box_filter_mean, box_filter_stdev + + def measure_time( func, *args, @@ -72,13 +88,14 @@ def measure_time( f"exceeded time limit ({time_limit} seconds)" ) break - mean, std = box_filter(times) - if std / mean > std_mean_ratio: - logger.warning( - f'Measured "std / 
mean" time ratio of "{str(func)}" function is higher ' - f"than threshold ({round(std / mean, 3)} vs. {std_mean_ratio})" - ) - return mean, std, func_return_value + logger.debug(times) + #mean, std = box_filter(times) + #if std / mean > std_mean_ratio: + # logger.warning( + # f'Measured "std / mean" time ratio of "{str(func)}" function is higher ' + # f"than threshold ({round(std / mean, 3)} vs. {std_mean_ratio})" + # ) + return large_scale_measurements(times) # wrapper to get measurement params from benchmarking case From 192744f91ee7199d98986f8ec953def96c0153eb Mon Sep 17 00:00:00 2001 From: ethanglaser Date: Sat, 21 Sep 2024 05:25:44 +0000 Subject: [PATCH 028/110] knn and forest config updates --- configs/spmd/large_scale/forest.json | 26 +++++++++++++++++++++ configs/spmd/large_scale/forest_strong.json | 25 ++++++++++++++++++++ configs/spmd/large_scale/knn.json | 4 ++-- sklbench/benchmarks/sklearn_estimator.py | 4 ++-- sklbench/datasets/transformer.py | 2 +- 5 files changed, 56 insertions(+), 5 deletions(-) create mode 100644 configs/spmd/large_scale/forest.json create mode 100644 configs/spmd/large_scale/forest_strong.json diff --git a/configs/spmd/large_scale/forest.json b/configs/spmd/large_scale/forest.json new file mode 100644 index 00000000..ee614ed3 --- /dev/null +++ b/configs/spmd/large_scale/forest.json @@ -0,0 +1,26 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd forest classification parameters": { + "algorithm": { + "estimator": "RandomForestClassifier" + } + }, + "synthetic data": { + "data": [ + { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 501000, "n_features": 10, "n_classes": 2 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } } }, + { "source": "make_classification", "split_kwargs": { "train_size": 10000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 11000, "n_features": 1000, "n_classes": 2 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } } } + ] + } + }, + "TEMPLATES": { + "basicstats": { + "SETS": [ + "sklearnex spmd implementation", + "large scale 2k parameters", + "synthetic data", + "spmd forest classification parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/forest_strong.json b/configs/spmd/large_scale/forest_strong.json new file mode 100644 index 00000000..121aa916 --- /dev/null +++ b/configs/spmd/large_scale/forest_strong.json @@ -0,0 +1,25 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd forest classification parameters": { + "algorithm": { + "estimator": "RandomForestClassifier" + } + }, + "synthetic data": { + "data": [ + { "source": "make_classification", "split_kwargs": { "train_size": 10000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 10001000, "n_features": 100, "n_classes": 2 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } } } + ] + } + }, + "TEMPLATES": { + "basicstats": { + "SETS": [ + "sklearnex spmd implementation", + "large scale strong 2k parameters", + "synthetic data", + "spmd forest classification parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/knn.json b/configs/spmd/large_scale/knn.json index e979e2aa..1ef849f1 100644 --- a/configs/spmd/large_scale/knn.json +++ b/configs/spmd/large_scale/knn.json @@ -22,7 +22,7 @@ { "source": 
"make_classification", "split_kwargs": { "train_size": 500000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, - { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 1000000, "n_features": 10, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + { "source": "make_classification", "split_kwargs": { "train_size": 200000, "test_size": 200000 }, "generation_kwargs": { "n_samples": 400000, "n_features": 10, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } ] } }, @@ -32,7 +32,7 @@ "common knn parameters", "synthetic classification data", "sklearnex spmd implementation", - "large scale default parameters", + "large scale 2k parameters", "spmd knn cls parameters" ] } diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index 0fc4874e..296a5e17 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -546,8 +546,8 @@ def main(bench_case: BenchCase, filters: List[BenchCase]): result_template = enrich_result(result_template, bench_case) if "assume_finite" in context_params: result_template["assume_finite"] = context_params["assume_finite"] - if hasattr(estimator_instance, "get_params"): - estimator_params = estimator_instance.get_params() + #if hasattr(estimator_instance, "get_params"): + # estimator_params = estimator_instance.get_params() # note: "handle" is not JSON-serializable if "handle" in estimator_params: del estimator_params["handle"] diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 1ac7d7bc..55cfc245 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -181,7 +181,7 @@ def split_and_transform_data(bench_case, data, data_description): "format": data_format, "order": data_order, "dtype": data_dtype, - "samples": converted_data.shape[0], + "samples (per rank)": converted_data.shape[0], } if len(converted_data.shape) == 2 and converted_data.shape[1] > 1: data_description[subset_name]["features"] = converted_data.shape[1] From b1f2c1589cf8cae48b416ab70268643d9f0a2d6c Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 23 Sep 2024 04:53:12 -0700 Subject: [PATCH 029/110] lint --- sklbench/benchmarks/sklearn_estimator.py | 8 ++++---- sklbench/datasets/transformer.py | 5 ++++- sklbench/runner/commands_helper.py | 1 + sklbench/utils/measurement.py | 8 ++++---- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index 296a5e17..b4d4f3ee 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -145,7 +145,7 @@ def get_subset_metrics_of_estimator( "balanced accuracy": float(balanced_accuracy_score(y_compat, y_pred)), } ) - '''if hasattr(estimator_instance, "predict_proba") and not ( + """if hasattr(estimator_instance, "predict_proba") and not ( hasattr(estimator_instance, 
"probability") and getattr(estimator_instance, "probability") == False ): @@ -165,7 +165,7 @@ def get_subset_metrics_of_estimator( ), "logloss": float(log_loss(y_compat, y_pred_proba)), } - )''' + )""" elif task == "regression": y_pred = convert_to_numpy(estimator_instance.predict(x)) metrics.update( @@ -463,7 +463,7 @@ def measure_sklearn_estimator( metrics[method]["time std[ms]"], metrics[method]["first iter[ms]"], metrics[method]["box filter mean[ms]"], - metrics[method]["box filter std[ms]"] + metrics[method]["box filter std[ms]"], ) = measure_case(bench_case, method_instance, *data_args) if ensure_sklearnex_patching: full_method_name = f"{estimator_class.__name__}.{method}" @@ -546,7 +546,7 @@ def main(bench_case: BenchCase, filters: List[BenchCase]): result_template = enrich_result(result_template, bench_case) if "assume_finite" in context_params: result_template["assume_finite"] = context_params["assume_finite"] - #if hasattr(estimator_instance, "get_params"): + # if hasattr(estimator_instance, "get_params"): # estimator_params = estimator_instance.get_params() # note: "handle" is not JSON-serializable if "handle" in estimator_params: diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 55cfc245..86944ead 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -109,7 +109,10 @@ def split_and_transform_data(bench_case, data, data_description): y_train, y_test = None, None distributed_split = get_bench_case_value(bench_case, "data:distributed_split", None) - knn_split_train = "KNeighbors" in get_bench_case_value(bench_case, "algorithm:estimator", "") and int(get_bench_case_value(bench_case, "bench:mpi_params:n", 1)) > 1 + knn_split_train = ( + "KNeighbors" in get_bench_case_value(bench_case, "algorithm:estimator", "") + and int(get_bench_case_value(bench_case, "bench:mpi_params:n", 1)) > 1 + ) if distributed_split == "rank_based" or knn_split_train: from mpi4py import MPI diff --git a/sklbench/runner/commands_helper.py b/sklbench/runner/commands_helper.py index a63686c6..2441085a 100644 --- a/sklbench/runner/commands_helper.py +++ b/sklbench/runner/commands_helper.py @@ -47,6 +47,7 @@ def generate_benchmark_command( mpi_prefix += f" -{mpi_param_name} {mpi_param_value}" if mpi_param_name == "-hostfile": import os + mpi_prefix += os.environ.get("PBS_NODEFILE") command_prefix = f"{mpi_prefix} {command_prefix}" # 3. Intel(R) VTune* profiling command prefix diff --git a/sklbench/utils/measurement.py b/sklbench/utils/measurement.py index df74e8da..7495e258 100644 --- a/sklbench/utils/measurement.py +++ b/sklbench/utils/measurement.py @@ -48,9 +48,9 @@ def large_scale_measurements(timing): Q1, Q3 = np.percentile(timing_sorted, [25, 75]) IQ = Q3 - Q1 lower, upper = Q1 - 1.5 * IQ, Q3 + 1.5 * IQ - + filtered_times = timing_sorted[(timing_sorted >= lower) & (timing_sorted <= upper)] - + box_filter_mean = np.mean(filtered_times) * 1000 if filtered_times.size > 0 else 0 box_filter_stdev = np.std(filtered_times) * 1000 if filtered_times.size > 0 else 0 return mean, stdev, first_iter, box_filter_mean, box_filter_stdev @@ -89,8 +89,8 @@ def measure_time( ) break logger.debug(times) - #mean, std = box_filter(times) - #if std / mean > std_mean_ratio: + # mean, std = box_filter(times) + # if std / mean > std_mean_ratio: # logger.warning( # f'Measured "std / mean" time ratio of "{str(func)}" function is higher ' # f"than threshold ({round(std / mean, 3)} vs. 
{std_mean_ratio})" From f3be7377caa262f6240a6414cff73de1d3a94c18 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 23 Sep 2024 06:28:03 -0700 Subject: [PATCH 030/110] just gpu for regular --- configs/common/sklearn.json | 5 +++++ configs/regular/dbscan.json | 2 +- configs/regular/ensemble.json | 4 ++-- configs/regular/kmeans.json | 2 +- configs/regular/knn.json | 20 ++------------------ configs/regular/linear_model.json | 24 +----------------------- configs/regular/logreg.json | 2 +- configs/regular/pca.json | 2 +- 8 files changed, 14 insertions(+), 47 deletions(-) diff --git a/configs/common/sklearn.json b/configs/common/sklearn.json index d7b13188..43051093 100644 --- a/configs/common/sklearn.json +++ b/configs/common/sklearn.json @@ -12,6 +12,11 @@ { "library": "sklearnex", "device": ["cpu", "gpu"] } ] }, + "sklearn-ex[gpu] implementations": { + "algorithm": [ + { "library": "sklearnex", "device": ["gpu"] } + ] + }, "sklearn-ex[preview] implementations": { "algorithm": [ { "library": "sklearn", "device": "cpu" }, diff --git a/configs/regular/dbscan.json b/configs/regular/dbscan.json index 71dcdc9b..1d0d732b 100644 --- a/configs/regular/dbscan.json +++ b/configs/regular/dbscan.json @@ -58,7 +58,7 @@ "TEMPLATES": { "sklearn dbscan": { "SETS": [ - "sklearn-ex[cpu,gpu] implementations", + "sklearn-ex[gpu] implementations", "common dbscan parameters", "sklearn dbscan parameters", "dbscan datasets" diff --git a/configs/regular/ensemble.json b/configs/regular/ensemble.json index 56e37e77..164cb236 100644 --- a/configs/regular/ensemble.json +++ b/configs/regular/ensemble.json @@ -90,7 +90,7 @@ "TEMPLATES": { "sklearn ensemble classification": { "SETS": [ - "sklearn-ex[cpu,gpu] implementations", + "sklearn-ex[gpu] implementations", "common ensemble params", "sklearn ensemble classifier params", "ensemble classification data" @@ -98,7 +98,7 @@ }, "sklearn ensemble regression": { "SETS": [ - "sklearn-ex[cpu,gpu] implementations", + "sklearn-ex[gpu] implementations", "common ensemble params", "sklearn ensemble regressor params", "ensemble regression data" diff --git a/configs/regular/kmeans.json b/configs/regular/kmeans.json index d4953615..8aba9055 100644 --- a/configs/regular/kmeans.json +++ b/configs/regular/kmeans.json @@ -70,7 +70,7 @@ "TEMPLATES": { "sklearn kmeans": { "SETS": [ - "sklearn-ex[preview] implementations", + "sklearn-ex[gpu] implementations", "common kmeans parameters", "sklearn kmeans parameters", "kmeans datasets" diff --git a/configs/regular/knn.json b/configs/regular/knn.json index e1cd8a75..bcbed117 100644 --- a/configs/regular/knn.json +++ b/configs/regular/knn.json @@ -74,36 +74,20 @@ "TEMPLATES": { "sklearn brute knn clsf": { "SETS": [ - "sklearn-ex[cpu,gpu] implementations", + "sklearn-ex[gpu] implementations", "common knn parameters", "sklearn knn parameters", "brute knn algorithm - classification data" ] }, - "sklearn kd_tree knn clsf": { - "SETS": [ - "sklearn-ex[cpu] implementations", - "common knn parameters", - "sklearn knn parameters", - "kd_tree knn algorithm - classification data" - ] - }, "sklearn brute knn regr": { "SETS": [ - "sklearn-ex[cpu,gpu] implementations", + "sklearn-ex[gpu] implementations", "common knn parameters", "sklearn knn parameters", "brute knn algorithm - regression data" ] }, - "sklearn kd_tree knn regr": { - "SETS": [ - "sklearn-ex[cpu] implementations", - "common knn parameters", - "sklearn knn parameters", - "kd_tree knn algorithm - regression data" - ] - }, "cuml brute knn clsf": { "SETS": [ "cuml implementation", diff --git 
a/configs/regular/linear_model.json b/configs/regular/linear_model.json index eb1b79ba..66667343 100644 --- a/configs/regular/linear_model.json +++ b/configs/regular/linear_model.json @@ -85,34 +85,12 @@ "TEMPLATES": { "sklearn linear": { "SETS": [ - "sklearn-ex[cpu,gpu] implementations", + "sklearn-ex[gpu] implementations", "common linear parameters", "sklearn linear parameters", "regression datasets" ] }, - "sklearn ridge": { - "SETS": [ - "sklearn-ex[cpu] implementations", - "common ridge parameters", - "sklearn ridge parameters", - "regression datasets" - ] - }, - "sklearn lasso": { - "SETS": [ - "sklearn-ex[cpu] implementations", - "common lasso parameters", - "regression datasets" - ] - }, - "sklearn elasticnet": { - "SETS": [ - "sklearn-ex[cpu] implementations", - "common elasticnet parameters", - "regression datasets" - ] - }, "cuml linear": { "SETS": [ "cuml implementation", diff --git a/configs/regular/logreg.json b/configs/regular/logreg.json index a94a7fcf..172ceb48 100644 --- a/configs/regular/logreg.json +++ b/configs/regular/logreg.json @@ -54,7 +54,7 @@ "TEMPLATES": { "sklearn logreg": { "SETS": [ - "sklearn-ex[cpu,gpu] implementations", + "sklearn-ex[gpu] implementations", "common logreg parameters", "sklearn logreg parameters", "logreg datasets" diff --git a/configs/regular/pca.json b/configs/regular/pca.json index 582acc9e..2300454d 100644 --- a/configs/regular/pca.json +++ b/configs/regular/pca.json @@ -46,7 +46,7 @@ "TEMPLATES": { "sklearn pca": { "SETS": [ - "sklearn-ex[cpu,gpu] implementations", + "sklearn-ex[gpu] implementations", "pca parameters", "pca datasets" ] From ee8c74b5ef53b450876ae6b00d926f8377692038 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 23 Sep 2024 07:10:55 -0700 Subject: [PATCH 031/110] juremove cuml --- configs/regular/dbscan.json | 8 -------- configs/regular/ensemble.json | 16 ---------------- configs/regular/kmeans.json | 8 -------- configs/regular/knn.json | 14 -------------- configs/regular/linear_model.json | 24 ------------------------ configs/regular/logreg.json | 8 -------- configs/regular/pca.json | 7 ------- 7 files changed, 85 deletions(-) diff --git a/configs/regular/dbscan.json b/configs/regular/dbscan.json index 1d0d732b..711c15cd 100644 --- a/configs/regular/dbscan.json +++ b/configs/regular/dbscan.json @@ -63,14 +63,6 @@ "sklearn dbscan parameters", "dbscan datasets" ] - }, - "cuml dbscan": { - "SETS": [ - "cuml implementation", - "common dbscan parameters", - "cuml dbscan parameters", - "dbscan datasets" - ] } } } diff --git a/configs/regular/ensemble.json b/configs/regular/ensemble.json index 164cb236..f01c1383 100644 --- a/configs/regular/ensemble.json +++ b/configs/regular/ensemble.json @@ -103,22 +103,6 @@ "sklearn ensemble regressor params", "ensemble regression data" ] - }, - "cuml ensemble classification": { - "SETS": [ - "cuml implementation", - "common ensemble params", - "cuml ensemble classifier params", - "ensemble classification data" - ] - }, - "cuml ensemble regression": { - "SETS": [ - "cuml implementation", - "common ensemble params", - "cuml ensemble regressor params", - "ensemble regression data" - ] } } } diff --git a/configs/regular/kmeans.json b/configs/regular/kmeans.json index 8aba9055..756e2bab 100644 --- a/configs/regular/kmeans.json +++ b/configs/regular/kmeans.json @@ -75,14 +75,6 @@ "sklearn kmeans parameters", "kmeans datasets" ] - }, - "cuml kmeans": { - "SETS": [ - "cuml implementation", - "common kmeans parameters", - "cuml kmeans parameters", - "kmeans datasets" - ] } } } diff --git 
a/configs/regular/knn.json b/configs/regular/knn.json index bcbed117..a69c6864 100644 --- a/configs/regular/knn.json +++ b/configs/regular/knn.json @@ -87,20 +87,6 @@ "sklearn knn parameters", "brute knn algorithm - regression data" ] - }, - "cuml brute knn clsf": { - "SETS": [ - "cuml implementation", - "common knn parameters", - "brute knn algorithm - classification data" - ] - }, - "cuml brute knn regr": { - "SETS": [ - "cuml implementation", - "common knn parameters", - "brute knn algorithm - regression data" - ] } } } diff --git a/configs/regular/linear_model.json b/configs/regular/linear_model.json index 66667343..3040c82d 100644 --- a/configs/regular/linear_model.json +++ b/configs/regular/linear_model.json @@ -98,30 +98,6 @@ "cuml L2 parameters", "regression datasets" ] - }, - "cuml ridge": { - "SETS": [ - "cuml implementation", - "common ridge parameters", - "cuml L2 parameters", - "regression datasets" - ] - }, - "cuml lasso": { - "SETS": [ - "cuml implementation", - "common lasso parameters", - "cuml L1 parameters", - "regression datasets" - ] - }, - "cuml elasticnet": { - "SETS": [ - "cuml implementation", - "common elasticnet parameters", - "cuml L1 parameters", - "regression datasets" - ] } } } diff --git a/configs/regular/logreg.json b/configs/regular/logreg.json index 172ceb48..a8323b02 100644 --- a/configs/regular/logreg.json +++ b/configs/regular/logreg.json @@ -59,14 +59,6 @@ "sklearn logreg parameters", "logreg datasets" ] - }, - "cuml logreg": { - "SETS": [ - "cuml implementation", - "common logreg parameters", - "cuml logreg parameters", - "logreg datasets" - ] } } } diff --git a/configs/regular/pca.json b/configs/regular/pca.json index 2300454d..e26d3f44 100644 --- a/configs/regular/pca.json +++ b/configs/regular/pca.json @@ -50,13 +50,6 @@ "pca parameters", "pca datasets" ] - }, - "cuml pca": { - "SETS": [ - "cuml implementation", - "pca parameters", - "pca datasets" - ] } } } From 93eae2f09c46ce0afa9a1aa62bc9169d8518dacb Mon Sep 17 00:00:00 2001 From: "Kruglov, Oleg" Date: Tue, 24 Sep 2024 05:34:56 -0700 Subject: [PATCH 032/110] Add metrics to list for proper report generation --- sklbench/report/implementation.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklbench/report/implementation.py b/sklbench/report/implementation.py index df15b5eb..f1bda219 100644 --- a/sklbench/report/implementation.py +++ b/sklbench/report/implementation.py @@ -32,6 +32,9 @@ METRICS = { "lower is better": [ "time[ms]", + "first iter[ms]", + "box filter mean[ms]", + "box filter std[ms]", "iterations", # classification "logloss", From 574ff2a35509643d6bae5a457f71e59676c8ef2d Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Wed, 25 Sep 2024 23:54:03 -0700 Subject: [PATCH 033/110] batch for online --- configs/regular/batch_for_online.json | 97 +++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 configs/regular/batch_for_online.json diff --git a/configs/regular/batch_for_online.json b/configs/regular/batch_for_online.json new file mode 100644 index 00000000..8acd604d --- /dev/null +++ b/configs/regular/batch_for_online.json @@ -0,0 +1,97 @@ +{ + "INCLUDE": ["./common/sklearn.json"], + "PARAMETERS_SETS": { + "common": {"bench": {"n_runs": 10, "time_limit": 60}}, + "covariance data": { + "data": [ + { + "source": "make_blobs", + "generation_kwargs": { + "centers": 1, + "n_samples": 1200000, + "n_features": [10, 100] + }, + "split_kwargs": {"ignore": true} + } + ] + }, + "basic_statistics data": { + "data": { + "source": "make_blobs", + 
"generation_kwargs": { + "centers": 1, + "n_samples": 1200000, + "n_features": [10, 100] + }, + "split_kwargs": {"ignore": true} + } + }, + "linear_regression data": { + "data": { + "source": "make_regression", + "split_kwargs": {"train_size": 0.2, "test_size": 0.8}, + "generation_kwargs": { + "n_samples": 1200000, + "n_features": [10, 100], + "n_informative": 5, + "noise": 2.0 + } + } + }, + "pca data": { + "data": { + "source": "make_blobs", + "generation_kwargs": { + "centers": 1, + "n_samples": 1200000, + "n_features": [10, 100] + }, + "split_kwargs": {"ignore": true} + } + }, + "covariance": { + "algorithm": [ + { + "estimator": "EmpiricalCovariance", + "library": "sklearnex.covariance", + "estimator_methods": {"training": "fit"} + } + ] + }, + "basic_statistics": { + "algorithm": [ + { + "estimator": "BasicStatistics", + "library": "sklearnex.basic_statistics", + "estimator_methods": {"training": "fit"} + } + ] + }, + "linear_regression": { + "algorithm": [ + { + "estimator": "LinearRegression", + "library": "sklearnex.linear_model", + "estimator_methods": {"training": "fit"} + } + ] + }, + "pca": { + "algorithm": [ + { + "estimator": "IncrementalPCA", + "library": "sklearnex.decomposition", + "estimator_methods": {"training": "fit"} + } + ] + } + }, + "TEMPLATES": { + "basic_statistics": {"SETS": ["common", "basic_statistics", "basic_statistics data", "sklearn-ex[gpu] implementations"]}, + "covariance": {"SETS": ["common", "covariance", "covariance data", "sklearn-ex[gpu] implementations"]}, + "linear_regression": { + "SETS": ["common", "linear_regression", "linear_regression data", "sklearn-ex[gpu] implementations"] + }, + "pca": {"SETS": ["common", "pca", "pca data", "sklearn-ex[gpu] implementations"]} + } +} \ No newline at end of file From da7f425920cba0701e5bc7b22e4262f6c5da6aac Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 26 Sep 2024 00:11:03 -0700 Subject: [PATCH 034/110] online vs spmd --- configs/spmd/large_scale/large_scale.json | 18 ++++ configs/spmd/large_scale/spmd_for_online.json | 96 +++++++++++++++++++ .../large_scale/spmd_for_online_strong.json | 96 +++++++++++++++++++ 3 files changed, 210 insertions(+) create mode 100644 configs/spmd/large_scale/spmd_for_online.json create mode 100644 configs/spmd/large_scale/spmd_for_online_strong.json diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json index cf81cbf0..6469b8aa 100644 --- a/configs/spmd/large_scale/large_scale.json +++ b/configs/spmd/large_scale/large_scale.json @@ -36,6 +36,24 @@ "mpi_params": {"n": [1,2,3,4,5,6,7,8,9,10,11,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, + "large scale full one node parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale strong full one node parameters": { + "data": { + "dtype": "float64", + "distributed_split": "rank_based" + }, + "bench": { + "mpi_params": {"n": [12], "ppn": 12, "-hostfile": "", 
"-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, "large scale 2k parameters": { "data": { "dtype": "float64", diff --git a/configs/spmd/large_scale/spmd_for_online.json b/configs/spmd/large_scale/spmd_for_online.json new file mode 100644 index 00000000..8e3af579 --- /dev/null +++ b/configs/spmd/large_scale/spmd_for_online.json @@ -0,0 +1,96 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], + "PARAMETERS_SETS": { + "covariance data": { + "data": [ + { + "source": "make_blobs", + "generation_kwargs": { + "centers": 1, + "n_samples": 100000, + "n_features": [10, 100] + }, + "split_kwargs": {"ignore": true} + } + ] + }, + "basic_statistics data": { + "data": { + "source": "make_blobs", + "generation_kwargs": { + "centers": 1, + "n_samples": 100000, + "n_features": [10, 100] + }, + "split_kwargs": {"ignore": true} + } + }, + "linear_regression data": { + "data": { + "source": "make_regression", + "split_kwargs": {"train_size": 0.2, "test_size": 0.8}, + "generation_kwargs": { + "n_samples": 100000, + "n_features": [10, 100], + "n_informative": 5, + "noise": 2.0 + } + } + }, + "pca data": { + "data": { + "source": "make_blobs", + "generation_kwargs": { + "centers": 1, + "n_samples": 100000, + "n_features": [10, 100] + }, + "split_kwargs": {"ignore": true} + } + }, + "covariance": { + "algorithm": [ + { + "estimator": "EmpiricalCovariance", + "library": "sklearnex.covariance", + "estimator_methods": {"training": "fit"} + } + ] + }, + "basic_statistics": { + "algorithm": [ + { + "estimator": "BasicStatistics", + "library": "sklearnex.basic_statistics", + "estimator_methods": {"training": "fit"} + } + ] + }, + "linear_regression": { + "algorithm": [ + { + "estimator": "LinearRegression", + "library": "sklearnex.linear_model", + "estimator_methods": {"training": "fit"} + } + ] + }, + "pca": { + "algorithm": [ + { + "estimator": "IncrementalPCA", + "library": "sklearnex.decomposition", + "estimator_methods": {"training": "fit"} + } + ] + } + }, + "TEMPLATES": { + "basic_statistics": {"SETS": ["basic_statistics", "basic_statistics data", "large scale strong full one node parameters", "sklearnex spmd implementation"]}, + "covariance": {"SETS": ["covariance", "covariance data", "large scale strong full one node parameters", "sklearnex spmd implementation"]}, + "linear_regression": { + "SETS": ["linear_regression", "linear_regression data", "large scale strong full one node parameters", "sklearnex spmd implementation"] + }, + "pca": {"SETS": ["pca", "pca data", "large scale strong full one node parameters", "sklearnex spmd implementation"]} + } +} \ No newline at end of file diff --git a/configs/spmd/large_scale/spmd_for_online_strong.json b/configs/spmd/large_scale/spmd_for_online_strong.json new file mode 100644 index 00000000..abcff3ad --- /dev/null +++ b/configs/spmd/large_scale/spmd_for_online_strong.json @@ -0,0 +1,96 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], + "PARAMETERS_SETS": { + "covariance data": { + "data": [ + { + "source": "make_blobs", + "generation_kwargs": { + "centers": 1, + "n_samples": 1200000, + "n_features": [10, 100] + }, + "split_kwargs": {"ignore": true} + } + ] + }, + "basic_statistics data": { + "data": { + "source": "make_blobs", + "generation_kwargs": { + "centers": 1, + "n_samples": 1200000, + "n_features": [10, 100] + }, + "split_kwargs": {"ignore": 
true} + } + }, + "linear_regression data": { + "data": { + "source": "make_regression", + "split_kwargs": {"train_size": 0.2, "test_size": 0.8}, + "generation_kwargs": { + "n_samples": 1200000, + "n_features": [10, 100], + "n_informative": 5, + "noise": 2.0 + } + } + }, + "pca data": { + "data": { + "source": "make_blobs", + "generation_kwargs": { + "centers": 1, + "n_samples": 1200000, + "n_features": [10, 100] + }, + "split_kwargs": {"ignore": true} + } + }, + "covariance": { + "algorithm": [ + { + "estimator": "EmpiricalCovariance", + "library": "sklearnex.covariance", + "estimator_methods": {"training": "fit"} + } + ] + }, + "basic_statistics": { + "algorithm": [ + { + "estimator": "BasicStatistics", + "library": "sklearnex.basic_statistics", + "estimator_methods": {"training": "fit"} + } + ] + }, + "linear_regression": { + "algorithm": [ + { + "estimator": "LinearRegression", + "library": "sklearnex.linear_model", + "estimator_methods": {"training": "fit"} + } + ] + }, + "pca": { + "algorithm": [ + { + "estimator": "IncrementalPCA", + "library": "sklearnex.decomposition", + "estimator_methods": {"training": "fit"} + } + ] + } + }, + "TEMPLATES": { + "basic_statistics": {"SETS": ["basic_statistics", "basic_statistics data", "large scale strong full one node parameters", "sklearnex spmd implementation"]}, + "covariance": {"SETS": ["covariance", "covariance data", "large scale strong full one node parameters", "sklearnex spmd implementation"]}, + "linear_regression": { + "SETS": ["linear_regression", "linear_regression data", "large scale strong full one node parameters", "sklearnex spmd implementation"] + }, + "pca": {"SETS": ["pca", "pca data", "large scale strong full one node parameters", "sklearnex spmd implementation"]} + } +} \ No newline at end of file From 2377a9e9f803e304cb05696278311bf583d04fcc Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 26 Sep 2024 00:35:30 -0700 Subject: [PATCH 035/110] spmd vs online fix --- configs/spmd/large_scale/spmd_for_online.json | 46 ++----------------- .../large_scale/spmd_for_online_strong.json | 46 ++----------------- 2 files changed, 10 insertions(+), 82 deletions(-) diff --git a/configs/spmd/large_scale/spmd_for_online.json b/configs/spmd/large_scale/spmd_for_online.json index 8e3af579..0a4bc9da 100644 --- a/configs/spmd/large_scale/spmd_for_online.json +++ b/configs/spmd/large_scale/spmd_for_online.json @@ -47,50 +47,14 @@ }, "split_kwargs": {"ignore": true} } - }, - "covariance": { - "algorithm": [ - { - "estimator": "EmpiricalCovariance", - "library": "sklearnex.covariance", - "estimator_methods": {"training": "fit"} - } - ] - }, - "basic_statistics": { - "algorithm": [ - { - "estimator": "BasicStatistics", - "library": "sklearnex.basic_statistics", - "estimator_methods": {"training": "fit"} - } - ] - }, - "linear_regression": { - "algorithm": [ - { - "estimator": "LinearRegression", - "library": "sklearnex.linear_model", - "estimator_methods": {"training": "fit"} - } - ] - }, - "pca": { - "algorithm": [ - { - "estimator": "IncrementalPCA", - "library": "sklearnex.decomposition", - "estimator_methods": {"training": "fit"} - } - ] } }, "TEMPLATES": { - "basic_statistics": {"SETS": ["basic_statistics", "basic_statistics data", "large scale strong full one node parameters", "sklearnex spmd implementation"]}, - "covariance": {"SETS": ["covariance", "covariance data", "large scale strong full one node parameters", "sklearnex spmd implementation"]}, + "basic_statistics": {"SETS": ["basic_statistics data", "spmd default parameters", 
"sklearnex spmd implementation", "large scale full one node parameters"]}, + "covariance": {"SETS": ["covariance data", "spmd default parameters","sklearnex spmd implementation", "large scale full one node parameters"]}, "linear_regression": { - "SETS": ["linear_regression", "linear_regression data", "large scale strong full one node parameters", "sklearnex spmd implementation"] + "SETS": ["linear_regression data", "spmd default parameters", "sklearnex spmd implementation", "large scale full one node parameters"] }, - "pca": {"SETS": ["pca", "pca data", "large scale strong full one node parameters", "sklearnex spmd implementation"]} + "pca": {"SETS": ["pca data", "spmd default parameters", "sklearnex spmd implementation", "large scale full one node parameters"]} } -} \ No newline at end of file +} diff --git a/configs/spmd/large_scale/spmd_for_online_strong.json b/configs/spmd/large_scale/spmd_for_online_strong.json index abcff3ad..152e94f3 100644 --- a/configs/spmd/large_scale/spmd_for_online_strong.json +++ b/configs/spmd/large_scale/spmd_for_online_strong.json @@ -47,50 +47,14 @@ }, "split_kwargs": {"ignore": true} } - }, - "covariance": { - "algorithm": [ - { - "estimator": "EmpiricalCovariance", - "library": "sklearnex.covariance", - "estimator_methods": {"training": "fit"} - } - ] - }, - "basic_statistics": { - "algorithm": [ - { - "estimator": "BasicStatistics", - "library": "sklearnex.basic_statistics", - "estimator_methods": {"training": "fit"} - } - ] - }, - "linear_regression": { - "algorithm": [ - { - "estimator": "LinearRegression", - "library": "sklearnex.linear_model", - "estimator_methods": {"training": "fit"} - } - ] - }, - "pca": { - "algorithm": [ - { - "estimator": "IncrementalPCA", - "library": "sklearnex.decomposition", - "estimator_methods": {"training": "fit"} - } - ] } }, "TEMPLATES": { - "basic_statistics": {"SETS": ["basic_statistics", "basic_statistics data", "large scale strong full one node parameters", "sklearnex spmd implementation"]}, - "covariance": {"SETS": ["covariance", "covariance data", "large scale strong full one node parameters", "sklearnex spmd implementation"]}, + "basic_statistics": {"SETS": ["basic_statistics data", "spmd default parameters", "sklearnex spmd implementation", "large scale strong full one node parameters"]}, + "covariance": {"SETS": ["covariance data", "spmd default parameters","sklearnex spmd implementation", "large scale strong full one node parameters"]}, "linear_regression": { - "SETS": ["linear_regression", "linear_regression data", "large scale strong full one node parameters", "sklearnex spmd implementation"] + "SETS": ["linear_regression data", "spmd default parameters", "sklearnex spmd implementation", "large scale strong full one node parameters"] }, - "pca": {"SETS": ["pca", "pca data", "large scale strong full one node parameters", "sklearnex spmd implementation"]} + "pca": {"SETS": ["pca data", "spmd default parameters", "sklearnex spmd implementation", "large scale strong full one node parameters"]} } -} \ No newline at end of file +} From 3e4333e7271fe0658b91d35913393f6ea9589bd4 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 26 Sep 2024 00:40:07 -0700 Subject: [PATCH 036/110] batch vs online fix --- configs/regular/batch_for_online.json | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/configs/regular/batch_for_online.json b/configs/regular/batch_for_online.json index 8acd604d..5dd0c131 100644 --- a/configs/regular/batch_for_online.json +++ 
b/configs/regular/batch_for_online.json @@ -2,19 +2,6 @@ "INCLUDE": ["./common/sklearn.json"], "PARAMETERS_SETS": { "common": {"bench": {"n_runs": 10, "time_limit": 60}}, - "covariance data": { - "data": [ - { - "source": "make_blobs", - "generation_kwargs": { - "centers": 1, - "n_samples": 1200000, - "n_features": [10, 100] - }, - "split_kwargs": {"ignore": true} - } - ] - }, "basic_statistics data": { "data": { "source": "make_blobs", @@ -49,15 +36,6 @@ "split_kwargs": {"ignore": true} } }, - "covariance": { - "algorithm": [ - { - "estimator": "EmpiricalCovariance", - "library": "sklearnex.covariance", - "estimator_methods": {"training": "fit"} - } - ] - }, "basic_statistics": { "algorithm": [ { @@ -79,7 +57,7 @@ "pca": { "algorithm": [ { - "estimator": "IncrementalPCA", + "estimator": "PCA", "library": "sklearnex.decomposition", "estimator_methods": {"training": "fit"} } @@ -88,7 +66,6 @@ }, "TEMPLATES": { "basic_statistics": {"SETS": ["common", "basic_statistics", "basic_statistics data", "sklearn-ex[gpu] implementations"]}, - "covariance": {"SETS": ["common", "covariance", "covariance data", "sklearn-ex[gpu] implementations"]}, "linear_regression": { "SETS": ["common", "linear_regression", "linear_regression data", "sklearn-ex[gpu] implementations"] }, From 40ad9d51434d1a079ffd1ae6883fe4a4437afdb5 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 26 Sep 2024 01:04:45 -0700 Subject: [PATCH 037/110] increase online data size --- configs/regular/batch_for_online.json | 6 +++--- configs/spmd/large_scale/spmd_for_online.json | 8 ++++---- configs/spmd/large_scale/spmd_for_online_strong.json | 8 ++++---- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/configs/regular/batch_for_online.json b/configs/regular/batch_for_online.json index 5dd0c131..cbcdaa5c 100644 --- a/configs/regular/batch_for_online.json +++ b/configs/regular/batch_for_online.json @@ -7,7 +7,7 @@ "source": "make_blobs", "generation_kwargs": { "centers": 1, - "n_samples": 1200000, + "n_samples": 12000000, "n_features": [10, 100] }, "split_kwargs": {"ignore": true} @@ -18,7 +18,7 @@ "source": "make_regression", "split_kwargs": {"train_size": 0.2, "test_size": 0.8}, "generation_kwargs": { - "n_samples": 1200000, + "n_samples": 12000000, "n_features": [10, 100], "n_informative": 5, "noise": 2.0 @@ -30,7 +30,7 @@ "source": "make_blobs", "generation_kwargs": { "centers": 1, - "n_samples": 1200000, + "n_samples": 12000000, "n_features": [10, 100] }, "split_kwargs": {"ignore": true} diff --git a/configs/spmd/large_scale/spmd_for_online.json b/configs/spmd/large_scale/spmd_for_online.json index 0a4bc9da..7f258e9b 100644 --- a/configs/spmd/large_scale/spmd_for_online.json +++ b/configs/spmd/large_scale/spmd_for_online.json @@ -7,7 +7,7 @@ "source": "make_blobs", "generation_kwargs": { "centers": 1, - "n_samples": 100000, + "n_samples": 1000000, "n_features": [10, 100] }, "split_kwargs": {"ignore": true} @@ -19,7 +19,7 @@ "source": "make_blobs", "generation_kwargs": { "centers": 1, - "n_samples": 100000, + "n_samples": 1000000, "n_features": [10, 100] }, "split_kwargs": {"ignore": true} @@ -30,7 +30,7 @@ "source": "make_regression", "split_kwargs": {"train_size": 0.2, "test_size": 0.8}, "generation_kwargs": { - "n_samples": 100000, + "n_samples": 1000000, "n_features": [10, 100], "n_informative": 5, "noise": 2.0 @@ -42,7 +42,7 @@ "source": "make_blobs", "generation_kwargs": { "centers": 1, - "n_samples": 100000, + "n_samples": 1000000, "n_features": [10, 100] }, "split_kwargs": {"ignore": true} diff --git 
a/configs/spmd/large_scale/spmd_for_online_strong.json b/configs/spmd/large_scale/spmd_for_online_strong.json index 152e94f3..77a25075 100644 --- a/configs/spmd/large_scale/spmd_for_online_strong.json +++ b/configs/spmd/large_scale/spmd_for_online_strong.json @@ -7,7 +7,7 @@ "source": "make_blobs", "generation_kwargs": { "centers": 1, - "n_samples": 1200000, + "n_samples": 12000000, "n_features": [10, 100] }, "split_kwargs": {"ignore": true} @@ -19,7 +19,7 @@ "source": "make_blobs", "generation_kwargs": { "centers": 1, - "n_samples": 1200000, + "n_samples": 12000000, "n_features": [10, 100] }, "split_kwargs": {"ignore": true} @@ -30,7 +30,7 @@ "source": "make_regression", "split_kwargs": {"train_size": 0.2, "test_size": 0.8}, "generation_kwargs": { - "n_samples": 1200000, + "n_samples": 12000000, "n_features": [10, 100], "n_informative": 5, "noise": 2.0 @@ -42,7 +42,7 @@ "source": "make_blobs", "generation_kwargs": { "centers": 1, - "n_samples": 1200000, + "n_samples": 12000000, "n_features": [10, 100] }, "split_kwargs": {"ignore": true} From 894ed1d5678cc1d116b8c74446494af3d9b54550 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 26 Sep 2024 01:24:27 -0700 Subject: [PATCH 038/110] batch vs online fix --- configs/regular/batch_for_online.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/regular/batch_for_online.json b/configs/regular/batch_for_online.json index cbcdaa5c..d4239c65 100644 --- a/configs/regular/batch_for_online.json +++ b/configs/regular/batch_for_online.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["./common/sklearn.json"], + "INCLUDE": ["../common/sklearn.json"], "PARAMETERS_SETS": { "common": {"bench": {"n_runs": 10, "time_limit": 60}}, "basic_statistics data": { From 36c57c3734f7cf1ac2c44d7ba9e4c7bd47210725 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 26 Sep 2024 01:41:52 -0700 Subject: [PATCH 039/110] separate nodes --- configs/spmd/large_scale/large_scale.json | 80 +++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json index 6469b8aa..74388728 100644 --- a/configs/spmd/large_scale/large_scale.json +++ b/configs/spmd/large_scale/large_scale.json @@ -63,6 +63,75 @@ "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, + "large scale 64 parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale 128 to 2k parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + + "large scale 128 parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + 
"bench": { + "mpi_params": {"n": [1536], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + + "large scale 256 parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [3072], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + + "large scale 512 parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [6144], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + + "large scale 1024 parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [12288], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + + "large scale 2048 parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale two nodes parameters": { "data": { "dtype": "float64", @@ -73,6 +142,16 @@ } }, "large scale strong 2k parameters": { + "data": { + "dtype": "float64", + "distributed_split": "rank_based" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + + "large scale strong 64 parameters": { "data": { "dtype": "float64", "distributed_split": "rank_based" @@ -81,6 +160,7 @@ "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, + "large scale strong two nodes parameters": { "data": { "dtype": "float64", From 08f0aa89d030b644b238a185be5226d884997d89 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 26 Sep 2024 02:11:35 -0700 Subject: [PATCH 040/110] minor --- configs/spmd/large_scale/knn_strong.json | 2 +- configs/spmd/large_scale/large_scale.json | 4 ++-- configs/spmd/large_scale/spmd_for_online.json | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/spmd/large_scale/knn_strong.json b/configs/spmd/large_scale/knn_strong.json index 7682dc5e..a3236c74 100644 --- a/configs/spmd/large_scale/knn_strong.json +++ b/configs/spmd/large_scale/knn_strong.json @@ 
-30,7 +30,7 @@ "common knn parameters", "synthetic classification data", "sklearnex spmd implementation", - "large scale strong 2k parameters", + "large scale strong 32 parameters", "spmd knn cls parameters" ] } diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json index 74388728..55a70fbf 100644 --- a/configs/spmd/large_scale/large_scale.json +++ b/configs/spmd/large_scale/large_scale.json @@ -151,13 +151,13 @@ } }, - "large scale strong 64 parameters": { + "large scale strong 32 parameters": { "data": { "dtype": "float64", "distributed_split": "rank_based" }, "bench": { - "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, diff --git a/configs/spmd/large_scale/spmd_for_online.json b/configs/spmd/large_scale/spmd_for_online.json index 7f258e9b..ec42a050 100644 --- a/configs/spmd/large_scale/spmd_for_online.json +++ b/configs/spmd/large_scale/spmd_for_online.json @@ -51,7 +51,7 @@ }, "TEMPLATES": { "basic_statistics": {"SETS": ["basic_statistics data", "spmd default parameters", "sklearnex spmd implementation", "large scale full one node parameters"]}, - "covariance": {"SETS": ["covariance data", "spmd default parameters","sklearnex spmd implementation", "large scale full one node parameters"]}, + "covariance": {"SETS": ["covariance data", "spmd default parameters", "sklearnex spmd implementation", "large scale full one node parameters"]}, "linear_regression": { "SETS": ["linear_regression data", "spmd default parameters", "sklearnex spmd implementation", "large scale full one node parameters"] }, From 33022127814a52e26fea361fc24c4f304e46f587 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 26 Sep 2024 02:27:15 -0700 Subject: [PATCH 041/110] dbscan --- configs/regular/batch_for_online.json | 2 +- configs/spmd/large_scale/dbscan_strong.json | 31 +++++++++++++++++++++ configs/spmd/large_scale/kmeans_strong.json | 2 +- 3 files changed, 33 insertions(+), 2 deletions(-) create mode 100644 configs/spmd/large_scale/dbscan_strong.json diff --git a/configs/regular/batch_for_online.json b/configs/regular/batch_for_online.json index d4239c65..9e53081b 100644 --- a/configs/regular/batch_for_online.json +++ b/configs/regular/batch_for_online.json @@ -1,7 +1,7 @@ { "INCLUDE": ["../common/sklearn.json"], "PARAMETERS_SETS": { - "common": {"bench": {"n_runs": 10, "time_limit": 60}}, + "common": {"bench": {"n_runs": 10}}, "basic_statistics data": { "data": { "source": "make_blobs", diff --git a/configs/spmd/large_scale/dbscan_strong.json b/configs/spmd/large_scale/dbscan_strong.json new file mode 100644 index 00000000..4e96eafa --- /dev/null +++ b/configs/spmd/large_scale/dbscan_strong.json @@ -0,0 +1,31 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/dbscan.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd dbscan parameters": { + "algorithm": { + "estimator": "DBSCAN", + "estimator_methods": { + "training": "fit" + } + }, + "data": { + "dtype": "float64" + } + }, + "synthetic dataset": { + 
"data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 10, "centers": 10 }, "algorithm": { "eps": 5, "min_samples": 5 } } + ] + } + }, + "TEMPLATES": { + "dbscan": { + "SETS": [ + "synthetic dataset", + "sklearnex spmd implementation", + "large scale strong 32 parameters", + "spmd dbscan parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/kmeans_strong.json b/configs/spmd/large_scale/kmeans_strong.json index 29cfc2e7..6f095af0 100644 --- a/configs/spmd/large_scale/kmeans_strong.json +++ b/configs/spmd/large_scale/kmeans_strong.json @@ -23,7 +23,7 @@ "SETS": [ "synthetic data", "sklearnex spmd implementation", - "large scale strong 2k parameters", + "large scale strong 32 parameters", "spmd kmeans parameters" ] } From 1779a9f7b8b57f79f7031894da657fe9e53072b6 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 26 Sep 2024 21:48:26 -0700 Subject: [PATCH 042/110] config fixes --- configs/spmd/large_scale/knn_strong.json | 2 +- configs/spmd/large_scale/spmd_for_online.json | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/configs/spmd/large_scale/knn_strong.json b/configs/spmd/large_scale/knn_strong.json index a3236c74..4afe8684 100644 --- a/configs/spmd/large_scale/knn_strong.json +++ b/configs/spmd/large_scale/knn_strong.json @@ -20,7 +20,7 @@ "synthetic classification data": { "data": [ { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 5000 }, "generation_kwargs": { "n_samples": 505000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, - { "source": "make_classification", "split_kwargs": { "train_size": 5000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 500500, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + { "source": "make_classification", "split_kwargs": { "train_size": 5000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 505000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } ] } }, diff --git a/configs/spmd/large_scale/spmd_for_online.json b/configs/spmd/large_scale/spmd_for_online.json index ec42a050..11b0b159 100644 --- a/configs/spmd/large_scale/spmd_for_online.json +++ b/configs/spmd/large_scale/spmd_for_online.json @@ -50,11 +50,11 @@ } }, "TEMPLATES": { - "basic_statistics": {"SETS": ["basic_statistics data", "spmd default parameters", "sklearnex spmd implementation", "large scale full one node parameters"]}, - "covariance": {"SETS": ["covariance data", "spmd default parameters", "sklearnex spmd implementation", "large scale full one node parameters"]}, + "basic_statistics": {"SETS": ["basic_statistics data", "sklearnex spmd implementation", "large scale full one node parameters"]}, + "covariance": {"SETS": ["covariance data", "sklearnex spmd implementation", "large scale full one node parameters"]}, "linear_regression": { - "SETS": ["linear_regression data", "spmd default parameters", "sklearnex spmd implementation", "large scale full one node parameters"] + "SETS": ["linear_regression data", "sklearnex spmd implementation", "large scale full one node parameters"] }, - "pca": {"SETS": ["pca data", "spmd default parameters", "sklearnex spmd implementation", "large scale full one node parameters"]} + "pca": {"SETS": ["pca data", "sklearnex spmd implementation", "large scale full one node parameters"]} } } From 4ac119e3e86d6e534ed96f57ab5f371264d8a6d4 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 26 Sep 2024 22:08:09 -0700 Subject: 
[PATCH 043/110] config fix --- configs/spmd/large_scale/spmd_for_online.json | 44 +++++++++++++++++-- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/configs/spmd/large_scale/spmd_for_online.json b/configs/spmd/large_scale/spmd_for_online.json index 11b0b159..53ac660e 100644 --- a/configs/spmd/large_scale/spmd_for_online.json +++ b/configs/spmd/large_scale/spmd_for_online.json @@ -47,14 +47,50 @@ }, "split_kwargs": {"ignore": true} } + }, + "basic_statistics": { + "algorithm": [ + { + "estimator": "BasicStatistics", + "library": "sklearnex.spmd", + "estimator_methods": {"training": "fit"} + } + ] + }, + "covariance": { + "algorithm": [ + { + "estimator": "EmpiricalCovariance", + "library": "sklearnex.spmd.covariance", + "estimator_methods": {"training": "fit"} + } + ] + }, + "linear_regression": { + "algorithm": [ + { + "estimator": "LinearRegression", + "library": "sklearnex.spmd.linear_model", + "estimator_methods": {"training": "fit"} + } + ] + }, + "pca": { + "algorithm": [ + { + "estimator": "PCA", + "library": "sklearnex.spmd.decomposition", + "estimator_methods": {"training": "fit"} + } + ] } }, "TEMPLATES": { - "basic_statistics": {"SETS": ["basic_statistics data", "sklearnex spmd implementation", "large scale full one node parameters"]}, - "covariance": {"SETS": ["covariance data", "sklearnex spmd implementation", "large scale full one node parameters"]}, + "basic_statistics": {"SETS": ["basic_statistics", "basic_statistics data", "sklearnex spmd implementation", "large scale full one node parameters"]}, + "covariance": {"SETS": ["covariance", "covariance data", "sklearnex spmd implementation", "large scale full one node parameters"]}, "linear_regression": { - "SETS": ["linear_regression data", "sklearnex spmd implementation", "large scale full one node parameters"] + "SETS": ["linear_regression", "linear_regression data", "sklearnex spmd implementation", "large scale full one node parameters"] }, - "pca": {"SETS": ["pca data", "sklearnex spmd implementation", "large scale full one node parameters"]} + "pca": {"SETS": ["pca", "pca data", "sklearnex spmd implementation", "large scale full one node parameters"]} } } From 902f0ec0df358d91fa200c42771c9ad96a169ecb Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Fri, 27 Sep 2024 06:25:12 -0700 Subject: [PATCH 044/110] forest regression --- configs/spmd/large_scale/forest_strong_reg.json | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/configs/spmd/large_scale/forest_strong_reg.json b/configs/spmd/large_scale/forest_strong_reg.json index 71afeee6..a5a0c253 100644 --- a/configs/spmd/large_scale/forest_strong_reg.json +++ b/configs/spmd/large_scale/forest_strong_reg.json @@ -8,7 +8,9 @@ }, "synthetic data": { "data": [ - { "source": "make_regression", "generation_kwargs": { "n_samples": 10000000, "n_features": 100, "noise": 1.25 }, "split_kwargs": { "train_size": 10000000, "test_size": 5000 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } }} + { "source": "make_regression", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 9090000, "test_size": 100000 }, "algorithm": { "estimator_params": { "n_estimators": 5, "max_depth": 4 } }}, + { "source": "make_regression", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 90000, "test_size": 10000 }, "algorithm": { "estimator_params": { "n_estimators": 10, "max_depth": 4 } }} + ] } }, @@ -16,7 +18,7 @@ 
"forestReg": { "SETS": [ "sklearnex spmd implementation", - "large scale strong 2k parameters", + "large scale strong 32 parameters", "synthetic data", "spmd forest regression parameters" ] From d40389eec8c72207174e8f9fdf9e67dedf0c518b Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Fri, 27 Sep 2024 06:35:13 -0700 Subject: [PATCH 045/110] forest regression --- configs/spmd/large_scale/forest_reg.json | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/configs/spmd/large_scale/forest_reg.json b/configs/spmd/large_scale/forest_reg.json index ab2a6920..3191eb28 100644 --- a/configs/spmd/large_scale/forest_reg.json +++ b/configs/spmd/large_scale/forest_reg.json @@ -8,7 +8,9 @@ }, "synthetic data": { "data": [ - { "source": "make_regression", "generation_kwargs": { "n_samples": 10000000, "n_features": 100, "noise": 1.25 }, "split_kwargs": { "train_size": 10000000, "test_size": 5000 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } }} + { "source": "make_regression", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 9090000, "test_size": 100000 }, "algorithm": { "estimator_params": { "n_estimators": 5, "max_depth": 4 } }}, + { "source": "make_regression", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 90000, "test_size": 10000 }, "algorithm": { "estimator_params": { "n_estimators": 10, "max_depth": 4 } }} + ] } }, From 906de020380c5b23336374ceaf33a54eaf47e294 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Fri, 27 Sep 2024 07:06:51 -0700 Subject: [PATCH 046/110] forest regression --- configs/spmd/large_scale/forest_reg.json | 4 ++-- configs/spmd/large_scale/forest_strong_reg.json | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/spmd/large_scale/forest_reg.json b/configs/spmd/large_scale/forest_reg.json index 3191eb28..daab32c4 100644 --- a/configs/spmd/large_scale/forest_reg.json +++ b/configs/spmd/large_scale/forest_reg.json @@ -8,8 +8,8 @@ }, "synthetic data": { "data": [ - { "source": "make_regression", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 9090000, "test_size": 100000 }, "algorithm": { "estimator_params": { "n_estimators": 5, "max_depth": 4 } }}, - { "source": "make_regression", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 90000, "test_size": 10000 }, "algorithm": { "estimator_params": { "n_estimators": 10, "max_depth": 4 } }} + { "source": "make_regression", "generation_kwargs": { "n_samples": 1000000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 900000, "test_size": 100000 }, "algorithm": { "estimator_params": { "n_estimators": 5, "max_depth": 4 } }}, + { "source": "make_regression", "generation_kwargs": { "n_samples": 100000, "n_features": 100, "noise": 1.25 }, "split_kwargs": { "train_size": 90000, "test_size": 10000 }, "algorithm": { "estimator_params": { "n_estimators": 10, "max_depth": 4 } }} ] } diff --git a/configs/spmd/large_scale/forest_strong_reg.json b/configs/spmd/large_scale/forest_strong_reg.json index a5a0c253..8d738389 100644 --- a/configs/spmd/large_scale/forest_strong_reg.json +++ b/configs/spmd/large_scale/forest_strong_reg.json @@ -8,8 +8,8 @@ }, "synthetic data": { "data": [ - { "source": "make_regression", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "noise": 1.25 }, 
"split_kwargs": { "train_size": 9090000, "test_size": 100000 }, "algorithm": { "estimator_params": { "n_estimators": 5, "max_depth": 4 } }}, - { "source": "make_regression", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 90000, "test_size": 10000 }, "algorithm": { "estimator_params": { "n_estimators": 10, "max_depth": 4 } }} + { "source": "make_regression", "generation_kwargs": { "n_samples": 1000000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 900000, "test_size": 10000 }, "algorithm": { "estimator_params": { "n_estimators": 5, "max_depth": 4 } }}, + { "source": "make_regression", "generation_kwargs": { "n_samples": 100000, "n_features": 100, "noise": 1.25 }, "split_kwargs": { "train_size": 90000, "test_size": 10000 }, "algorithm": { "estimator_params": { "n_estimators": 10, "max_depth": 4 } }} ] } From 7348b42870900b75cc99842564516f8bf082fe22 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 1 Oct 2024 16:56:52 -0700 Subject: [PATCH 047/110] kmeans and logreg update --- configs/spmd/large_scale/kmeans.json | 5 ++--- configs/spmd/large_scale/logreg.json | 3 ++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/spmd/large_scale/kmeans.json b/configs/spmd/large_scale/kmeans.json index 89524965..caba8a79 100644 --- a/configs/spmd/large_scale/kmeans.json +++ b/configs/spmd/large_scale/kmeans.json @@ -12,9 +12,8 @@ }, "synthetic data": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, - { "source": "make_blobs", "generation_kwargs": { "n_samples": 30000, "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, - { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 100, "centers": 100 }, "algorithm": { "n_clusters": 100, "max_iter": 100 } } + { "source": "make_blobs", "generation_kwargs": { "n_samples": 3750000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 18750, "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } } ] } }, diff --git a/configs/spmd/large_scale/logreg.json b/configs/spmd/large_scale/logreg.json index bbd18f3b..a23a745e 100644 --- a/configs/spmd/large_scale/logreg.json +++ b/configs/spmd/large_scale/logreg.json @@ -11,7 +11,8 @@ "synthetic data": { "data": [ { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 10, "n_classes": 2 } }, - { "source": "make_classification", "split_kwargs": { "train_size": 100000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 101000, "n_features": 1000, "n_classes": 2 } } + { "source": "make_classification", "split_kwargs": { "train_size": 100000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 101000, "n_features": 1000, "n_classes": 2 } }, + { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 501000, "n_features": 200, "n_classes":2, "n_clusters_per_class": 3, "flip_y":0.05 } } ] } }, From 270c8417a73392e364f5ade6f4f4f83320190286 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 1 Oct 2024 17:24:27 -0700 Subject: [PATCH 048/110] forest reg data same as cls --- configs/spmd/large_scale/forest_reg.json | 4 
++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/spmd/large_scale/forest_reg.json b/configs/spmd/large_scale/forest_reg.json index daab32c4..58cb3962 100644 --- a/configs/spmd/large_scale/forest_reg.json +++ b/configs/spmd/large_scale/forest_reg.json @@ -8,8 +8,8 @@ }, "synthetic data": { "data": [ - { "source": "make_regression", "generation_kwargs": { "n_samples": 1000000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 900000, "test_size": 100000 }, "algorithm": { "estimator_params": { "n_estimators": 5, "max_depth": 4 } }}, - { "source": "make_regression", "generation_kwargs": { "n_samples": 100000, "n_features": 100, "noise": 1.25 }, "split_kwargs": { "train_size": 90000, "test_size": 10000 }, "algorithm": { "estimator_params": { "n_estimators": 10, "max_depth": 4 } }} + { "source": "make_regression", "generation_kwargs": { "n_samples": 501000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } }}, + { "source": "make_regression", "generation_kwargs": { "n_samples": 11000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 10000, "test_size": 1000 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } }} ] } From d172d2a2dc93f7acf31d33918809124510a6709a Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 1 Oct 2024 18:50:19 -0700 Subject: [PATCH 049/110] knn bf16 --- configs/regular/bf16/knn.json | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/configs/regular/bf16/knn.json b/configs/regular/bf16/knn.json index 1a62ef89..049ed492 100644 --- a/configs/regular/bf16/knn.json +++ b/configs/regular/bf16/knn.json @@ -17,9 +17,14 @@ }, "synthetic classification data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 51000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } ] - } + }, + "synthetic regression data": { + "data": [ + { "source": "make_regression", "split_kwargs": { "train_size": 50000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 51000, "n_features": 100, "noise":1.5 } } + ] + } }, "TEMPLATES": { "sklearn brute knn clsf": { @@ -27,7 +32,8 @@ "sklearn-ex[gpu] implementations", "common knn parameters", "sklearn knn parameters", - "synthetic classification data" + "synthetic classification data", + "synthetic regression data" ] } } From 29ea28838d4458aa866e3658f81c0c011c744b3d Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Wed, 2 Oct 2024 11:38:06 -0700 Subject: [PATCH 050/110] cov regular prev --- configs/regular/batch_for_online.json | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/configs/regular/batch_for_online.json b/configs/regular/batch_for_online.json index 9e53081b..2bcdc47f 100644 --- a/configs/regular/batch_for_online.json +++ b/configs/regular/batch_for_online.json @@ -45,6 +45,15 @@ } ] }, + "covariance": { + "algorithm": [ + { + "estimator": "EmpiricalCovariance", + "library": "sklearnex.preview.covariance", + "estimator_methods": {"training": "fit"} + } + ] + }, "linear_regression": { "algorithm": [ { @@ -66,6 +75,7 @@ }, "TEMPLATES": { "basic_statistics": {"SETS": 
["common", "basic_statistics", "basic_statistics data", "sklearn-ex[gpu] implementations"]}, + "covariance": {"SETS": ["common", "basic_statistics data", "sklearn-ex[gpu] implementations", "covariance"]}, "linear_regression": { "SETS": ["common", "linear_regression", "linear_regression data", "sklearn-ex[gpu] implementations"] }, From 13c0514e64227134bfae874f58f557121073b450 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Mon, 7 Oct 2024 22:44:01 +0200 Subject: [PATCH 051/110] Update logreg.json --- configs/regular/bf16/logreg.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/regular/bf16/logreg.json b/configs/regular/bf16/logreg.json index cde74c25..e2babdd1 100644 --- a/configs/regular/bf16/logreg.json +++ b/configs/regular/bf16/logreg.json @@ -10,14 +10,14 @@ "tol": 1e-4, "C": 1.0, "l1_ratio": null, - "max_iter": 200 + "max_iter": 20 } } }, "sklearn logreg parameters": { "algorithm": { "estimator_params": { - "solver": "lbfgs", + "solver": "newton-cg", "n_jobs": "[SPECIAL_VALUE]physical_cpus", "random_state": 42 } From 8532908888208b263918bfecec0ed758bfd8433b Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Mon, 7 Oct 2024 23:17:11 +0200 Subject: [PATCH 052/110] Update ensemble.json --- configs/regular/bf16/ensemble.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/configs/regular/bf16/ensemble.json b/configs/regular/bf16/ensemble.json index d383bcac..556da67a 100644 --- a/configs/regular/bf16/ensemble.json +++ b/configs/regular/bf16/ensemble.json @@ -13,6 +13,9 @@ "bootstrap": true, "random_state": 42 } + }, + "data": { + "dtype": ["float32"] } }, "sklearn ensemble classifier params": { From c3ac4bb08b12d429779f3a5b93f3d591489f2fed Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Mon, 7 Oct 2024 23:17:33 +0200 Subject: [PATCH 053/110] Update kmeans.json --- configs/regular/bf16/kmeans.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/regular/bf16/kmeans.json b/configs/regular/bf16/kmeans.json index 084ae8f4..8a5323c5 100644 --- a/configs/regular/bf16/kmeans.json +++ b/configs/regular/bf16/kmeans.json @@ -14,7 +14,7 @@ "estimator_methods": { "inference": "predict" } }, "data": { - "dtype": ["float32", "float64"], + "dtype": ["float32"], "preprocessing_kwargs": { "normalize": true } } }, From a8d898b1016d5e14df189ee80f896cee205eaf8c Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Mon, 7 Oct 2024 23:18:37 +0200 Subject: [PATCH 054/110] Update knn.json --- configs/regular/bf16/knn.json | 1 + 1 file changed, 1 insertion(+) diff --git a/configs/regular/bf16/knn.json b/configs/regular/bf16/knn.json index 049ed492..527dcbe4 100644 --- a/configs/regular/bf16/knn.json +++ b/configs/regular/bf16/knn.json @@ -9,6 +9,7 @@ } }, "data": { + "dtype": ["float32"], "preprocessing_kwargs": { "normalize": true } } }, From fe90de288ab5439d3a02f77136f252e7c38d0cff Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Mon, 7 Oct 2024 23:19:32 +0200 Subject: [PATCH 055/110] Update logreg.json --- configs/regular/bf16/logreg.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/configs/regular/bf16/logreg.json b/configs/regular/bf16/logreg.json index e2babdd1..863d67f9 100644 --- a/configs/regular/bf16/logreg.json +++ b/configs/regular/bf16/logreg.json @@ -12,6 +12,9 @@ "l1_ratio": null, "max_iter": 20 } + }, + "data": { + "dtype": ["float32"] } }, "sklearn logreg parameters": { From 7ab1cc3a482daefe33dd82c631345f897c904cd7 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Mon, 7 Oct 2024 23:21:22 +0200 Subject: [PATCH 056/110] Update pca.json --- 
configs/regular/bf16/pca.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/configs/regular/bf16/pca.json b/configs/regular/bf16/pca.json index 945c2939..e5113261 100644 --- a/configs/regular/bf16/pca.json +++ b/configs/regular/bf16/pca.json @@ -13,6 +13,9 @@ "iterated_power": 15, "random_state": 42 } + }, + "data": { + "dtype": ["float32"] } }, "synthetic data": { From 595a7ee974b6789a5fdddf89bf3e24adab11cb13 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Mon, 7 Oct 2024 23:22:22 +0200 Subject: [PATCH 057/110] Update linear_model.json --- configs/regular/bf16/linear_model.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/configs/regular/bf16/linear_model.json b/configs/regular/bf16/linear_model.json index 7149e490..393b2c64 100644 --- a/configs/regular/bf16/linear_model.json +++ b/configs/regular/bf16/linear_model.json @@ -10,6 +10,9 @@ "algorithm": { "estimator": "LinearRegression", "estimator_params": { "fit_intercept": true, "copy_X": true } + }, + "data": { + "dtype": ["float32"] } }, "sklearn linear parameters": { From 80257199e245c0685664f541219ee97533f6a1cc Mon Sep 17 00:00:00 2001 From: ethanglaser Date: Mon, 7 Oct 2024 21:45:55 +0000 Subject: [PATCH 058/110] dbscan large scale support and logreg details --- configs/spmd/large_scale/dbscan_strong.json | 32 +++++++++++++++++++++ configs/spmd/large_scale/large_scale.json | 27 +++++++++++++++++ configs/spmd/large_scale/logreg_strong.json | 2 +- sklbench/utils/measurement.py | 7 +++++ 4 files changed, 67 insertions(+), 1 deletion(-) create mode 100644 configs/spmd/large_scale/dbscan_strong.json diff --git a/configs/spmd/large_scale/dbscan_strong.json b/configs/spmd/large_scale/dbscan_strong.json new file mode 100644 index 00000000..1843cd8c --- /dev/null +++ b/configs/spmd/large_scale/dbscan_strong.json @@ -0,0 +1,32 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../../regular/dbscan.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd dbscan parameters": { + "algorithm": { + "estimator": "DBSCAN", + "estimator_methods": { + "training": "fit" + } + }, + "data": { + "dtype": "float64" + } + }, + "synthetic dataset": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 400000, "n_features": 100, "centers": 10 }, "algorithm": { "eps": 5, "min_samples": 5 } } + ] + } + }, + "TEMPLATES": { + "dbscan": { + "SETS": [ + "common dbscan parameters", + "synthetic dataset", + "sklearnex spmd implementation", + "large scale strong parameters", + "spmd dbscan parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json index 72b808fe..06a8db16 100644 --- a/configs/spmd/large_scale/large_scale.json +++ b/configs/spmd/large_scale/large_scale.json @@ -27,6 +27,24 @@ "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, + "large scale <64 parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale >64 parameters": { + 
"data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, "large scale strong 2k parameters": { "data": { "dtype": "float64", @@ -36,6 +54,15 @@ "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, + "large scale strong <64 parameters": { + "data": { + "dtype": "float64", + "distributed_split": "rank_based" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, "large scale impi parameters": { "data": { "dtype": "float64", diff --git a/configs/spmd/large_scale/logreg_strong.json b/configs/spmd/large_scale/logreg_strong.json index 2bf1c0f9..8787f6b6 100644 --- a/configs/spmd/large_scale/logreg_strong.json +++ b/configs/spmd/large_scale/logreg_strong.json @@ -5,7 +5,7 @@ "algorithm":{ "estimator": "LogisticRegression", "estimator_methods": { "inference": "predict" }, - "estimator_params": { "max_iter": 30 } + "estimator_params": { "max_iter": 16 } } }, "synthetic data": { diff --git a/sklbench/utils/measurement.py b/sklbench/utils/measurement.py index df74e8da..ea86d29f 100644 --- a/sklbench/utils/measurement.py +++ b/sklbench/utils/measurement.py @@ -72,12 +72,16 @@ def measure_time( ) times = [] func_return_value = None + inners, iters = [], [] while len(times) < n_runs: if enable_itt and itt_is_available: itt.resume() t0 = timeit.default_timer() func_return_value = func(*args, **kwargs) t1 = timeit.default_timer() + if hasattr(func.__self__, "_n_inner_iter"): + inners.append(func.__self__._n_inner_iter) + iters.append(func.__self__.n_iter_) if enable_itt and itt_is_available: itt.pause() times.append(t1 - t0) @@ -88,6 +92,9 @@ def measure_time( f"exceeded time limit ({time_limit} seconds)" ) break + from mpi4py import MPI + if MPI.COMM_WORLD.Get_rank() == 0: + logger.debug("iters across n runs: " + str(iters) + ", inner iters across n runs: " + str(inners)) logger.debug(times) #mean, std = box_filter(times) #if std / mean > std_mean_ratio: From fcaa9077f9f987450642a7ef4d42924b5551780e Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 8 Oct 2024 00:51:01 -0400 Subject: [PATCH 059/110] reformat --- configs/regular/batch_for_online.json | 3 ++- configs/spmd/large_scale/basic_stats.json | 2 +- configs/spmd/large_scale/basic_stats_single.json | 2 +- configs/spmd/large_scale/basic_stats_strong.json | 2 +- configs/spmd/large_scale/covariance.json | 2 +- configs/spmd/large_scale/covariance_strong.json | 2 +- configs/spmd/large_scale/dbscan.json | 2 +- configs/spmd/large_scale/dbscan_strong.json | 2 +- configs/spmd/large_scale/forest.json | 2 +- configs/spmd/large_scale/forest_reg.json | 2 +- configs/spmd/large_scale/forest_strong.json | 2 +- configs/spmd/large_scale/forest_strong_reg.json | 2 +- 
configs/spmd/large_scale/knn.json | 2 +- configs/spmd/large_scale/knn_strong.json | 2 +- configs/spmd/large_scale/linear_model.json | 2 +- configs/spmd/large_scale/linear_model_strong.json | 2 +- configs/spmd/large_scale/logreg.json | 11 ----------- configs/spmd/large_scale/logreg_2.json | 4 ++-- configs/spmd/large_scale/logreg_strong.json | 4 ++-- configs/spmd/large_scale/logreg_strong_2.json | 4 ++-- configs/spmd/large_scale/pca.json | 4 ++-- configs/spmd/large_scale/pca_single.json | 4 ++-- configs/spmd/large_scale/pca_strong.json | 4 ++-- 23 files changed, 29 insertions(+), 39 deletions(-) diff --git a/configs/regular/batch_for_online.json b/configs/regular/batch_for_online.json index 2bcdc47f..973c4ed4 100644 --- a/configs/regular/batch_for_online.json +++ b/configs/regular/batch_for_online.json @@ -81,4 +81,5 @@ }, "pca": {"SETS": ["common", "pca", "pca data", "sklearn-ex[gpu] implementations"]} } -} \ No newline at end of file +} + diff --git a/configs/spmd/large_scale/basic_stats.json b/configs/spmd/large_scale/basic_stats.json index b484b647..9ac4725f 100644 --- a/configs/spmd/large_scale/basic_stats.json +++ b/configs/spmd/large_scale/basic_stats.json @@ -22,7 +22,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale 2k parameters", - "synthetic data", + "synthetic data", "spmd basicstats parameters" ] } diff --git a/configs/spmd/large_scale/basic_stats_single.json b/configs/spmd/large_scale/basic_stats_single.json index e106b2a9..832bd3b2 100644 --- a/configs/spmd/large_scale/basic_stats_single.json +++ b/configs/spmd/large_scale/basic_stats_single.json @@ -22,7 +22,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale one node parameters", - "synthetic data", + "synthetic data", "spmd basicstats parameters" ] } diff --git a/configs/spmd/large_scale/basic_stats_strong.json b/configs/spmd/large_scale/basic_stats_strong.json index 6527d8e5..b7aa22cb 100644 --- a/configs/spmd/large_scale/basic_stats_strong.json +++ b/configs/spmd/large_scale/basic_stats_strong.json @@ -21,7 +21,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale strong 2k parameters", - "synthetic data", + "synthetic data", "spmd basicstats parameters" ] } diff --git a/configs/spmd/large_scale/covariance.json b/configs/spmd/large_scale/covariance.json index e4d0477a..260befd0 100644 --- a/configs/spmd/large_scale/covariance.json +++ b/configs/spmd/large_scale/covariance.json @@ -22,7 +22,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale 2k parameters", - "synthetic data", + "synthetic data", "spmd basicstats parameters" ] } diff --git a/configs/spmd/large_scale/covariance_strong.json b/configs/spmd/large_scale/covariance_strong.json index 2b9c5dd0..568b4a8f 100644 --- a/configs/spmd/large_scale/covariance_strong.json +++ b/configs/spmd/large_scale/covariance_strong.json @@ -21,7 +21,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale strong 2k parameters", - "synthetic data", + "synthetic data", "spmd basicstats parameters" ] } diff --git a/configs/spmd/large_scale/dbscan.json b/configs/spmd/large_scale/dbscan.json index b17e2cd8..c46287d8 100644 --- a/configs/spmd/large_scale/dbscan.json +++ b/configs/spmd/large_scale/dbscan.json @@ -24,7 +24,7 @@ "common dbscan parameters", "synthetic dataset", "sklearnex spmd implementation", - "large scale default parameters", + "large scale default parameters", "spmd dbscan parameters" ] } diff --git a/configs/spmd/large_scale/dbscan_strong.json b/configs/spmd/large_scale/dbscan_strong.json index 4e96eafa..05b00d39 100644 --- 
a/configs/spmd/large_scale/dbscan_strong.json +++ b/configs/spmd/large_scale/dbscan_strong.json @@ -23,7 +23,7 @@ "SETS": [ "synthetic dataset", "sklearnex spmd implementation", - "large scale strong 32 parameters", + "large scale strong 32 parameters", "spmd dbscan parameters" ] } diff --git a/configs/spmd/large_scale/forest.json b/configs/spmd/large_scale/forest.json index 5aa3d36f..9cab46be 100644 --- a/configs/spmd/large_scale/forest.json +++ b/configs/spmd/large_scale/forest.json @@ -18,7 +18,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale 2k parameters", - "synthetic data", + "synthetic data", "spmd forest classification parameters" ] } diff --git a/configs/spmd/large_scale/forest_reg.json b/configs/spmd/large_scale/forest_reg.json index 58cb3962..a5ec73cd 100644 --- a/configs/spmd/large_scale/forest_reg.json +++ b/configs/spmd/large_scale/forest_reg.json @@ -19,7 +19,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale 2k parameters", - "synthetic data", + "synthetic data", "spmd forest regression parameters" ] } diff --git a/configs/spmd/large_scale/forest_strong.json b/configs/spmd/large_scale/forest_strong.json index 14690846..9259e7ea 100644 --- a/configs/spmd/large_scale/forest_strong.json +++ b/configs/spmd/large_scale/forest_strong.json @@ -17,7 +17,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale strong 2k parameters", - "synthetic data", + "synthetic data", "spmd forest classification parameters" ] } diff --git a/configs/spmd/large_scale/forest_strong_reg.json b/configs/spmd/large_scale/forest_strong_reg.json index 8d738389..305e729b 100644 --- a/configs/spmd/large_scale/forest_strong_reg.json +++ b/configs/spmd/large_scale/forest_strong_reg.json @@ -19,7 +19,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale strong 32 parameters", - "synthetic data", + "synthetic data", "spmd forest regression parameters" ] } diff --git a/configs/spmd/large_scale/knn.json b/configs/spmd/large_scale/knn.json index cfd096cf..f1e0678d 100644 --- a/configs/spmd/large_scale/knn.json +++ b/configs/spmd/large_scale/knn.json @@ -30,7 +30,7 @@ "common knn parameters", "synthetic classification data", "sklearnex spmd implementation", - "large scale 2k parameters", + "large scale 2k parameters", "spmd knn cls parameters" ] } diff --git a/configs/spmd/large_scale/knn_strong.json b/configs/spmd/large_scale/knn_strong.json index 4afe8684..67398123 100644 --- a/configs/spmd/large_scale/knn_strong.json +++ b/configs/spmd/large_scale/knn_strong.json @@ -30,7 +30,7 @@ "common knn parameters", "synthetic classification data", "sklearnex spmd implementation", - "large scale strong 32 parameters", + "large scale strong 32 parameters", "spmd knn cls parameters" ] } diff --git a/configs/spmd/large_scale/linear_model.json b/configs/spmd/large_scale/linear_model.json index e208da7d..f9d17b5b 100644 --- a/configs/spmd/large_scale/linear_model.json +++ b/configs/spmd/large_scale/linear_model.json @@ -19,7 +19,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale 2k parameters", - "synthetic data", + "synthetic data", "spmd linear parameters" ] } diff --git a/configs/spmd/large_scale/linear_model_strong.json b/configs/spmd/large_scale/linear_model_strong.json index 9d8c3533..77a9c79e 100644 --- a/configs/spmd/large_scale/linear_model_strong.json +++ b/configs/spmd/large_scale/linear_model_strong.json @@ -18,7 +18,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale strong 2k parameters", - "synthetic data", + "synthetic data", "spmd linear parameters" ] } diff 
--git a/configs/spmd/large_scale/logreg.json b/configs/spmd/large_scale/logreg.json index 9fbaee71..c5ef6203 100644 --- a/configs/spmd/large_scale/logreg.json +++ b/configs/spmd/large_scale/logreg.json @@ -10,14 +10,8 @@ }, "synthetic data": { "data": [ -<<<<<<< HEAD - { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 10, "n_classes": 2 } }, - { "source": "make_classification", "split_kwargs": { "train_size": 100000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 101000, "n_features": 1000, "n_classes": 2 } }, - { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 501000, "n_features": 200, "n_classes":2, "n_clusters_per_class": 3, "flip_y":0.05 } } -======= { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 10, "n_classes": 2 } }, { "source": "make_classification", "split_kwargs": { "train_size": 100000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 101000, "n_features": 1000, "n_classes": 2 } } ->>>>>>> oleg_online/inc-dist-support ] } }, @@ -27,13 +21,8 @@ "sklearnex spmd implementation", "large scale 2k parameters", "spmd logreg parameters", -<<<<<<< HEAD - "synthetic data", - "spmd logreg2 parameters" -======= "synthetic data", "spmd logreg2 parameters" ->>>>>>> oleg_online/inc-dist-support ] } } diff --git a/configs/spmd/large_scale/logreg_2.json b/configs/spmd/large_scale/logreg_2.json index d18b2293..796eb8ad 100644 --- a/configs/spmd/large_scale/logreg_2.json +++ b/configs/spmd/large_scale/logreg_2.json @@ -21,8 +21,8 @@ "sklearnex spmd implementation", "large scale two nodes parameters", "spmd logreg parameters", - "synthetic data", - "spmd logreg2 parameters" + "synthetic data", + "spmd logreg2 parameters" ] } } diff --git a/configs/spmd/large_scale/logreg_strong.json b/configs/spmd/large_scale/logreg_strong.json index a6efd969..2bf1c0f9 100644 --- a/configs/spmd/large_scale/logreg_strong.json +++ b/configs/spmd/large_scale/logreg_strong.json @@ -20,8 +20,8 @@ "sklearnex spmd implementation", "large scale strong 2k parameters", "spmd logreg parameters", - "synthetic data", - "spmd logreg2 parameters" + "synthetic data", + "spmd logreg2 parameters" ] } } diff --git a/configs/spmd/large_scale/logreg_strong_2.json b/configs/spmd/large_scale/logreg_strong_2.json index 1a940d90..998e3bb7 100644 --- a/configs/spmd/large_scale/logreg_strong_2.json +++ b/configs/spmd/large_scale/logreg_strong_2.json @@ -20,8 +20,8 @@ "sklearnex spmd implementation", "large scale strong two nodes parameters", "spmd logreg parameters", - "synthetic data", - "spmd logreg2 parameters" + "synthetic data", + "spmd logreg2 parameters" ] } } diff --git a/configs/spmd/large_scale/pca.json b/configs/spmd/large_scale/pca.json index 3b9da126..9a6a6b02 100644 --- a/configs/spmd/large_scale/pca.json +++ b/configs/spmd/large_scale/pca.json @@ -20,10 +20,10 @@ "TEMPLATES": { "linreg": { "SETS": [ - "sklearnex spmd implementation", + "sklearnex spmd implementation", "large scale 2k parameters", "synthetic data", - "spmd pca parameters" + "spmd pca parameters" ] } } diff --git a/configs/spmd/large_scale/pca_single.json b/configs/spmd/large_scale/pca_single.json index 61b2cf15..07775a6a 100644 --- a/configs/spmd/large_scale/pca_single.json +++ b/configs/spmd/large_scale/pca_single.json @@ -20,10 +20,10 @@ 
"TEMPLATES": { "linreg": { "SETS": [ - "sklearnex spmd implementation", + "sklearnex spmd implementation", "large scale one node parameters", "synthetic data", - "spmd pca parameters" + "spmd pca parameters" ] } } diff --git a/configs/spmd/large_scale/pca_strong.json b/configs/spmd/large_scale/pca_strong.json index 2d302340..9063c22e 100644 --- a/configs/spmd/large_scale/pca_strong.json +++ b/configs/spmd/large_scale/pca_strong.json @@ -19,10 +19,10 @@ "TEMPLATES": { "linreg": { "SETS": [ - "sklearnex spmd implementation", + "sklearnex spmd implementation", "large scale strong 2k parameters", "synthetic data", - "spmd pca parameters" + "spmd pca parameters" ] } } From a4653a12d0d6e997961cb0a976031e7c37a250a5 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 8 Oct 2024 01:03:07 -0400 Subject: [PATCH 060/110] knn bf16 --- configs/regular/bf16/knn.json | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/configs/regular/bf16/knn.json b/configs/regular/bf16/knn.json index 527dcbe4..c39b577e 100644 --- a/configs/regular/bf16/knn.json +++ b/configs/regular/bf16/knn.json @@ -17,11 +17,19 @@ "algorithm": { "estimator_params": { "n_jobs": "[SPECIAL_VALUE]physical_cpus" } } }, "synthetic classification data": { + "algorithm": { + "estimator": "KNeighborsClassifier", + "estimator_params": { "algorithm": "brute", "metric": "minkowski", "p": [1, 2] } + }, "data": [ { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 51000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } ] }, "synthetic regression data": { + "algorithm": { + "estimator": "KNeighborsRegressor", + "estimator_params": { "algorithm": "brute", "metric": "minkowski", "p": [1, 2] } + }, "data": [ { "source": "make_regression", "split_kwargs": { "train_size": 50000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 51000, "n_features": 100, "noise":1.5 } } ] @@ -33,7 +41,14 @@ "sklearn-ex[gpu] implementations", "common knn parameters", "sklearn knn parameters", - "synthetic classification data", + "synthetic classification data" + ] + }, + "sklearn brute knn reg": { + "SETS": [ + "sklearn-ex[gpu] implementations", + "common knn parameters", + "sklearn knn parameters", "synthetic regression data" ] } From 4f65e1faff0694ede8704ed5b88360e8944e9e0b Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 8 Oct 2024 01:24:35 -0400 Subject: [PATCH 061/110] add bf16 cases --- configs/regular/bf16/basic_statistics.json | 27 +++++++++++++++++++++ configs/regular/bf16/covariance.json | 28 ++++++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 configs/regular/bf16/basic_statistics.json create mode 100644 configs/regular/bf16/covariance.json diff --git a/configs/regular/bf16/basic_statistics.json b/configs/regular/bf16/basic_statistics.json new file mode 100644 index 00000000..671521ab --- /dev/null +++ b/configs/regular/bf16/basic_statistics.json @@ -0,0 +1,27 @@ +{ + "INCLUDE": ["../../common/sklearn.json"], + "PARAMETERS_SETS": { + "basic stats parameters": { + "algorithm": { + "estimator": "BasicStatistics" + }, + "data": { + "dtype": ["float32"] + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "basic_statistics": { + "SETS": [ + "sklearn-ex[gpu] implementations", + "basic stats parameters", + "synthetic data" + ] + } + } +} diff --git 
a/configs/regular/bf16/covariance.json b/configs/regular/bf16/covariance.json new file mode 100644 index 00000000..1cd6ef4a --- /dev/null +++ b/configs/regular/bf16/covariance.json @@ -0,0 +1,28 @@ +{ + "INCLUDE": ["../../common/sklearn.json"], + "PARAMETERS_SETS": { + "covariance parameters": { + "algorithm": { + "estimator": "EmpiricalCovariance", + "library": "sklearnex.preview.covariance" + }, + "data": { + "dtype": ["float32"] + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "covariance": { + "SETS": [ + "sklearn-ex[gpu] implementations", + "covariance parameters", + "synthetic data" + ] + } + } +} From c8522797fb6d02163f85bf6582e9d5eb585807d6 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 8 Oct 2024 01:47:32 -0400 Subject: [PATCH 062/110] forest bf16 --- configs/regular/bf16/ensemble.json | 48 ------------------------------ configs/regular/bf16/forest.json | 34 +++++++++++++++++++++ 2 files changed, 34 insertions(+), 48 deletions(-) delete mode 100644 configs/regular/bf16/ensemble.json create mode 100644 configs/regular/bf16/forest.json diff --git a/configs/regular/bf16/ensemble.json b/configs/regular/bf16/ensemble.json deleted file mode 100644 index 556da67a..00000000 --- a/configs/regular/bf16/ensemble.json +++ /dev/null @@ -1,48 +0,0 @@ -{ - "INCLUDE": ["../../common/sklearn.json"], - "PARAMETERS_SETS": { - "common ensemble params": { - "algorithm": { - "estimator_params": { - "n_estimators": 200, - "max_depth": 16, - "max_samples": 1.0, - "min_samples_split": 5, - "min_samples_leaf": 2, - "min_impurity_decrease": 0.0, - "bootstrap": true, - "random_state": 42 - } - }, - "data": { - "dtype": ["float32"] - } - }, - "sklearn ensemble classifier params": { - "algorithm": { - "estimator": ["RandomForestClassifier", "ExtraTreesClassifier"], - "estimator_params": { - "criterion": "gini", - "max_features": "sqrt", - "max_leaf_nodes": null, - "n_jobs": "[SPECIAL_VALUE]physical_cpus" - } - } - }, - "synthetic data": { - "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 501000, "n_features": 10, "n_classes": 2 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } } } - ] - } - }, - "TEMPLATES": { - "sklearn ensemble classification": { - "SETS": [ - "sklearn-ex[gpu] implementations", - "common ensemble params", - "sklearn ensemble classifier params", - "synthetic data" - ] - } - } -} diff --git a/configs/regular/bf16/forest.json b/configs/regular/bf16/forest.json new file mode 100644 index 00000000..845b73a2 --- /dev/null +++ b/configs/regular/bf16/forest.json @@ -0,0 +1,34 @@ +{ + "INCLUDE": ["../../common/sklearn.json"], + "PARAMETERS_SETS": { + "common forest params": { + "data": { + "dtype": ["float32"] + } + }, + "forest classifier params": { + "algorithm": {"estimator": "RandomForestClassifier"}, + "data": { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 501000, "n_features": 10, "n_classes": 2 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } } } + }, + "forest regression params": { + "algorithm": {"estimator": "RandomForestRegressor"}, + "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 501000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 500000, "test_size": 1000 }, 
"algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } }} + } + }, + "TEMPLATES": { + "forest cls": { + "SETS": [ + "sklearn-ex[gpu] implementations", + "common forest params", + "forest classifier params" + ] + }, + "forest reg": { + "SETS": [ + "sklearn-ex[gpu] implementations", + "common forest params", + "forest regression params" + ] + } + } +} From 698d884441b27a68b6718ea5d09b0f837bac9a26 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 8 Oct 2024 03:33:12 -0400 Subject: [PATCH 063/110] incremental --- configs/incremental.json | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/configs/incremental.json b/configs/incremental.json index c9ffb19c..e1f589a4 100644 --- a/configs/incremental.json +++ b/configs/incremental.json @@ -1,4 +1,4 @@ -{ +{ "INCLUDE": ["./common/sklearn.json"], "PARAMETERS_SETS": { "common": {"bench": {"n_runs": 10, "time_limit": 60}}, "covariance data": { @@ -7,8 +7,8 @@ "source": "make_blobs", "generation_kwargs": { "centers": 1, - "n_samples": 1000, - "n_features": [16, 64] + "n_samples": 12000000, + "n_features": [10, 100] }, "split_kwargs": {"ignore": true} } @@ -19,8 +19,8 @@ "source": "make_blobs", "generation_kwargs": { "centers": 1, - "n_samples": 10000, - "n_features": [16, 64] + "n_samples": 12000000, + "n_features": [10, 100] }, "split_kwargs": {"ignore": true} } @@ -30,8 +30,8 @@ "source": "make_regression", "split_kwargs": {"train_size": 0.2, "test_size": 0.8}, "generation_kwargs": { - "n_samples": 5000, - "n_features": [40, 100], + "n_samples": 12000000, + "n_features": [10, 100], "n_informative": 5, "noise": 2.0 } @@ -42,8 +42,8 @@ "source": "make_blobs", "generation_kwargs": { "centers": 1, - "n_samples": 1000, - "n_features": [16, 64] + "n_samples": 12000000, + "n_features": [10, 100] }, "split_kwargs": {"ignore": true} } @@ -54,16 +54,17 @@ "estimator": "IncrementalEmpiricalCovariance", "library": "sklearnex.covariance", "estimator_methods": {"training": "partial_fit"}, - "num_batches": {"training": 2} + "num_batches": {"training": 12} } ] - }, + }, "basic_statistics": { "algorithm": [ { "estimator": "IncrementalBasicStatistics", "library": "sklearnex.basic_statistics", - "num_batches": {"training": 2} + "estimator_methods": {"training": "partial_fit"}, + "num_batches": {"training": 12} } ] }, @@ -72,7 +73,8 @@ { "estimator": "IncrementalLinearRegression", "library": "sklearnex.linear_model", - "num_batches": {"training": 2} + "estimator_methods": {"training": "partial_fit"}, + "num_batches": {"training": 12} } ] }, @@ -81,16 +83,18 @@ { "estimator": "IncrementalPCA", "library": "sklearnex.preview.decomposition", - "num_batches": {"training": 2} - } + "estimator_methods": {"training": "partial_fit"}, + "num_batches": {"training": 12} + } ] } }, "TEMPLATES": { - "covariance": {"SETS": ["common", "covariance", "covariance data"]}, + "basic_statistics": {"SETS": ["common", "basic_statistics", "basic_statistics data", "sklearn-ex[gpu] implementations"]}, + "covariance": {"SETS": ["common", "covariance", "covariance data", "sklearn-ex[gpu] implementations"]}, "linear_regression": { - "SETS": ["common", "linear_regression", "linear_regression data"] + "SETS": ["common", "linear_regression", "linear_regression data", "sklearn-ex[gpu] implementations"] }, - "pca": {"SETS": ["common", "pca", "pca data"]} + "pca": {"SETS": ["common", "pca", "pca data", "sklearn-ex[gpu] implementations"]} } } From 5592d315512f76d0e47008e8b442b6308470e1bd Mon Sep 17 00:00:00 2001 From: Md 
Shafiul Alam Date: Tue, 8 Oct 2024 14:34:31 -0400 Subject: [PATCH 064/110] spmd online --- configs/spmd/large_scale/spmd_for_online.json | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/configs/spmd/large_scale/spmd_for_online.json b/configs/spmd/large_scale/spmd_for_online.json index 53ac660e..7a0a08b4 100644 --- a/configs/spmd/large_scale/spmd_for_online.json +++ b/configs/spmd/large_scale/spmd_for_online.json @@ -61,7 +61,7 @@ "algorithm": [ { "estimator": "EmpiricalCovariance", - "library": "sklearnex.spmd.covariance", + "library": "sklearnex.spmd", "estimator_methods": {"training": "fit"} } ] @@ -70,16 +70,16 @@ "algorithm": [ { "estimator": "LinearRegression", - "library": "sklearnex.spmd.linear_model", + "library": "sklearnex.spmd", "estimator_methods": {"training": "fit"} } ] }, - "pca": { + "decomposition": { "algorithm": [ { "estimator": "PCA", - "library": "sklearnex.spmd.decomposition", + "library": "sklearnex.spmd", "estimator_methods": {"training": "fit"} } ] @@ -91,6 +91,6 @@ "linear_regression": { "SETS": ["linear_regression", "linear_regression data", "sklearnex spmd implementation", "large scale full one node parameters"] }, - "pca": {"SETS": ["pca", "pca data", "sklearnex spmd implementation", "large scale full one node parameters"]} + "pca": {"SETS": ["decomposition", "pca data", "sklearnex spmd implementation", "large scale full one node parameters"]} } } From c47649a1cf678747ef746213b1156daa2085ee80 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 8 Oct 2024 14:58:42 -0400 Subject: [PATCH 065/110] fix --- configs/spmd/large_scale/spmd_for_online.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/spmd/large_scale/spmd_for_online.json b/configs/spmd/large_scale/spmd_for_online.json index 7a0a08b4..2ef60f5b 100644 --- a/configs/spmd/large_scale/spmd_for_online.json +++ b/configs/spmd/large_scale/spmd_for_online.json @@ -75,12 +75,12 @@ } ] }, - "decomposition": { + "pca": { "algorithm": [ { "estimator": "PCA", "library": "sklearnex.spmd", - "estimator_methods": {"training": "fit"} + "estimator_methods": {"training": "fit", "inference": ""} } ] } @@ -91,6 +91,6 @@ "linear_regression": { "SETS": ["linear_regression", "linear_regression data", "sklearnex spmd implementation", "large scale full one node parameters"] }, - "pca": {"SETS": ["decomposition", "pca data", "sklearnex spmd implementation", "large scale full one node parameters"]} + "pca": {"SETS": ["pca", "pca data", "sklearnex spmd implementation", "large scale full one node parameters"]} } } From 687178b5d8b4af725aef323ef06c2cfcf61089e0 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 10 Oct 2024 03:02:25 -0400 Subject: [PATCH 066/110] incremental spmd --- .../large_scale/incremental/basic_stats.json | 31 +++++++++++++++++++ .../large_scale/incremental/covariance.json | 31 +++++++++++++++++++ .../large_scale/incremental/linear_model.json | 28 +++++++++++++++++ configs/spmd/large_scale/incremental/pca.json | 31 +++++++++++++++++++ 4 files changed, 121 insertions(+) create mode 100644 configs/spmd/large_scale/incremental/basic_stats.json create mode 100644 configs/spmd/large_scale/incremental/covariance.json create mode 100644 configs/spmd/large_scale/incremental/linear_model.json create mode 100644 configs/spmd/large_scale/incremental/pca.json diff --git a/configs/spmd/large_scale/incremental/basic_stats.json b/configs/spmd/large_scale/incremental/basic_stats.json new file mode 100644 index 00000000..88ad4c8d --- /dev/null +++ 
b/configs/spmd/large_scale/incremental/basic_stats.json @@ -0,0 +1,31 @@ +{ + "INCLUDE": ["../../../common/sklearn.json", "../../../spmd/stats_covariance.json", "../large_scale.json"], + "PARAMETERS_SETS": { + "spmd basicstats parameters": { + "algorithm": { + "estimator": "IncrementalBasicStatistics", + "estimator_methods": { "training": "fit" }, + "num_batches": {"training": 10} + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "basicstats": { + "SETS": [ + "sklearnex spmd implementation", + "large scale 64 parameters", + "synthetic data", + "spmd basicstats parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/incremental/covariance.json b/configs/spmd/large_scale/incremental/covariance.json new file mode 100644 index 00000000..06c8e4ca --- /dev/null +++ b/configs/spmd/large_scale/incremental/covariance.json @@ -0,0 +1,31 @@ +{ + "INCLUDE": ["../../../common/sklearn.json", "../../../spmd/stats_covariance.json", "../large_scale.json"], + "PARAMETERS_SETS": { + "spmd covariance parameters": { + "algorithm": { + "estimator": "IncrementalEmpiricalCovariance", + "estimator_methods": { "training": "fit" }, + "num_batches": {"training": 10} + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "covariance": { + "SETS": [ + "sklearnex spmd implementation", + "large scale 64 parameters", + "synthetic data", + "spmd covariance parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/incremental/linear_model.json b/configs/spmd/large_scale/incremental/linear_model.json new file mode 100644 index 00000000..19882482 --- /dev/null +++ b/configs/spmd/large_scale/incremental/linear_model.json @@ -0,0 +1,28 @@ +{ + "INCLUDE": ["../../../common/sklearn.json", "../../../regular/linear_model.json", "../large_scale.json"], + "PARAMETERS_SETS": { + "spmd linear parameters": { + "algorithm": { + "estimator": "IncrementalLinearRegression", + "estimator_methods": { "training": "fit" }, + "num_batches": {"training": 10} + } + }, + "synthetic data": { + "data": [ + { "source": "make_regression", "generation_kwargs": { "n_samples": 30005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 30000000, "test_size": 5000 } }, + { "source": "make_regression", "generation_kwargs": { "n_samples": 305000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 300000, "test_size": 5000 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale 64 parameters", + "synthetic data", + "spmd linear parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/incremental/pca.json b/configs/spmd/large_scale/incremental/pca.json new file mode 100644 index 00000000..f1a264ea --- /dev/null +++ b/configs/spmd/large_scale/incremental/pca.json @@ -0,0 +1,31 @@ +{ + "INCLUDE": ["../../../common/sklearn.json", "../../../regular/pca.json", "../large_scale.json"], + "PARAMETERS_SETS": { + "spmd pca parameters": { + "algorithm": { + "estimator": 
"IncrementalPCA", + "estimator_methods": { "training": "fit", "inference": "" }, + "num_batches": {"training": 10} + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } + ] + } + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "sklearnex spmd implementation", + "large scale 64 parameters", + "synthetic data", + "spmd pca parameters" + ] + } + } +} From 5c97aed7f9c3e09e6c7679bd1faf8c24f392e052 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 10 Oct 2024 03:11:37 -0400 Subject: [PATCH 067/110] incremental spmd test --- configs/spmd/large_scale/incremental/basic_stats.json | 2 +- configs/spmd/large_scale/incremental/covariance.json | 2 +- configs/spmd/large_scale/incremental/linear_model.json | 2 +- configs/spmd/large_scale/incremental/pca.json | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/spmd/large_scale/incremental/basic_stats.json b/configs/spmd/large_scale/incremental/basic_stats.json index 88ad4c8d..deb03126 100644 --- a/configs/spmd/large_scale/incremental/basic_stats.json +++ b/configs/spmd/large_scale/incremental/basic_stats.json @@ -22,7 +22,7 @@ "basicstats": { "SETS": [ "sklearnex spmd implementation", - "large scale 64 parameters", + "large scale two nodes parameters", "synthetic data", "spmd basicstats parameters" ] diff --git a/configs/spmd/large_scale/incremental/covariance.json b/configs/spmd/large_scale/incremental/covariance.json index 06c8e4ca..f9c062b9 100644 --- a/configs/spmd/large_scale/incremental/covariance.json +++ b/configs/spmd/large_scale/incremental/covariance.json @@ -22,7 +22,7 @@ "covariance": { "SETS": [ "sklearnex spmd implementation", - "large scale 64 parameters", + "large scale two nodes parameters", "synthetic data", "spmd covariance parameters" ] diff --git a/configs/spmd/large_scale/incremental/linear_model.json b/configs/spmd/large_scale/incremental/linear_model.json index 19882482..593f25ed 100644 --- a/configs/spmd/large_scale/incremental/linear_model.json +++ b/configs/spmd/large_scale/incremental/linear_model.json @@ -19,7 +19,7 @@ "linreg": { "SETS": [ "sklearnex spmd implementation", - "large scale 64 parameters", + "large scale two nodes parameters", "synthetic data", "spmd linear parameters" ] diff --git a/configs/spmd/large_scale/incremental/pca.json b/configs/spmd/large_scale/incremental/pca.json index f1a264ea..31bc9ec9 100644 --- a/configs/spmd/large_scale/incremental/pca.json +++ b/configs/spmd/large_scale/incremental/pca.json @@ -22,7 +22,7 @@ "linreg": { "SETS": [ "sklearnex spmd implementation", - "large scale 64 parameters", + "large scale two nodes parameters", "synthetic data", "spmd pca parameters" ] From 907b35a4d67ad790fffe55c13eb915e5442f61ef Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 10 Oct 2024 03:49:35 -0400 Subject: [PATCH 068/110] incremental spmd --- configs/spmd/large_scale/incremental/basic_stats.json | 3 +-- configs/spmd/large_scale/incremental/covariance.json | 3 +-- configs/spmd/large_scale/incremental/linear_model.json | 3 +-- configs/spmd/large_scale/incremental/pca.json | 3 +-- configs/spmd/large_scale/large_scale.json | 9 +++++++++ 5 files changed, 13 insertions(+), 8 deletions(-) diff --git a/configs/spmd/large_scale/incremental/basic_stats.json 
b/configs/spmd/large_scale/incremental/basic_stats.json index deb03126..ca9e3eb9 100644 --- a/configs/spmd/large_scale/incremental/basic_stats.json +++ b/configs/spmd/large_scale/incremental/basic_stats.json @@ -13,7 +13,6 @@ }, "synthetic data": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } ] } @@ -22,7 +21,7 @@ "basicstats": { "SETS": [ "sklearnex spmd implementation", - "large scale two nodes parameters", + "large scale 32 parameters", "synthetic data", "spmd basicstats parameters" ] diff --git a/configs/spmd/large_scale/incremental/covariance.json b/configs/spmd/large_scale/incremental/covariance.json index f9c062b9..04fcd76b 100644 --- a/configs/spmd/large_scale/incremental/covariance.json +++ b/configs/spmd/large_scale/incremental/covariance.json @@ -13,7 +13,6 @@ }, "synthetic data": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } ] } @@ -22,7 +21,7 @@ "covariance": { "SETS": [ "sklearnex spmd implementation", - "large scale two nodes parameters", + "large scale 32 parameters", "synthetic data", "spmd covariance parameters" ] diff --git a/configs/spmd/large_scale/incremental/linear_model.json b/configs/spmd/large_scale/incremental/linear_model.json index 593f25ed..a483f613 100644 --- a/configs/spmd/large_scale/incremental/linear_model.json +++ b/configs/spmd/large_scale/incremental/linear_model.json @@ -10,7 +10,6 @@ }, "synthetic data": { "data": [ - { "source": "make_regression", "generation_kwargs": { "n_samples": 30005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 30000000, "test_size": 5000 } }, { "source": "make_regression", "generation_kwargs": { "n_samples": 305000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 300000, "test_size": 5000 } } ] } @@ -19,7 +18,7 @@ "linreg": { "SETS": [ "sklearnex spmd implementation", - "large scale two nodes parameters", + "large scale 32 parameters", "synthetic data", "spmd linear parameters" ] diff --git a/configs/spmd/large_scale/incremental/pca.json b/configs/spmd/large_scale/incremental/pca.json index 31bc9ec9..11fa5125 100644 --- a/configs/spmd/large_scale/incremental/pca.json +++ b/configs/spmd/large_scale/incremental/pca.json @@ -13,7 +13,6 @@ }, "synthetic data": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } ] } @@ -22,7 +21,7 @@ "linreg": { "SETS": [ "sklearnex spmd implementation", - "large scale two nodes parameters", + "large scale 32 parameters", "synthetic data", "spmd pca parameters" ] diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json index 55a70fbf..7fd10353 100644 --- a/configs/spmd/large_scale/large_scale.json +++ b/configs/spmd/large_scale/large_scale.json @@ -72,6 +72,15 @@ "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, + "large scale 32 
parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, "large scale 128 to 2k parameters": { "data": { "dtype": "float64", From 7ed023524a2948ac895cc6febb0e06e784da36d6 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 10 Oct 2024 03:52:21 -0400 Subject: [PATCH 069/110] incremental spmd --- sklbench/utils/logger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklbench/utils/logger.py b/sklbench/utils/logger.py index 90940630..250c5fa6 100644 --- a/sklbench/utils/logger.py +++ b/sklbench/utils/logger.py @@ -19,7 +19,7 @@ logger = logging.Logger("sklbench") logging_channel = logging.StreamHandler() -logging_formatter = logging.Formatter("%(levelname)s:%(name)s: %(message)s") +logging_formatter = logging.Formatter("%(asctime)s - %(levelname)s:%(name)s: %(message)s") logging_channel.setFormatter(logging_formatter) logger.addHandler(logging_channel) From e68edd5389c2cb8302a126f6d41a326e7ab66d3b Mon Sep 17 00:00:00 2001 From: ethanglaser Date: Tue, 15 Oct 2024 23:41:37 +0000 Subject: [PATCH 070/110] configs nearly finalized + minor job updates --- configs/spmd/large_scale/basic_stats.json | 2 +- .../spmd/large_scale/basic_stats_single.json | 30 ----- .../spmd/large_scale/basic_stats_strong.json | 2 +- configs/spmd/large_scale/covariance.json | 2 +- .../spmd/large_scale/covariance_strong.json | 2 +- configs/spmd/large_scale/dbscan.json | 7 +- configs/spmd/large_scale/dbscan_strong.json | 7 +- configs/spmd/large_scale/forest.json | 9 +- configs/spmd/large_scale/forest_reg.json | 27 ----- configs/spmd/large_scale/forest_strong.json | 7 +- .../spmd/large_scale/forest_strong_reg.json | 27 ----- configs/spmd/large_scale/kmeans.json | 6 +- configs/spmd/large_scale/kmeans_strong.json | 4 +- configs/spmd/large_scale/kmeans_strong_2.json | 31 ------ configs/spmd/large_scale/knn.json | 6 +- configs/spmd/large_scale/knn_strong.json | 8 +- configs/spmd/large_scale/large_scale.json | 105 +----------------- .../{linear_model.json => linreg.json} | 2 +- ...r_model_strong.json => linreg_strong.json} | 2 +- configs/spmd/large_scale/logreg.json | 6 +- configs/spmd/large_scale/logreg_2.json | 29 ----- configs/spmd/large_scale/logreg_strong.json | 7 +- configs/spmd/large_scale/logreg_strong_2.json | 28 ----- configs/spmd/large_scale/pca.json | 2 +- configs/spmd/large_scale/pca_single.json | 30 ----- configs/spmd/large_scale/pca_strong.json | 2 +- sklbench/benchmarks/sklearn_estimator.py | 12 +- sklbench/datasets/common.py | 4 +- 28 files changed, 59 insertions(+), 347 deletions(-) delete mode 100644 configs/spmd/large_scale/basic_stats_single.json delete mode 100644 configs/spmd/large_scale/forest_reg.json delete mode 100644 configs/spmd/large_scale/forest_strong_reg.json delete mode 100644 configs/spmd/large_scale/kmeans_strong_2.json rename configs/spmd/large_scale/{linear_model.json => linreg.json} (90%) rename configs/spmd/large_scale/{linear_model_strong.json => linreg_strong.json} (88%) delete mode 100644 configs/spmd/large_scale/logreg_2.json delete mode 100644 configs/spmd/large_scale/logreg_strong_2.json delete mode 100644 configs/spmd/large_scale/pca_single.json diff --git a/configs/spmd/large_scale/basic_stats.json 
b/configs/spmd/large_scale/basic_stats.json index 9ac4725f..d6c2c4d2 100644 --- a/configs/spmd/large_scale/basic_stats.json +++ b/configs/spmd/large_scale/basic_stats.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd basicstats parameters": { "algorithm": { diff --git a/configs/spmd/large_scale/basic_stats_single.json b/configs/spmd/large_scale/basic_stats_single.json deleted file mode 100644 index 832bd3b2..00000000 --- a/configs/spmd/large_scale/basic_stats_single.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], - "PARAMETERS_SETS": { - "spmd basicstats parameters": { - "algorithm": { - "estimator": "BasicStatistics", - "estimator_methods": { "training": "fit" } - }, - "data": { - "split_kwargs": { "test_size": 0.0001 } - } - }, - "synthetic data": { - "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, - { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } - ] - } - }, - "TEMPLATES": { - "basicstats": { - "SETS": [ - "sklearnex spmd implementation", - "large scale one node parameters", - "synthetic data", - "spmd basicstats parameters" - ] - } - } -} diff --git a/configs/spmd/large_scale/basic_stats_strong.json b/configs/spmd/large_scale/basic_stats_strong.json index b7aa22cb..b5b0ef69 100644 --- a/configs/spmd/large_scale/basic_stats_strong.json +++ b/configs/spmd/large_scale/basic_stats_strong.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd basicstats parameters": { "algorithm": { diff --git a/configs/spmd/large_scale/covariance.json b/configs/spmd/large_scale/covariance.json index 260befd0..20da8d15 100644 --- a/configs/spmd/large_scale/covariance.json +++ b/configs/spmd/large_scale/covariance.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd basicstats parameters": { "algorithm": { diff --git a/configs/spmd/large_scale/covariance_strong.json b/configs/spmd/large_scale/covariance_strong.json index 568b4a8f..b8424d92 100644 --- a/configs/spmd/large_scale/covariance_strong.json +++ b/configs/spmd/large_scale/covariance_strong.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd basicstats parameters": { "algorithm": { diff --git a/configs/spmd/large_scale/dbscan.json b/configs/spmd/large_scale/dbscan.json index 0660e869..61b0521e 100644 --- a/configs/spmd/large_scale/dbscan.json +++ b/configs/spmd/large_scale/dbscan.json @@ -6,6 +6,9 @@ "estimator": "DBSCAN", "estimator_methods": { "training": "fit" + }, + "estimator_params" : { + "eps": 10, "min_samples": 5 } }, "data": { @@ -14,7 +17,7 @@ }, "synthetic dataset": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 100, "centers": 10 }, "algorithm": { "eps": 5, "min_samples": 5 } } + { "source": "make_blobs", "generation_kwargs": { 
"n_samples": 100000, "n_features": 100, "centers": 10 } } ] } }, @@ -24,7 +27,7 @@ "common dbscan parameters", "synthetic dataset", "sklearnex spmd implementation", - "large scale default parameters", + "large scale <64 parameters", "spmd dbscan parameters" ] } diff --git a/configs/spmd/large_scale/dbscan_strong.json b/configs/spmd/large_scale/dbscan_strong.json index e591316e..24ea7cfc 100644 --- a/configs/spmd/large_scale/dbscan_strong.json +++ b/configs/spmd/large_scale/dbscan_strong.json @@ -6,7 +6,10 @@ "estimator": "DBSCAN", "estimator_methods": { "training": "fit" - } + }, + "estimator_params" : { + "eps": 10, "min_samples": 5 + } }, "data": { "dtype": "float64" @@ -14,7 +17,7 @@ }, "synthetic dataset": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 500000, "n_features": 100, "centers": 10 }, "algorithm": { "eps": 5, "min_samples": 5 } } + { "source": "make_blobs", "generation_kwargs": { "n_samples": 500000, "n_features": 100, "centers": 10 } } ] } }, diff --git a/configs/spmd/large_scale/forest.json b/configs/spmd/large_scale/forest.json index 2d9dfde9..b4402442 100644 --- a/configs/spmd/large_scale/forest.json +++ b/configs/spmd/large_scale/forest.json @@ -1,16 +1,17 @@ { - "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd forest classification parameters": { "algorithm": { "estimator": "RandomForestClassifier", - "estimator_methods": { "training": "fit" } + "estimator_methods": { "training": "fit" }, + "estimator_params": { "n_estimators": 20, "max_depth": 4 } } }, "synthetic data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 501000, "n_features": 10, "n_classes": 2 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } } }, - { "source": "make_classification", "split_kwargs": { "train_size": 10000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 11000, "n_features": 1000, "n_classes": 2 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } } } + { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 501000, "n_features": 10, "n_classes": 2 } }, + { "source": "make_classification", "split_kwargs": { "train_size": 10000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 11000, "n_features": 1000, "n_classes": 2 } } ] } }, diff --git a/configs/spmd/large_scale/forest_reg.json b/configs/spmd/large_scale/forest_reg.json deleted file mode 100644 index a5ec73cd..00000000 --- a/configs/spmd/large_scale/forest_reg.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], - "PARAMETERS_SETS": { - "spmd forest regression parameters": { - "algorithm": { - "estimator": "RandomForestRegressor" - } - }, - "synthetic data": { - "data": [ - { "source": "make_regression", "generation_kwargs": { "n_samples": 501000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } }}, - { "source": "make_regression", "generation_kwargs": { "n_samples": 11000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 10000, "test_size": 1000 }, "algorithm": { "estimator_params": { "n_estimators": 20, 
"max_depth": 4 } }} - - ] - } - }, - "TEMPLATES": { - "forestReg": { - "SETS": [ - "sklearnex spmd implementation", - "large scale 2k parameters", - "synthetic data", - "spmd forest regression parameters" - ] - } - } -} diff --git a/configs/spmd/large_scale/forest_strong.json b/configs/spmd/large_scale/forest_strong.json index 17ca8c51..23b982f5 100644 --- a/configs/spmd/large_scale/forest_strong.json +++ b/configs/spmd/large_scale/forest_strong.json @@ -1,15 +1,16 @@ { - "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd forest classification parameters": { "algorithm": { "estimator": "RandomForestClassifier", - "estimator_methods": { "training": "fit" } + "estimator_methods": { "training": "fit" }, + "estimator_params": { "n_estimators": 20, "max_depth": 4 } } }, "synthetic data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 10000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 10001000, "n_features": 100, "n_classes": 2 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } } } + { "source": "make_classification", "split_kwargs": { "train_size": 10000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 10001000, "n_features": 100, "n_classes": 2 } } ] } }, diff --git a/configs/spmd/large_scale/forest_strong_reg.json b/configs/spmd/large_scale/forest_strong_reg.json deleted file mode 100644 index 305e729b..00000000 --- a/configs/spmd/large_scale/forest_strong_reg.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"], - "PARAMETERS_SETS": { - "spmd forest regression parameters": { - "algorithm": { - "estimator": "RandomForestRegressor" - } - }, - "synthetic data": { - "data": [ - { "source": "make_regression", "generation_kwargs": { "n_samples": 1000000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 900000, "test_size": 10000 }, "algorithm": { "estimator_params": { "n_estimators": 5, "max_depth": 4 } }}, - { "source": "make_regression", "generation_kwargs": { "n_samples": 100000, "n_features": 100, "noise": 1.25 }, "split_kwargs": { "train_size": 90000, "test_size": 10000 }, "algorithm": { "estimator_params": { "n_estimators": 10, "max_depth": 4 } }} - - ] - } - }, - "TEMPLATES": { - "forestReg": { - "SETS": [ - "sklearnex spmd implementation", - "large scale strong 32 parameters", - "synthetic data", - "spmd forest regression parameters" - ] - } - } -} diff --git a/configs/spmd/large_scale/kmeans.json b/configs/spmd/large_scale/kmeans.json index c77d22bc..1140823d 100644 --- a/configs/spmd/large_scale/kmeans.json +++ b/configs/spmd/large_scale/kmeans.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../../regular/kmeans.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd kmeans parameters": { "algorithm": { @@ -12,8 +12,8 @@ }, "synthetic data": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 3750000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, - { "source": "make_blobs", "generation_kwargs": { "n_samples": 18750, "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } } + { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, 
"algorithm": { "n_clusters": 10, "max_iter": 10 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 30000, "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } } ] } }, diff --git a/configs/spmd/large_scale/kmeans_strong.json b/configs/spmd/large_scale/kmeans_strong.json index 6f095af0..6277745b 100644 --- a/configs/spmd/large_scale/kmeans_strong.json +++ b/configs/spmd/large_scale/kmeans_strong.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../../regular/kmeans.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd kmeans parameters": { "algorithm": { @@ -23,7 +23,7 @@ "SETS": [ "synthetic data", "sklearnex spmd implementation", - "large scale strong 32 parameters", + "large scale strong <64 parameters", "spmd kmeans parameters" ] } diff --git a/configs/spmd/large_scale/kmeans_strong_2.json b/configs/spmd/large_scale/kmeans_strong_2.json deleted file mode 100644 index 03f2bc59..00000000 --- a/configs/spmd/large_scale/kmeans_strong_2.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "INCLUDE": ["../../common/sklearn.json", "../../regular/kmeans.json", "large_scale.json"], - "PARAMETERS_SETS": { - "spmd kmeans parameters": { - "algorithm": { - "estimator": "KMeans", - "estimator_params": { - "algorithm": "lloyd" - }, - "estimator_methods": { "training": "fit", "inference": "predict" } - } - }, - "synthetic data": { - "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, - { "source": "make_blobs", "generation_kwargs": { "n_samples": 30000, "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, - { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 100, "centers": 100 }, "algorithm": { "n_clusters": 100, "max_iter": 100 } } - ] - } - }, - "TEMPLATES": { - "kmeans": { - "SETS": [ - "synthetic data", - "sklearnex spmd implementation", - "large scale strong two nodes parameters", - "spmd kmeans parameters" - ] - } - } -} diff --git a/configs/spmd/large_scale/knn.json b/configs/spmd/large_scale/knn.json index f1e0678d..b68b94af 100644 --- a/configs/spmd/large_scale/knn.json +++ b/configs/spmd/large_scale/knn.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../../regular/knn.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd knn cls parameters": { "algorithm": { @@ -19,15 +19,13 @@ }, "synthetic classification data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 5000 }, "generation_kwargs": { "n_samples": 55000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, - { "source": "make_classification", "split_kwargs": { "train_size": 5000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 55000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 5000 }, "generation_kwargs": { "n_samples": 5005000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } ] } }, "TEMPLATES": { "knn classifier": { "SETS": [ - "common knn parameters", "synthetic classification data", "sklearnex spmd implementation", "large scale 2k parameters", diff --git a/configs/spmd/large_scale/knn_strong.json 
b/configs/spmd/large_scale/knn_strong.json index 67398123..7fe862dd 100644 --- a/configs/spmd/large_scale/knn_strong.json +++ b/configs/spmd/large_scale/knn_strong.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../../regular/knn.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd knn cls parameters": { "algorithm": { @@ -19,18 +19,16 @@ }, "synthetic classification data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 5000 }, "generation_kwargs": { "n_samples": 505000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, - { "source": "make_classification", "split_kwargs": { "train_size": 5000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 505000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000000 }, "generation_kwargs": { "n_samples": 1500000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } ] } }, "TEMPLATES": { "knn classifier": { "SETS": [ - "common knn parameters", "synthetic classification data", "sklearnex spmd implementation", - "large scale strong 32 parameters", + "large scale strong <64 parameters", "spmd knn cls parameters" ] } diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json index 832259a0..7e523984 100644 --- a/configs/spmd/large_scale/large_scale.json +++ b/configs/spmd/large_scale/large_scale.json @@ -18,49 +18,22 @@ "mpi_params": {"n": [1,2,6,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, - "large scale one node parameters": { - "data": { - "dtype": "float64", - "distributed_split": "None" - }, - "bench": { - "mpi_params": {"n": [1,2,3,4,5,6,7,8,9,10,11,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, - "large scale strong one node parameters": { - "data": { - "dtype": "float64", - "distributed_split": "rank_based" - }, - "bench": { - "mpi_params": {"n": [1,2,3,4,5,6,7,8,9,10,11,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, - "large scale full one node parameters": { - "data": { - "dtype": "float64", - "distributed_split": "None" - }, - "bench": { - "mpi_params": {"n": [12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, - "large scale strong full one node parameters": { + "large scale 2k parameters": { "data": { "dtype": "float64", - "distributed_split": "rank_based" + "distributed_split": "None" }, "bench": { - "mpi_params": {"n": [12], "ppn": 12, "-hostfile": "", 
"-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, - "large scale 2k parameters": { + "large scale 32 parameters": { "data": { "dtype": "float64", "distributed_split": "None" }, "bench": { - "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, "large scale <64 parameters": { @@ -82,65 +55,6 @@ } }, - "large scale 128 parameters": { - "data": { - "dtype": "float64", - "distributed_split": "None" - }, - "bench": { - "mpi_params": {"n": [1536], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, - - "large scale 256 parameters": { - "data": { - "dtype": "float64", - "distributed_split": "None" - }, - "bench": { - "mpi_params": {"n": [3072], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, - - "large scale 512 parameters": { - "data": { - "dtype": "float64", - "distributed_split": "None" - }, - "bench": { - "mpi_params": {"n": [6144], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, - - "large scale 1024 parameters": { - "data": { - "dtype": "float64", - "distributed_split": "None" - }, - "bench": { - "mpi_params": {"n": [12288], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, - - "large scale 2048 parameters": { - "data": { - "dtype": "float64", - "distributed_split": "None" - }, - "bench": { - "mpi_params": {"n": [24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, - - "large scale two nodes parameters": { - "data": { - "dtype": "float64", - "distributed_split": "None" - }, - "bench": { - "mpi_params": {"n": [24], "ppn": 12, "-hostfile": "", 
"-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, "large scale strong 2k parameters": { "data": { "dtype": "float64", @@ -159,15 +73,6 @@ "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, - "large scale strong two nodes parameters": { - "data": { - "dtype": "float64", - "distributed_split": "rank_based" - }, - "bench": { - "mpi_params": {"n": [24], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, "large scale impi parameters": { "data": { "dtype": "float64", diff --git a/configs/spmd/large_scale/linear_model.json b/configs/spmd/large_scale/linreg.json similarity index 90% rename from configs/spmd/large_scale/linear_model.json rename to configs/spmd/large_scale/linreg.json index f9d17b5b..ea45a52c 100644 --- a/configs/spmd/large_scale/linear_model.json +++ b/configs/spmd/large_scale/linreg.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../../regular/linear_model.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd linear parameters": { "algorithm": { diff --git a/configs/spmd/large_scale/linear_model_strong.json b/configs/spmd/large_scale/linreg_strong.json similarity index 88% rename from configs/spmd/large_scale/linear_model_strong.json rename to configs/spmd/large_scale/linreg_strong.json index 77a9c79e..629bf544 100644 --- a/configs/spmd/large_scale/linear_model_strong.json +++ b/configs/spmd/large_scale/linreg_strong.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../../regular/linear_model.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd linear parameters": { "algorithm": { diff --git a/configs/spmd/large_scale/logreg.json b/configs/spmd/large_scale/logreg.json index c5ef6203..326f2580 100644 --- a/configs/spmd/large_scale/logreg.json +++ b/configs/spmd/large_scale/logreg.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../../regular/logreg.json", "../logreg.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "../logreg.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd logreg2 parameters": { "algorithm":{ @@ -11,12 +11,12 @@ "synthetic data": { "data": [ { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 10, "n_classes": 2 } }, - { "source": "make_classification", "split_kwargs": { "train_size": 100000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 101000, "n_features": 1000, "n_classes": 2 } } + { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 501000, "n_features": 1000, "n_classes": 2, "n_informative": 40, "n_clusters_per_class": 3, "flip_y": 0.05 } } ] } }, "TEMPLATES": { - "linreg": { + "logreg": { "SETS": [ "sklearnex spmd implementation", "large scale 
2k parameters", diff --git a/configs/spmd/large_scale/logreg_2.json b/configs/spmd/large_scale/logreg_2.json deleted file mode 100644 index 796eb8ad..00000000 --- a/configs/spmd/large_scale/logreg_2.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "INCLUDE": ["../../common/sklearn.json", "../../regular/logreg.json", "../logreg.json", "large_scale.json"], - "PARAMETERS_SETS": { - "spmd logreg2 parameters": { - "algorithm":{ - "estimator": "LogisticRegression", - "estimator_methods": { "inference": "predict" }, - "estimator_params": { "max_iter": 20 } - } - }, - "synthetic data": { - "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 10, "n_classes": 2 } }, - { "source": "make_classification", "split_kwargs": { "train_size": 100000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 101000, "n_features": 1000, "n_classes": 2 } } - ] - } - }, - "TEMPLATES": { - "linreg": { - "SETS": [ - "sklearnex spmd implementation", - "large scale two nodes parameters", - "spmd logreg parameters", - "synthetic data", - "spmd logreg2 parameters" - ] - } - } -} diff --git a/configs/spmd/large_scale/logreg_strong.json b/configs/spmd/large_scale/logreg_strong.json index 8787f6b6..0b79ba9d 100644 --- a/configs/spmd/large_scale/logreg_strong.json +++ b/configs/spmd/large_scale/logreg_strong.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../../regular/logreg.json", "../logreg.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "../logreg.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd logreg2 parameters": { "algorithm":{ @@ -10,12 +10,13 @@ }, "synthetic data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 10000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 10001000, "n_features": 100, "n_classes": 2 } } + { "source": "make_classification", "split_kwargs": { "train_size": 10000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 10001000, "n_features": 100, "n_classes": 2 } }, + { "source": "make_classification", "split_kwargs": { "train_size": 12000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 12001000, "n_features": 200, "n_classes": 2, "n_informative": 40, "n_clusters_per_class": 3, "flip_y": 0.05 } } ] } }, "TEMPLATES": { - "linreg": { + "logreg": { "SETS": [ "sklearnex spmd implementation", "large scale strong 2k parameters", diff --git a/configs/spmd/large_scale/logreg_strong_2.json b/configs/spmd/large_scale/logreg_strong_2.json deleted file mode 100644 index 998e3bb7..00000000 --- a/configs/spmd/large_scale/logreg_strong_2.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "INCLUDE": ["../../common/sklearn.json", "../../regular/logreg.json", "../logreg.json", "large_scale.json"], - "PARAMETERS_SETS": { - "spmd logreg2 parameters": { - "algorithm":{ - "estimator": "LogisticRegression", - "estimator_methods": { "inference": "predict" }, - "estimator_params": { "max_iter": 30 } - } - }, - "synthetic data": { - "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 10000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 10001000, "n_features": 100, "n_classes": 2 } } - ] - } - }, - "TEMPLATES": { - "linreg": { - "SETS": [ - "sklearnex spmd implementation", - "large scale strong two nodes parameters", - "spmd logreg parameters", - "synthetic data", - "spmd logreg2 parameters" - ] - } - } -} diff --git a/configs/spmd/large_scale/pca.json 
b/configs/spmd/large_scale/pca.json index 9a6a6b02..d0ee879a 100644 --- a/configs/spmd/large_scale/pca.json +++ b/configs/spmd/large_scale/pca.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../../regular/pca.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd pca parameters": { "algorithm": { diff --git a/configs/spmd/large_scale/pca_single.json b/configs/spmd/large_scale/pca_single.json deleted file mode 100644 index 07775a6a..00000000 --- a/configs/spmd/large_scale/pca_single.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "INCLUDE": ["../../common/sklearn.json", "../../regular/pca.json", "large_scale.json"], - "PARAMETERS_SETS": { - "spmd pca parameters": { - "algorithm": { - "estimator": "PCA", - "estimator_methods": { "training": "fit", "inference": "" } - }, - "data": { - "split_kwargs": { "test_size": 0.0001 } - } - }, - "synthetic data": { - "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, - { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } - ] - } - }, - "TEMPLATES": { - "linreg": { - "SETS": [ - "sklearnex spmd implementation", - "large scale one node parameters", - "synthetic data", - "spmd pca parameters" - ] - } - } -} diff --git a/configs/spmd/large_scale/pca_strong.json b/configs/spmd/large_scale/pca_strong.json index 9063c22e..3cb33e72 100644 --- a/configs/spmd/large_scale/pca_strong.json +++ b/configs/spmd/large_scale/pca_strong.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../../regular/pca.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd pca parameters": { "algorithm": { diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index b4d4f3ee..36ec40b6 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -66,8 +66,8 @@ def get_estimator(library_name: str, estimator_name: str): f"Using first {classes_map[estimator_name][0]}." 
) estimator = classes_map[estimator_name][0] - if not issubclass(estimator, BaseEstimator): - logger.info(f"{estimator} estimator is not derived from sklearn's BaseEstimator") + #if not issubclass(estimator, BaseEstimator): + # logger.info(f"{estimator} estimator is not derived from sklearn's BaseEstimator") return estimator @@ -515,7 +515,11 @@ def main(bench_case: BenchCase, filters: List[BenchCase]): estimator_params = get_bench_case_value( bench_case, "algorithm:estimator_params", dict() ) - + #logger.debug("estimator params: " + str(estimator_params)) + if "DBSCAN" in str(estimator_name): + if "min_samples" in estimator_params: + from mpi4py import MPI + estimator_params["min_samples"] = MPI.COMM_WORLD.Get_size() * estimator_params["min_samples"] # get estimator methods for measurement estimator_methods = get_estimator_methods(bench_case) @@ -551,7 +555,7 @@ def main(bench_case: BenchCase, filters: List[BenchCase]): # note: "handle" is not JSON-serializable if "handle" in estimator_params: del estimator_params["handle"] - logger.debug(f"Estimator parameters:\n{custom_format(estimator_params)}") + #logger.debug(f"Estimator parameters:\n{custom_format(estimator_params)}") result_template.update(estimator_params) data_descs = { diff --git a/sklbench/datasets/common.py b/sklbench/datasets/common.py index e7ed0160..5c6bd27a 100644 --- a/sklbench/datasets/common.py +++ b/sklbench/datasets/common.py @@ -136,11 +136,11 @@ def cache_wrapper(**kwargs): data_name = kwargs["data_name"] data_cache = kwargs["data_cache"] if len(get_filenames_by_prefix(data_cache, data_name)) > 0: - logger.info(f'Loading "{data_name}" dataset from cache files') + #logger.info(f'Loading "{data_name}" dataset from cache files') data = load_data_from_cache(data_cache, data_name) data_desc = load_data_description(data_cache, data_name) else: - logger.info(f'Loading "{data_name}" dataset from scratch') + #logger.info(f'Loading "{data_name}" dataset from scratch') data, data_desc = function(**kwargs) save_data_to_cache(data, data_cache, data_name) save_data_description(data_desc, data_cache, data_name) From e8344932c33cf07f095c6a0de33ab9fdcbe18000 Mon Sep 17 00:00:00 2001 From: ethanglaser Date: Wed, 16 Oct 2024 03:55:12 +0000 Subject: [PATCH 071/110] <= --- configs/spmd/large_scale/dbscan.json | 2 +- configs/spmd/large_scale/dbscan_strong.json | 2 +- configs/spmd/large_scale/kmeans_strong.json | 2 +- configs/spmd/large_scale/knn_strong.json | 2 +- configs/spmd/large_scale/large_scale.json | 14 ++------------ 5 files changed, 6 insertions(+), 16 deletions(-) diff --git a/configs/spmd/large_scale/dbscan.json b/configs/spmd/large_scale/dbscan.json index 61b0521e..e4996c9e 100644 --- a/configs/spmd/large_scale/dbscan.json +++ b/configs/spmd/large_scale/dbscan.json @@ -27,7 +27,7 @@ "common dbscan parameters", "synthetic dataset", "sklearnex spmd implementation", - "large scale <64 parameters", + "large scale <=64 parameters", "spmd dbscan parameters" ] } diff --git a/configs/spmd/large_scale/dbscan_strong.json b/configs/spmd/large_scale/dbscan_strong.json index 24ea7cfc..04fb9016 100644 --- a/configs/spmd/large_scale/dbscan_strong.json +++ b/configs/spmd/large_scale/dbscan_strong.json @@ -27,7 +27,7 @@ "common dbscan parameters", "synthetic dataset", "sklearnex spmd implementation", - "large scale strong <64 parameters", + "large scale strong <=64 parameters", "spmd dbscan parameters" ] } diff --git a/configs/spmd/large_scale/kmeans_strong.json b/configs/spmd/large_scale/kmeans_strong.json index 6277745b..87fb7fac 100644 --- 
a/configs/spmd/large_scale/kmeans_strong.json +++ b/configs/spmd/large_scale/kmeans_strong.json @@ -23,7 +23,7 @@ "SETS": [ "synthetic data", "sklearnex spmd implementation", - "large scale strong <64 parameters", + "large scale strong <=64 parameters", "spmd kmeans parameters" ] } diff --git a/configs/spmd/large_scale/knn_strong.json b/configs/spmd/large_scale/knn_strong.json index 7fe862dd..d202f6e4 100644 --- a/configs/spmd/large_scale/knn_strong.json +++ b/configs/spmd/large_scale/knn_strong.json @@ -28,7 +28,7 @@ "SETS": [ "synthetic classification data", "sklearnex spmd implementation", - "large scale strong <64 parameters", + "large scale strong <=64 parameters", "spmd knn cls parameters" ] } diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json index 7e523984..4e4c9d0c 100644 --- a/configs/spmd/large_scale/large_scale.json +++ b/configs/spmd/large_scale/large_scale.json @@ -36,7 +36,7 @@ "mpi_params": {"n": [1,2,6,12,24,48,96,192,384], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, - "large scale <64 parameters": { + "large scale <=64 parameters": { "data": { "dtype": "float64", "distributed_split": "None" @@ -45,16 +45,6 @@ "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, - "large scale >64 parameters": { - "data": { - "dtype": "float64", - "distributed_split": "None" - }, - "bench": { - "mpi_params": {"n": [768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, - "large scale strong 2k parameters": { "data": { "dtype": "float64", @@ -64,7 +54,7 @@ "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, - "large scale strong <64 parameters": { + "large scale strong <=64 parameters": { "data": { "dtype": "float64", "distributed_split": "rank_based" From 75f2f10e42728437ec6a32b98f76d84546c68b8b Mon Sep 17 00:00:00 2001 From: ethanglaser Date: Wed, 16 Oct 2024 03:59:40 +0000 Subject: [PATCH 072/110] lint --- sklbench/benchmarks/sklearn_estimator.py | 11 +++++++---- sklbench/datasets/common.py | 4 ++-- sklbench/utils/measurement.py | 8 +++++++- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index 36ec40b6..e57a9038 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -66,7 +66,7 @@ def get_estimator(library_name: str, estimator_name: str): f"Using first {classes_map[estimator_name][0]}." 
) estimator = classes_map[estimator_name][0] - #if not issubclass(estimator, BaseEstimator): + # if not issubclass(estimator, BaseEstimator): # logger.info(f"{estimator} estimator is not derived from sklearn's BaseEstimator") return estimator @@ -515,11 +515,14 @@ def main(bench_case: BenchCase, filters: List[BenchCase]): estimator_params = get_bench_case_value( bench_case, "algorithm:estimator_params", dict() ) - #logger.debug("estimator params: " + str(estimator_params)) + # logger.debug("estimator params: " + str(estimator_params)) if "DBSCAN" in str(estimator_name): if "min_samples" in estimator_params: from mpi4py import MPI - estimator_params["min_samples"] = MPI.COMM_WORLD.Get_size() * estimator_params["min_samples"] + + estimator_params["min_samples"] = ( + MPI.COMM_WORLD.Get_size() * estimator_params["min_samples"] + ) # get estimator methods for measurement estimator_methods = get_estimator_methods(bench_case) @@ -555,7 +558,7 @@ def main(bench_case: BenchCase, filters: List[BenchCase]): # note: "handle" is not JSON-serializable if "handle" in estimator_params: del estimator_params["handle"] - #logger.debug(f"Estimator parameters:\n{custom_format(estimator_params)}") + # logger.debug(f"Estimator parameters:\n{custom_format(estimator_params)}") result_template.update(estimator_params) data_descs = { diff --git a/sklbench/datasets/common.py b/sklbench/datasets/common.py index 5c6bd27a..28b62fe6 100644 --- a/sklbench/datasets/common.py +++ b/sklbench/datasets/common.py @@ -136,11 +136,11 @@ def cache_wrapper(**kwargs): data_name = kwargs["data_name"] data_cache = kwargs["data_cache"] if len(get_filenames_by_prefix(data_cache, data_name)) > 0: - #logger.info(f'Loading "{data_name}" dataset from cache files') + # logger.info(f'Loading "{data_name}" dataset from cache files') data = load_data_from_cache(data_cache, data_name) data_desc = load_data_description(data_cache, data_name) else: - #logger.info(f'Loading "{data_name}" dataset from scratch') + # logger.info(f'Loading "{data_name}" dataset from scratch') data, data_desc = function(**kwargs) save_data_to_cache(data, data_cache, data_name) save_data_description(data_desc, data_cache, data_name) diff --git a/sklbench/utils/measurement.py b/sklbench/utils/measurement.py index 3628813d..bfabbdc0 100644 --- a/sklbench/utils/measurement.py +++ b/sklbench/utils/measurement.py @@ -93,8 +93,14 @@ def measure_time( ) break from mpi4py import MPI + if MPI.COMM_WORLD.Get_rank() == 0: - logger.debug("iters across n runs: " + str(iters) + ", inner iters across n runs: " + str(inners)) + logger.debug( + "iters across n runs: " + + str(iters) + + ", inner iters across n runs: " + + str(inners) + ) logger.debug(times) # mean, std = box_filter(times) # if std / mean > std_mean_ratio: From fdd32d1bf9d84bdd33d9363e170353c0623d2ca4 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 16 Oct 2024 15:51:08 +0200 Subject: [PATCH 073/110] Update knn.json --- configs/regular/bf16/knn.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/regular/bf16/knn.json b/configs/regular/bf16/knn.json index c39b577e..fabf6d6d 100644 --- a/configs/regular/bf16/knn.json +++ b/configs/regular/bf16/knn.json @@ -22,7 +22,7 @@ "estimator_params": { "algorithm": "brute", "metric": "minkowski", "p": [1, 2] } }, "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 51000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + { "source": 
"make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } ] }, "synthetic regression data": { @@ -31,7 +31,7 @@ "estimator_params": { "algorithm": "brute", "metric": "minkowski", "p": [1, 2] } }, "data": [ - { "source": "make_regression", "split_kwargs": { "train_size": 50000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 51000, "n_features": 100, "noise":1.5 } } + { "source": "make_regression", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 100, "noise":1.5 } } ] } }, From 99fdb8949662fc8b3a59fced3629c71e4d51137f Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 16 Oct 2024 17:18:11 +0200 Subject: [PATCH 074/110] Update linear_model.json --- configs/regular/bf16/linear_model.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/configs/regular/bf16/linear_model.json b/configs/regular/bf16/linear_model.json index 393b2c64..b081bd68 100644 --- a/configs/regular/bf16/linear_model.json +++ b/configs/regular/bf16/linear_model.json @@ -12,7 +12,8 @@ "estimator_params": { "fit_intercept": true, "copy_X": true } }, "data": { - "dtype": ["float32"] + "dtype": ["float32"], + "order": "C", } }, "sklearn linear parameters": { From d419a01e25fb9796db68b7d7f623765d6508d893 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 17 Oct 2024 20:22:48 +0000 Subject: [PATCH 075/110] minor --- configs/regular/bf16/linear_model.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/regular/bf16/linear_model.json b/configs/regular/bf16/linear_model.json index b081bd68..23aa49c0 100644 --- a/configs/regular/bf16/linear_model.json +++ b/configs/regular/bf16/linear_model.json @@ -13,7 +13,7 @@ }, "data": { "dtype": ["float32"], - "order": "C", + "order": "C" } }, "sklearn linear parameters": { From fd59a64c6065aea8e330906967cc6cc3d4fca9b1 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Mon, 17 Mar 2025 16:16:03 -0700 Subject: [PATCH 076/110] Added updated configs. 
--- configs/spmd/kmeans_strong.json | 32 +++++++++ configs/spmd/kmeans_wide_weak.json | 34 +++++++++ configs/spmd/kmeans_wide_weak.json.backup | 34 +++++++++ configs/spmd/large_scale.json | 85 +++++++++++++++++++++++ 4 files changed, 185 insertions(+) create mode 100644 configs/spmd/kmeans_strong.json create mode 100644 configs/spmd/kmeans_wide_weak.json create mode 100644 configs/spmd/kmeans_wide_weak.json.backup create mode 100644 configs/spmd/large_scale.json diff --git a/configs/spmd/kmeans_strong.json b/configs/spmd/kmeans_strong.json new file mode 100644 index 00000000..c0028de3 --- /dev/null +++ b/configs/spmd/kmeans_strong.json @@ -0,0 +1,32 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../large_scale/large_scale.json"], + "PARAMETERS_SETS": { + "spmd kmeans parameters": { + "algorithm": { + "estimator": "KMeans", + "estimator_params": { + "algorithm": "lloyd", + "max_iter": 20, + "n_clusters": 100 + }, + "estimator_methods": { "training": "fit", "inference": "predict" }, + "sklearnex_context": { "use_raw_input": true } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 25000000, "n_features": 100, "centers": 100 }} + ] + } + }, + "TEMPLATES": { + "kmeans": { + "SETS": [ + "synthetic data", + "sklearnex spmd implementation", + "large scale strong <=64 parameters", + "spmd kmeans parameters" + ] + } + } +} diff --git a/configs/spmd/kmeans_wide_weak.json b/configs/spmd/kmeans_wide_weak.json new file mode 100644 index 00000000..56874e77 --- /dev/null +++ b/configs/spmd/kmeans_wide_weak.json @@ -0,0 +1,34 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../large_scale/large_scale.json"], + "PARAMETERS_SETS": { + "spmd kmeans parameters": { + "algorithm": { + "estimator": "KMeans", + "estimator_params": { + "algorithm": "lloyd", + "max_iter": 20, + "n_clusters": 10, + "random_state": 42 + }, + "estimator_methods": { "training": "fit", "inference": "" }, + "sklearnex_context": { "use_raw_input": true } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 1000, "centers": 2000}} + ] + } + }, + "TEMPLATES": { + "kmeans": { + "SETS": [ + "synthetic data", + "sklearnex spmd implementation", + "large scale 2k parameters sample shift", + "spmd kmeans parameters" + ] + } + } +} + diff --git a/configs/spmd/kmeans_wide_weak.json.backup b/configs/spmd/kmeans_wide_weak.json.backup new file mode 100644 index 00000000..603ee877 --- /dev/null +++ b/configs/spmd/kmeans_wide_weak.json.backup @@ -0,0 +1,34 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../large_scale/large_scale.json"], + "PARAMETERS_SETS": { + "spmd kmeans parameters": { + "algorithm": { + "estimator": "KMeans", + "estimator_params": { + "algorithm": "lloyd", + "max_iter": 20, + "n_clusters": 10, + "random_state": 42 + }, + "estimator_methods": { "training": "fit", "inference": "" }, + "sklearnex_context": { "use_raw_input": true } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 1000, "centers": 2000, "cluster_std":3.0, "center_box":1000}} + ] + } + }, + "TEMPLATES": { + "kmeans": { + "SETS": [ + "synthetic data", + "sklearnex spmd implementation", + "large scale 2k parameters sample shift", + "spmd kmeans parameters" + ] + } + } +} + diff --git a/configs/spmd/large_scale.json b/configs/spmd/large_scale.json new file mode 100644 index 00000000..8b575dbf --- /dev/null +++ b/configs/spmd/large_scale.json @@ -0,0 +1,85 
@@ +{ + "PARAMETERS_SETS": { + "large scale default parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [1,2], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale strong parameters": { + "data": { + "dtype": "float64", + "distributed_split": "rank_based" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale 2k parameters sample shift": { + "data": { + "dtype": "float64", + "distributed_split": "sample_shift" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale 2k parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale 32 parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale <=64 parameters": { + "data": { + "dtype": "float64", + "distributed_split": "None" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale strong 2k parameters": { + "data": { + "dtype": "float64", + "distributed_split": "rank_based" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale strong <=64 parameters": { + "data": { + "dtype": "float64", + "distributed_split": "rank_based" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, + "large scale 
impi parameters": { + "data": { + "dtype": "float64", + "distributed_split": "no" + }, + "bench": { + "mpi_params": {"n": [1,2,4,6,9,12], "ppn": 12} + } + } + } +} From 985db075e277d286dd94c542d8802239b55bad8b Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Mon, 17 Mar 2025 16:27:23 -0700 Subject: [PATCH 077/110] Added shift. --- configs/spmd/kmeans_wide_weak.json | 2 +- sklbench/datasets/transformer.py | 37 ++++++++++++++++++++++++++---- 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/configs/spmd/kmeans_wide_weak.json b/configs/spmd/kmeans_wide_weak.json index 56874e77..d5fe545a 100644 --- a/configs/spmd/kmeans_wide_weak.json +++ b/configs/spmd/kmeans_wide_weak.json @@ -25,7 +25,7 @@ "SETS": [ "synthetic data", "sklearnex spmd implementation", - "large scale 2k parameters sample shift", + "large scale 2k parameters", "spmd kmeans parameters" ] } diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 86944ead..040ac2ee 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -23,7 +23,7 @@ from ..utils.bench_case import get_bench_case_value from ..utils.logger import logger - +from mpi4py import MPI def convert_data(data, dformat: str, order: str, dtype: str, device: str = None): if isinstance(data, csr_matrix) and dformat != "csr_matrix": @@ -113,8 +113,36 @@ def split_and_transform_data(bench_case, data, data_description): "KNeighbors" in get_bench_case_value(bench_case, "algorithm:estimator", "") and int(get_bench_case_value(bench_case, "bench:mpi_params:n", 1)) > 1 ) - if distributed_split == "rank_based" or knn_split_train: - from mpi4py import MPI + + if distributed_split == "sample_shift": + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + size = comm.Get_size() + + n_train = len(x_train) + n_test = len(x_test) + + train_start = 0 + train_end = n_train + test_start = 0 + test_end = n_test + + adjust_number = (math.sqrt(rank) * 0.003) + 1 + + if "y" in data: + x_train, y_train = ( + x_train[train_start:train_end] * adjust_number, + y_train[train_start:train_end], + ) + + x_test, y_test = x_test[test_start:test_end] * adjust_number, y_test[test_start:test_end] + else: + x_train = x_train[train_start:train_end] + + x_test = x_test[test_start:test_end] * adjust_number + + elif distributed_split == "rank_based" or knn_split_train: + comm = MPI.COMM_WORLD rank = comm.Get_rank() @@ -127,6 +155,7 @@ def split_and_transform_data(bench_case, data, data_description): train_end = (1 + rank) * n_train // size test_start = rank * n_test // size test_end = (1 + rank) * n_test // size + x_train_rank = x_train[train_start:train_end] if "y" in data: x_train, y_train = ( @@ -138,7 +167,7 @@ def split_and_transform_data(bench_case, data, data_description): else: x_train = x_train[train_start:train_end] if distributed_split == "rank_based": - x_test = x_test[test_start:test_end] + x_test = x_test[test_start:test_end] * adjust_number device = get_bench_case_value(bench_case, "algorithm:device", None) common_data_format = get_bench_case_value(bench_case, "data:format", "pandas") From 34a30c74177e480b2a55eb7cfc2c9a4c865e6bb7 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Mon, 17 Mar 2025 16:32:36 -0700 Subject: [PATCH 078/110] Added center box. 
--- sklbench/datasets/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklbench/datasets/__init__.py b/sklbench/datasets/__init__.py index 093875c4..27336eb4 100644 --- a/sklbench/datasets/__init__.py +++ b/sklbench/datasets/__init__.py @@ -67,6 +67,8 @@ def load_data(bench_case: BenchCase) -> Tuple[Dict, Dict]: generation_kwargs = get_bench_case_value( bench_case, "data:generation_kwargs", dict() ) + if 'center_box' in generation_kwargs: + generation_kwargs['center_box'] = (-1 * generation_kwargs['center_box'], generation_kwargs['center_box']) return load_sklearn_synthetic_data( function_name=source, input_kwargs=generation_kwargs, From d47face749f5af71599139b2080c5d4cef189a08 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Tue, 18 Mar 2025 10:21:01 -0700 Subject: [PATCH 079/110] Removed the inertia for Kmeans. --- sklbench/benchmarks/sklearn_estimator.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index e57a9038..877707af 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -191,19 +191,6 @@ def get_subset_metrics_of_estimator( } ) elif task == "clustering": - if hasattr(estimator_instance, "inertia_"): - # compute inertia manually using distances to cluster centers - # provided by KMeans.transform - metrics.update( - { - "inertia": float( - np.power( - convert_to_numpy(estimator_instance.transform(x)).min(axis=1), - 2, - ).sum() - ) - } - ) if hasattr(estimator_instance, "predict"): y_pred = convert_to_numpy(estimator_instance.predict(x)) metrics.update( From e6177916bf7375ef06bf6b8dfc2d119345f7d936 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Tue, 18 Mar 2025 10:39:00 -0700 Subject: [PATCH 080/110] fixed config locations. --- configs/spmd/kmeans_wide_weak.json.backup | 34 ------------------- .../{ => large_scale}/kmeans_wide_weak.json | 0 2 files changed, 34 deletions(-) delete mode 100644 configs/spmd/kmeans_wide_weak.json.backup rename configs/spmd/{ => large_scale}/kmeans_wide_weak.json (100%) diff --git a/configs/spmd/kmeans_wide_weak.json.backup b/configs/spmd/kmeans_wide_weak.json.backup deleted file mode 100644 index 603ee877..00000000 --- a/configs/spmd/kmeans_wide_weak.json.backup +++ /dev/null @@ -1,34 +0,0 @@ -{ - "INCLUDE": ["../../common/sklearn.json", "../large_scale/large_scale.json"], - "PARAMETERS_SETS": { - "spmd kmeans parameters": { - "algorithm": { - "estimator": "KMeans", - "estimator_params": { - "algorithm": "lloyd", - "max_iter": 20, - "n_clusters": 10, - "random_state": 42 - }, - "estimator_methods": { "training": "fit", "inference": "" }, - "sklearnex_context": { "use_raw_input": true } - } - }, - "synthetic data": { - "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 1000, "centers": 2000, "cluster_std":3.0, "center_box":1000}} - ] - } - }, - "TEMPLATES": { - "kmeans": { - "SETS": [ - "synthetic data", - "sklearnex spmd implementation", - "large scale 2k parameters sample shift", - "spmd kmeans parameters" - ] - } - } -} - diff --git a/configs/spmd/kmeans_wide_weak.json b/configs/spmd/large_scale/kmeans_wide_weak.json similarity index 100% rename from configs/spmd/kmeans_wide_weak.json rename to configs/spmd/large_scale/kmeans_wide_weak.json From 00ac46d6132a44f2124f042c31e62ae2a041cd3c Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Tue, 18 Mar 2025 10:47:28 -0700 Subject: [PATCH 081/110] Updated configs. 
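The kmeans_narrow_weak.json added below pairs a 2,000,000 x 100 make_blobs dataset with the sample-shift weak-scaling set, and kmeans_strong.json moves to a single 25,000,000 x 100 dataset. Because the weak-scaling sets do not slice the data (distributed_split is "None" or "sample_shift"), every rank holds the full generated block, so the per-rank footprint stays constant as ranks are added. A rough float64 estimate, ignoring any extra copies made during format conversion:

    BYTES_F64 = 8  # the large-scale sets pin dtype to float64

    narrow = 2_000_000 * 100 * BYTES_F64    # kmeans_narrow_weak
    wide = 1_000_000 * 1_000 * BYTES_F64    # kmeans_wide_weak
    print(f"narrow weak: {narrow / 1e9:.1f} GB/rank, wide weak: {wide / 1e9:.1f} GB/rank")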
--- .../spmd/large_scale/kmeans_narrow_weak.json | 33 +++++++++++++++++++ configs/spmd/large_scale/kmeans_strong.json | 18 +++++----- 2 files changed, 43 insertions(+), 8 deletions(-) create mode 100644 configs/spmd/large_scale/kmeans_narrow_weak.json diff --git a/configs/spmd/large_scale/kmeans_narrow_weak.json b/configs/spmd/large_scale/kmeans_narrow_weak.json new file mode 100644 index 00000000..4d8a34d1 --- /dev/null +++ b/configs/spmd/large_scale/kmeans_narrow_weak.json @@ -0,0 +1,33 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "../large_scale/large_scale.json"], + "PARAMETERS_SETS": { + "spmd kmeans parameters": { + "algorithm": { + "estimator": "KMeans", + "estimator_params": { + "algorithm": "lloyd", + "max_iter": 20, + "n_clusters": 10, + "random_state": 42 + }, + "estimator_methods": { "training": "fit", "inference": "" }, + "sklearnex_context": { "use_raw_input": true } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 2000000, "n_features": 100, "centers": 2000, "cluster_std": 3, "center_box": 100.0}} + ] + } + }, + "TEMPLATES": { + "kmeans": { + "SETS": [ + "synthetic data", + "sklearnex spmd implementation", + "large scale 2k parameters sample shift", + "spmd kmeans parameters" + ] + } + } +} \ No newline at end of file diff --git a/configs/spmd/large_scale/kmeans_strong.json b/configs/spmd/large_scale/kmeans_strong.json index 87fb7fac..f61172c9 100644 --- a/configs/spmd/large_scale/kmeans_strong.json +++ b/configs/spmd/large_scale/kmeans_strong.json @@ -1,20 +1,21 @@ { - "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "../large_scale/large_scale.json"], "PARAMETERS_SETS": { "spmd kmeans parameters": { "algorithm": { "estimator": "KMeans", "estimator_params": { - "algorithm": "lloyd" + "algorithm": "lloyd", + "max_iter": 20, + "n_clusters": 100 }, - "estimator_methods": { "training": "fit", "inference": "predict" } + "estimator_methods": { "training": "fit", "inference": "predict" }, + "sklearnex_context": { "use_raw_input": true } } - }, - "synthetic data": { + }, + "synthetic data": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, - { "source": "make_blobs", "generation_kwargs": { "n_samples": 30000, "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, - { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 100, "centers": 100 }, "algorithm": { "n_clusters": 100, "max_iter": 100 } } + { "source": "make_blobs", "generation_kwargs": { "n_samples": 25000000, "n_features": 100, "centers": 100 }} ] } }, @@ -29,3 +30,4 @@ } } } + From f37f964a729c133f6bdd46f4e623316030c16fe2 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Tue, 18 Mar 2025 10:54:53 -0700 Subject: [PATCH 082/110] Moved large scale files. 
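With the copies under configs/spmd/ removed (the maintained versions live under configs/spmd/large_scale/), the two KMeans scaling modes are easier to tell apart: kmeans_strong uses distributed_split "rank_based", so the fixed 25,000,000-sample dataset is sliced across ranks (strong scaling), while the *_weak configs keep the whole block on every rank (weak scaling). A quick look at rank 0's share in the strong case, using the same integer slicing as the rank_based branch shown earlier:

    n_samples = 25_000_000  # kmeans_strong total
    for size in (1, 12, 96, 768):  # rank counts from the strong <=64 sweep
        start, end = 0 * n_samples // size, 1 * n_samples // size
        print(f"{size:4d} ranks -> {end - start:,} samples on rank 0")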
--- configs/spmd/kmeans_strong.json | 32 ------------- configs/spmd/large_scale.json | 85 --------------------------------- 2 files changed, 117 deletions(-) delete mode 100644 configs/spmd/kmeans_strong.json delete mode 100644 configs/spmd/large_scale.json diff --git a/configs/spmd/kmeans_strong.json b/configs/spmd/kmeans_strong.json deleted file mode 100644 index c0028de3..00000000 --- a/configs/spmd/kmeans_strong.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "INCLUDE": ["../../common/sklearn.json", "../large_scale/large_scale.json"], - "PARAMETERS_SETS": { - "spmd kmeans parameters": { - "algorithm": { - "estimator": "KMeans", - "estimator_params": { - "algorithm": "lloyd", - "max_iter": 20, - "n_clusters": 100 - }, - "estimator_methods": { "training": "fit", "inference": "predict" }, - "sklearnex_context": { "use_raw_input": true } - } - }, - "synthetic data": { - "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 25000000, "n_features": 100, "centers": 100 }} - ] - } - }, - "TEMPLATES": { - "kmeans": { - "SETS": [ - "synthetic data", - "sklearnex spmd implementation", - "large scale strong <=64 parameters", - "spmd kmeans parameters" - ] - } - } -} diff --git a/configs/spmd/large_scale.json b/configs/spmd/large_scale.json deleted file mode 100644 index 8b575dbf..00000000 --- a/configs/spmd/large_scale.json +++ /dev/null @@ -1,85 +0,0 @@ -{ - "PARAMETERS_SETS": { - "large scale default parameters": { - "data": { - "dtype": "float64", - "distributed_split": "None" - }, - "bench": { - "mpi_params": {"n": [1,2], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, - "large scale strong parameters": { - "data": { - "dtype": "float64", - "distributed_split": "rank_based" - }, - "bench": { - "mpi_params": {"n": [1,2,6,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, - "large scale 2k parameters sample shift": { - "data": { - "dtype": "float64", - "distributed_split": "sample_shift" - }, - "bench": { - "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, - "large scale 2k parameters": { - "data": { - "dtype": "float64", - "distributed_split": "None" - }, - "bench": { - "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, - "large scale 32 parameters": { - "data": { - "dtype": "float64", - "distributed_split": "None" - }, - "bench": { - "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": 
"--envall gpu_tile_compact.sh" } - } - }, - "large scale <=64 parameters": { - "data": { - "dtype": "float64", - "distributed_split": "None" - }, - "bench": { - "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, - "large scale strong 2k parameters": { - "data": { - "dtype": "float64", - "distributed_split": "rank_based" - }, - "bench": { - "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, - "large scale strong <=64 parameters": { - "data": { - "dtype": "float64", - "distributed_split": "rank_based" - }, - "bench": { - "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } - } - }, - "large scale impi parameters": { - "data": { - "dtype": "float64", - "distributed_split": "no" - }, - "bench": { - "mpi_params": {"n": [1,2,4,6,9,12], "ppn": 12} - } - } - } -} From 1c5552b27893b27a6a695f2acda88527570b73a2 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Tue, 18 Mar 2025 10:55:53 -0700 Subject: [PATCH 083/110] Added line. --- configs/spmd/large_scale/kmeans_narrow_weak.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/spmd/large_scale/kmeans_narrow_weak.json b/configs/spmd/large_scale/kmeans_narrow_weak.json index 4d8a34d1..d6b73029 100644 --- a/configs/spmd/large_scale/kmeans_narrow_weak.json +++ b/configs/spmd/large_scale/kmeans_narrow_weak.json @@ -30,4 +30,4 @@ ] } } -} \ No newline at end of file +} From dcfef94ab2bb8c1a34b7ed7eea57e2396fe8d9eb Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Tue, 18 Mar 2025 10:58:29 -0700 Subject: [PATCH 084/110] Added large scale 2k parameters sample shift --- configs/spmd/large_scale/large_scale.json | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json index 4e4c9d0c..a1ae8a62 100644 --- a/configs/spmd/large_scale/large_scale.json +++ b/configs/spmd/large_scale/large_scale.json @@ -27,6 +27,15 @@ "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, + "large scale 2k parameters sample shift": { + "data": { + "dtype": "float64", + "distributed_split": "sample_shift" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, "large scale 32 parameters": { "data": { "dtype": "float64", From 4ba3fe43e5effba47c942ea0ebce7b16aeef2f69 Mon Sep 17 
00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Tue, 18 Mar 2025 12:09:12 -0700 Subject: [PATCH 085/110] Fixed imports. --- sklbench/datasets/transformer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 040ac2ee..894d711d 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -18,12 +18,13 @@ import numpy as np import pandas as pd +from mpi4py import MPI from scipy.sparse import csr_matrix from sklearn.model_selection import train_test_split from ..utils.bench_case import get_bench_case_value from ..utils.logger import logger -from mpi4py import MPI + def convert_data(data, dformat: str, order: str, dtype: str, device: str = None): if isinstance(data, csr_matrix) and dformat != "csr_matrix": From 5c04a35f8d0156e5a4a308c5eaa2a38aee714387 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Tue, 18 Mar 2025 12:29:03 -0700 Subject: [PATCH 086/110] Updated format. --- sklbench/datasets/__init__.py | 7 ++++-- sklbench/datasets/transformer.py | 38 +++++++++++++++++--------------- sklbench/utils/common.py | 2 +- 3 files changed, 26 insertions(+), 21 deletions(-) diff --git a/sklbench/datasets/__init__.py b/sklbench/datasets/__init__.py index 27336eb4..d4bddca1 100644 --- a/sklbench/datasets/__init__.py +++ b/sklbench/datasets/__init__.py @@ -67,8 +67,11 @@ def load_data(bench_case: BenchCase) -> Tuple[Dict, Dict]: generation_kwargs = get_bench_case_value( bench_case, "data:generation_kwargs", dict() ) - if 'center_box' in generation_kwargs: - generation_kwargs['center_box'] = (-1 * generation_kwargs['center_box'], generation_kwargs['center_box']) + if "center_box" in generation_kwargs: + generation_kwargs["center_box"] = ( + -1 * generation_kwargs["center_box"], + generation_kwargs["center_box"], + ) return load_sklearn_synthetic_data( function_name=source, input_kwargs=generation_kwargs, diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 894d711d..b386578e 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -116,34 +116,36 @@ def split_and_transform_data(bench_case, data, data_description): ) if distributed_split == "sample_shift": - comm = MPI.COMM_WORLD - rank = comm.Get_rank() - size = comm.Get_size() + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + size = comm.Get_size() - n_train = len(x_train) - n_test = len(x_test) + n_train = len(x_train) + n_test = len(x_test) - train_start = 0 - train_end = n_train - test_start = 0 - test_end = n_test + train_start = 0 + train_end = n_train + test_start = 0 + test_end = n_test - adjust_number = (math.sqrt(rank) * 0.003) + 1 + adjust_number = (math.sqrt(rank) * 0.003) + 1 - if "y" in data: + if "y" in data: x_train, y_train = ( - x_train[train_start:train_end] * adjust_number, + x_train[train_start:train_end] * adjust_number, y_train[train_start:train_end], ) - - x_test, y_test = x_test[test_start:test_end] * adjust_number, y_test[test_start:test_end] - else: + + x_test, y_test = ( + x_test[test_start:test_end] * adjust_number, + y_test[test_start:test_end], + ) + else: x_train = x_train[train_start:train_end] - + x_test = x_test[test_start:test_end] * adjust_number elif distributed_split == "rank_based" or knn_split_train: - comm = MPI.COMM_WORLD rank = comm.Get_rank() @@ -156,7 +158,7 @@ def split_and_transform_data(bench_case, data, data_description): train_end = (1 + rank) * n_train // size test_start = rank * n_test // size test_end = (1 + rank) 
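The transformer.py block being reformatted here is the "sample_shift" split added earlier: rather than slicing the generated data, every rank keeps the full block and multiplies it by 1 + 0.003 * sqrt(rank), so weak-scaling runs feed each rank slightly different values without regenerating data per rank. The same idea in isolation (plain NumPy, with the rank passed in explicitly so it runs without MPI):

    import math

    import numpy as np


    def sample_shift(x: np.ndarray, rank: int) -> np.ndarray:
        # Scale the whole block by a small rank-dependent factor, as in the patch.
        adjust = math.sqrt(rank) * 0.003 + 1
        return x * adjust


    x = np.random.default_rng(0).random((4, 3))
    print(sample_shift(x, rank=0)[0, 0])   # rank 0: factor is exactly 1
    print(sample_shift(x, rank=12)[0, 0])  # later ranks: slightly scaled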
* n_test // size - x_train_rank = x_train[train_start:train_end] + x_train_rank = x_train[train_start:train_end] if "y" in data: x_train, y_train = ( diff --git a/sklbench/utils/common.py b/sklbench/utils/common.py index 06486428..995f4b5e 100755 --- a/sklbench/utils/common.py +++ b/sklbench/utils/common.py @@ -120,7 +120,7 @@ def flatten_list(input_list: List, ensure_type_homogeneity: bool = False) -> Lis def get_module_members( - module_names_chain: Union[List, str] + module_names_chain: Union[List, str], ) -> Tuple[ModuleContentMap, ModuleContentMap]: def get_module_name(module_names_chain: List[str]) -> str: name = module_names_chain[0] From af48e968488fbfa2c4b664e7fd0ec973ce90bd4e Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Tue, 18 Mar 2025 16:48:09 -0700 Subject: [PATCH 087/110] Added the math import. --- sklbench/datasets/transformer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index b386578e..57999775 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -16,6 +16,7 @@ import os +import math import numpy as np import pandas as pd from mpi4py import MPI From c7f38f4b5b43a6dcc8920ba2d9f7e1f89e3847eb Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Thu, 20 Mar 2025 16:25:28 -0700 Subject: [PATCH 088/110] Rolled back the accidental changes to the ranked_based distributed_split. --- sklbench/datasets/transformer.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 57999775..46342b3b 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -146,8 +146,7 @@ def split_and_transform_data(bench_case, data, data_description): x_test = x_test[test_start:test_end] * adjust_number - elif distributed_split == "rank_based" or knn_split_train: - + if distributed_split == "rank_based" or knn_split_train: comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() @@ -159,7 +158,6 @@ def split_and_transform_data(bench_case, data, data_description): train_end = (1 + rank) * n_train // size test_start = rank * n_test // size test_end = (1 + rank) * n_test // size - x_train_rank = x_train[train_start:train_end] if "y" in data: x_train, y_train = ( @@ -171,7 +169,8 @@ def split_and_transform_data(bench_case, data, data_description): else: x_train = x_train[train_start:train_end] if distributed_split == "rank_based": - x_test = x_test[test_start:test_end] * adjust_number + x_test = x_test[test_start:test_end] + device = get_bench_case_value(bench_case, "algorithm:device", None) common_data_format = get_bench_case_value(bench_case, "data:format", "pandas") From 264701eae57156971a6b1a6e4c77c6e41f6dffe3 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Thu, 20 Mar 2025 16:32:23 -0700 Subject: [PATCH 089/110] Updated large scale 2k parameters for the full 24576 tiles. 
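The change below only extends the "n" sweep of the sample-shift set up to 24576 ranks; ppn stays at 12. Assuming one rank per GPU tile and 12 tiles per node, which is what the 12-entry cpu-bind list and the gpu_tile_compact.sh wrapper suggest, the top of the sweep maps to node counts as follows:

    ppn = 12  # ranks per node in the large-scale sets
    for n in (768, 1536, 3072, 6144, 12288, 24576):
        print(f"{n:5d} ranks -> {n // ppn:4d} nodes")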
--- configs/spmd/large_scale/large_scale.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json index a1ae8a62..28626dc9 100644 --- a/configs/spmd/large_scale/large_scale.json +++ b/configs/spmd/large_scale/large_scale.json @@ -33,7 +33,7 @@ "distributed_split": "sample_shift" }, "bench": { - "mpi_params": {"n": [1,2,6,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, "large scale 32 parameters": { From 20419a954e02557f791df9051e6986c4784b8df8 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Thu, 20 Mar 2025 16:40:55 -0700 Subject: [PATCH 090/110] Updated config files. --- configs/spmd/large_scale/kmeans.json | 30 ------------------- .../spmd/large_scale/kmeans_narrow_weak.json | 2 +- configs/spmd/large_scale/kmeans_strong.json | 2 +- .../spmd/large_scale/kmeans_wide_weak.json | 2 +- 4 files changed, 3 insertions(+), 33 deletions(-) delete mode 100644 configs/spmd/large_scale/kmeans.json diff --git a/configs/spmd/large_scale/kmeans.json b/configs/spmd/large_scale/kmeans.json deleted file mode 100644 index 1140823d..00000000 --- a/configs/spmd/large_scale/kmeans.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], - "PARAMETERS_SETS": { - "spmd kmeans parameters": { - "algorithm": { - "estimator": "KMeans", - "estimator_params": { - "algorithm": "lloyd" - }, - "estimator_methods": { "training": "fit", "inference": "predict" } - } - }, - "synthetic data": { - "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, - { "source": "make_blobs", "generation_kwargs": { "n_samples": 30000, "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } } - ] - } - }, - "TEMPLATES": { - "kmeans": { - "SETS": [ - "synthetic data", - "sklearnex spmd implementation", - "large scale 2k parameters", - "spmd kmeans parameters" - ] - } - } -} diff --git a/configs/spmd/large_scale/kmeans_narrow_weak.json b/configs/spmd/large_scale/kmeans_narrow_weak.json index d6b73029..523aba01 100644 --- a/configs/spmd/large_scale/kmeans_narrow_weak.json +++ b/configs/spmd/large_scale/kmeans_narrow_weak.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../large_scale/large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd kmeans parameters": { "algorithm": { diff --git a/configs/spmd/large_scale/kmeans_strong.json b/configs/spmd/large_scale/kmeans_strong.json index f61172c9..90a1ea3f 100644 --- a/configs/spmd/large_scale/kmeans_strong.json +++ b/configs/spmd/large_scale/kmeans_strong.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../large_scale/large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd kmeans parameters": { "algorithm": { diff --git 
a/configs/spmd/large_scale/kmeans_wide_weak.json b/configs/spmd/large_scale/kmeans_wide_weak.json index d5fe545a..1c588d60 100644 --- a/configs/spmd/large_scale/kmeans_wide_weak.json +++ b/configs/spmd/large_scale/kmeans_wide_weak.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../../common/sklearn.json", "../large_scale/large_scale.json"], + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd kmeans parameters": { "algorithm": { From 4e93858b24fa1b95e80947b7d6cf82e4230e6165 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Thu, 20 Mar 2025 16:46:55 -0700 Subject: [PATCH 091/110] cleaned up diff. --- sklbench/utils/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklbench/utils/common.py b/sklbench/utils/common.py index 995f4b5e..06486428 100755 --- a/sklbench/utils/common.py +++ b/sklbench/utils/common.py @@ -120,7 +120,7 @@ def flatten_list(input_list: List, ensure_type_homogeneity: bool = False) -> Lis def get_module_members( - module_names_chain: Union[List, str], + module_names_chain: Union[List, str] ) -> Tuple[ModuleContentMap, ModuleContentMap]: def get_module_name(module_names_chain: List[str]) -> str: name = module_names_chain[0] From 428f3df094feacd79eb4703e25e325fc55d232eb Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Thu, 20 Mar 2025 17:23:13 -0700 Subject: [PATCH 092/110] Reformatted correctly. --- sklbench/datasets/transformer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 46342b3b..9e00d05d 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -171,7 +171,6 @@ def split_and_transform_data(bench_case, data, data_description): if distributed_split == "rank_based": x_test = x_test[test_start:test_end] - device = get_bench_case_value(bench_case, "algorithm:device", None) common_data_format = get_bench_case_value(bench_case, "data:format", "pandas") common_data_order = get_bench_case_value(bench_case, "data:order", "F") From 2f8c68b89dd397390accbaf633c4978d38559531 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Thu, 20 Mar 2025 17:28:29 -0700 Subject: [PATCH 093/110] Fixed if else. --- sklbench/datasets/transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 9e00d05d..fa5badd9 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -146,7 +146,7 @@ def split_and_transform_data(bench_case, data, data_description): x_test = x_test[test_start:test_end] * adjust_number - if distributed_split == "rank_based" or knn_split_train: + elif distributed_split == "rank_based" or knn_split_train: comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() From 816c6dcf5111ffe58923499c823fc39f8a2c1bca Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Fri, 21 Mar 2025 09:42:19 -0700 Subject: [PATCH 094/110] Updated format. --- sklbench/datasets/transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index fa5badd9..63a636c3 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -14,9 +14,9 @@ # limitations under the License. 
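The elif restored in the previous patch matters for correctness: knn_split_train is true for any multi-rank KNeighbors case, so with two independent if blocks a sample_shift run could first scale the full block and then also take the rank_based slice, mixing both schemes; elif keeps the outcomes mutually exclusive. A stripped-down view of the intended branch structure (the function name and return strings are illustrative only):

    def choose_split(distributed_split: str, knn_split_train: bool) -> str:
        # Mirrors the branch layout in split_and_transform_data, nothing more.
        if distributed_split == "sample_shift":
            return "scale the full block by 1 + 0.003*sqrt(rank)"
        elif distributed_split == "rank_based" or knn_split_train:
            return "take this rank's contiguous slice"
        return "keep the data as is"


    print(choose_split("sample_shift", knn_split_train=True))   # only the shift applies
    print(choose_split("rank_based", knn_split_train=False))    # only the slice applies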
# =============================================================================== +import math import os -import math import numpy as np import pandas as pd from mpi4py import MPI From 5d3bf52927709502a30712ad6020678adb3f8225 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Fri, 21 Mar 2025 10:07:02 -0700 Subject: [PATCH 095/110] Added mpi4py --- envs/conda-env-rapids.yml | 1 + envs/conda-env-sklearn.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/envs/conda-env-rapids.yml b/envs/conda-env-rapids.yml index d72aa2d8..9eaa9206 100644 --- a/envs/conda-env-rapids.yml +++ b/envs/conda-env-rapids.yml @@ -19,3 +19,4 @@ dependencies: - psutil - requests - py-cpuinfo + - mpi4py \ No newline at end of file diff --git a/envs/conda-env-sklearn.yml b/envs/conda-env-sklearn.yml index bbc34463..afa7641f 100644 --- a/envs/conda-env-sklearn.yml +++ b/envs/conda-env-sklearn.yml @@ -21,3 +21,4 @@ dependencies: - psutil - requests - py-cpuinfo + - mpi4py From a937963e214bc79092cbf18b2a750959ca948e01 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Fri, 21 Mar 2025 10:44:55 -0700 Subject: [PATCH 096/110] fixed mpi4py --- envs/conda-env-rapids.yml | 2 +- envs/conda-env-sklearn.yml | 2 +- sklbench/datasets/transformer.py | 5 ++++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/envs/conda-env-rapids.yml b/envs/conda-env-rapids.yml index 9eaa9206..b43c0958 100644 --- a/envs/conda-env-rapids.yml +++ b/envs/conda-env-rapids.yml @@ -19,4 +19,4 @@ dependencies: - psutil - requests - py-cpuinfo - - mpi4py \ No newline at end of file + \ No newline at end of file diff --git a/envs/conda-env-sklearn.yml b/envs/conda-env-sklearn.yml index afa7641f..070be2cc 100644 --- a/envs/conda-env-sklearn.yml +++ b/envs/conda-env-sklearn.yml @@ -21,4 +21,4 @@ dependencies: - psutil - requests - py-cpuinfo - - mpi4py + diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 63a636c3..cd00d724 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -19,7 +19,6 @@ import numpy as np import pandas as pd -from mpi4py import MPI from scipy.sparse import csr_matrix from sklearn.model_selection import train_test_split @@ -117,6 +116,8 @@ def split_and_transform_data(bench_case, data, data_description): ) if distributed_split == "sample_shift": + from mpi4py import MPI + comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() @@ -147,6 +148,8 @@ def split_and_transform_data(bench_case, data, data_description): x_test = x_test[test_start:test_end] * adjust_number elif distributed_split == "rank_based" or knn_split_train: + from mpi4py import MPI + comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() From 3809d1760aaa77acc43299460e38f57eea70e0bb Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Fri, 21 Mar 2025 10:47:33 -0700 Subject: [PATCH 097/110] Rolled back mpi4py. 
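Dropping mpi4py from the environment files is consistent with the previous patch, which moved the import from module scope into the branches that need it: single-process runs never touch MPI, so the package no longer has to be installed everywhere. The lazy-import pattern in isolation:

    def get_rank_and_size():
        # Imported only when a distributed split (or multi-rank KNN) is requested,
        # so plain single-process runs work without mpi4py installed.
        from mpi4py import MPI

        comm = MPI.COMM_WORLD
        return comm.Get_rank(), comm.Get_size()


    if __name__ == "__main__":
        try:
            print(get_rank_and_size())
        except ImportError:
            print("mpi4py not installed; running single-process")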
--- envs/conda-env-rapids.yml | 1 - envs/conda-env-sklearn.yml | 1 - 2 files changed, 2 deletions(-) diff --git a/envs/conda-env-rapids.yml b/envs/conda-env-rapids.yml index b43c0958..d72aa2d8 100644 --- a/envs/conda-env-rapids.yml +++ b/envs/conda-env-rapids.yml @@ -19,4 +19,3 @@ dependencies: - psutil - requests - py-cpuinfo - \ No newline at end of file diff --git a/envs/conda-env-sklearn.yml b/envs/conda-env-sklearn.yml index 070be2cc..bbc34463 100644 --- a/envs/conda-env-sklearn.yml +++ b/envs/conda-env-sklearn.yml @@ -21,4 +21,3 @@ dependencies: - psutil - requests - py-cpuinfo - From c12874832788ad3ae874ac9b9b38e6f26044abe3 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Fri, 21 Mar 2025 12:53:41 -0700 Subject: [PATCH 098/110] Formatted file. --- sklbench/datasets/transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index cd00d724..ea5646ca 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -149,7 +149,7 @@ def split_and_transform_data(bench_case, data, data_description): elif distributed_split == "rank_based" or knn_split_train: from mpi4py import MPI - + comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() From 15db7929f76696956a4fe0529e8addef35f95f74 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Fri, 21 Mar 2025 14:56:49 -0700 Subject: [PATCH 099/110] Removed environment from diff. --- sklbench/report/implementation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklbench/report/implementation.py b/sklbench/report/implementation.py index 2bc3a05e..cddb45f3 100644 --- a/sklbench/report/implementation.py +++ b/sklbench/report/implementation.py @@ -100,7 +100,8 @@ "batch_size", ] -DIFFBY_COLUMNS = ["environment_name", "library", "format", "device"] +#DIFFBY_COLUMNS = ["environment_name", "library", "format", "device"] +DIFFBY_COLUMNS = ["library", "format", "device"] def geomean_wrapper(a): From 30b0b80353f19fc87108b04a08b6d089a037047c Mon Sep 17 00:00:00 2001 From: ethanglaser <42726565+ethanglaser@users.noreply.github.com> Date: Fri, 21 Mar 2025 15:00:01 -0700 Subject: [PATCH 100/110] initial alignment of configs to final results (#176) * initial alignment of configs to final results * logic updates * fix large scale CI * black * minimize incremental * black * align custom function with skl estimator * Update sklbench/utils/measurement.py * Update sklbench/utils/measurement.py * Update sklbench/utils/measurement.py * Update sklbench/utils/measurement.py --- configs/incremental.json | 100 ------------------ configs/spmd/large_scale/basic_stats.json | 15 +-- .../spmd/large_scale/basic_stats_strong.json | 13 +-- configs/spmd/large_scale/covariance.json | 9 +- .../spmd/large_scale/covariance_strong.json | 7 +- configs/spmd/large_scale/dbscan.json | 5 +- configs/spmd/large_scale/dbscan_strong.json | 15 +-- .../spmd/large_scale/forest_max_samples.json | 28 +++++ ...forest.json => forest_no_max_samples.json} | 9 +- configs/spmd/large_scale/forest_strong.json | 11 +- configs/spmd/large_scale/incremental.json | 77 ++++++++++++++ .../large_scale/incremental/basic_stats.json | 30 ------ .../large_scale/incremental/covariance.json | 30 ------ .../large_scale/incremental/linear_model.json | 27 ----- configs/spmd/large_scale/incremental/pca.json | 30 ------ configs/spmd/large_scale/knn_strong.json | 17 +-- configs/spmd/large_scale/knn_tier1.json | 35 ++++++ .../large_scale/{knn.json => knn_tier2.json} | 
19 ++-- configs/spmd/large_scale/linreg.json | 9 +- configs/spmd/large_scale/linreg_strong.json | 7 +- configs/spmd/large_scale/logreg.json | 15 +-- configs/spmd/large_scale/logreg_strong.json | 18 ++-- configs/spmd/large_scale/pca.json | 11 +- configs/spmd/large_scale/pca_strong.json | 9 +- configs/spmd/large_scale/spmd_for_online.json | 96 ----------------- .../large_scale/spmd_for_online_strong.json | 60 ----------- sklbench/benchmarks/custom_function.py | 10 +- sklbench/benchmarks/sklearn_estimator.py | 6 +- sklbench/datasets/transformer.py | 10 +- sklbench/utils/measurement.py | 26 +++-- test-configuration-linux.yml | 5 - test-configuration-win.yml | 4 - 32 files changed, 273 insertions(+), 490 deletions(-) delete mode 100644 configs/incremental.json create mode 100644 configs/spmd/large_scale/forest_max_samples.json rename configs/spmd/large_scale/{forest.json => forest_no_max_samples.json} (58%) create mode 100644 configs/spmd/large_scale/incremental.json delete mode 100644 configs/spmd/large_scale/incremental/basic_stats.json delete mode 100644 configs/spmd/large_scale/incremental/covariance.json delete mode 100644 configs/spmd/large_scale/incremental/linear_model.json delete mode 100644 configs/spmd/large_scale/incremental/pca.json create mode 100644 configs/spmd/large_scale/knn_tier1.json rename configs/spmd/large_scale/{knn.json => knn_tier2.json} (55%) delete mode 100644 configs/spmd/large_scale/spmd_for_online.json delete mode 100644 configs/spmd/large_scale/spmd_for_online_strong.json diff --git a/configs/incremental.json b/configs/incremental.json deleted file mode 100644 index e1f589a4..00000000 --- a/configs/incremental.json +++ /dev/null @@ -1,100 +0,0 @@ -{ "INCLUDE": ["./common/sklearn.json"], - "PARAMETERS_SETS": { - "common": {"bench": {"n_runs": 10, "time_limit": 60}}, - "covariance data": { - "data": [ - { - "source": "make_blobs", - "generation_kwargs": { - "centers": 1, - "n_samples": 12000000, - "n_features": [10, 100] - }, - "split_kwargs": {"ignore": true} - } - ] - }, - "basic_statistics data": { - "data": { - "source": "make_blobs", - "generation_kwargs": { - "centers": 1, - "n_samples": 12000000, - "n_features": [10, 100] - }, - "split_kwargs": {"ignore": true} - } - }, - "linear_regression data": { - "data": { - "source": "make_regression", - "split_kwargs": {"train_size": 0.2, "test_size": 0.8}, - "generation_kwargs": { - "n_samples": 12000000, - "n_features": [10, 100], - "n_informative": 5, - "noise": 2.0 - } - } - }, - "pca data": { - "data": { - "source": "make_blobs", - "generation_kwargs": { - "centers": 1, - "n_samples": 12000000, - "n_features": [10, 100] - }, - "split_kwargs": {"ignore": true} - } - }, - "covariance": { - "algorithm": [ - { - "estimator": "IncrementalEmpiricalCovariance", - "library": "sklearnex.covariance", - "estimator_methods": {"training": "partial_fit"}, - "num_batches": {"training": 12} - } - ] - }, - "basic_statistics": { - "algorithm": [ - { - "estimator": "IncrementalBasicStatistics", - "library": "sklearnex.basic_statistics", - "estimator_methods": {"training": "partial_fit"}, - "num_batches": {"training": 12} - } - ] - }, - "linear_regression": { - "algorithm": [ - { - "estimator": "IncrementalLinearRegression", - "library": "sklearnex.linear_model", - "estimator_methods": {"training": "partial_fit"}, - "num_batches": {"training": 12} - } - ] - }, - "pca": { - "algorithm": [ - { - "estimator": "IncrementalPCA", - "library": "sklearnex.preview.decomposition", - "estimator_methods": {"training": "partial_fit"}, - 
"num_batches": {"training": 12} - } - ] - } - }, - "TEMPLATES": { - "basic_statistics": {"SETS": ["common", "basic_statistics", "basic_statistics data", "sklearn-ex[gpu] implementations"]}, - "covariance": {"SETS": ["common", "covariance", "covariance data", "sklearn-ex[gpu] implementations"]}, - "linear_regression": { - "SETS": ["common", "linear_regression", "linear_regression data", "sklearn-ex[gpu] implementations"] - }, - "pca": {"SETS": ["common", "pca", "pca data", "sklearn-ex[gpu] implementations"]} - } -} diff --git a/configs/spmd/large_scale/basic_stats.json b/configs/spmd/large_scale/basic_stats.json index d6c2c4d2..f8f44e4e 100644 --- a/configs/spmd/large_scale/basic_stats.json +++ b/configs/spmd/large_scale/basic_stats.json @@ -4,16 +4,17 @@ "spmd basicstats parameters": { "algorithm": { "estimator": "BasicStatistics", - "estimator_methods": { "training": "fit" } + "estimator_methods": { "training": "fit" }, + "sklearnex_context": { "use_raw_input": true } }, - "data": { - "split_kwargs": { "test_size": 0.0001 } - } + "data": { + "split_kwargs": { "test_size": 0.0001 } + } }, "synthetic data": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, - { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000000, "n_features": 10, "centers": 1 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 1000, "centers": 1 } } ] } }, @@ -22,7 +23,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale 2k parameters", - "synthetic data", + "synthetic data", "spmd basicstats parameters" ] } diff --git a/configs/spmd/large_scale/basic_stats_strong.json b/configs/spmd/large_scale/basic_stats_strong.json index b5b0ef69..0c7c671e 100644 --- a/configs/spmd/large_scale/basic_stats_strong.json +++ b/configs/spmd/large_scale/basic_stats_strong.json @@ -4,11 +4,12 @@ "spmd basicstats parameters": { "algorithm": { "estimator": "BasicStatistics", - "estimator_methods": { "training": "fit" } + "estimator_methods": { "training": "fit" }, + "sklearnex_context": { "use_raw_input": true } }, - "data": { - "split_kwargs": { "test_size": 0.0001 } - } + "data": { + "split_kwargs": { "test_size": 0.0001 } + } }, "synthetic data": { "data": [ @@ -20,8 +21,8 @@ "basicstats": { "SETS": [ "sklearnex spmd implementation", - "large scale strong 2k parameters", - "synthetic data", + "large scale strong <=64 parameters", + "synthetic data", "spmd basicstats parameters" ] } diff --git a/configs/spmd/large_scale/covariance.json b/configs/spmd/large_scale/covariance.json index 20da8d15..7f4d6d7d 100644 --- a/configs/spmd/large_scale/covariance.json +++ b/configs/spmd/large_scale/covariance.json @@ -4,7 +4,8 @@ "spmd basicstats parameters": { "algorithm": { "estimator": "EmpiricalCovariance", - "estimator_methods": { "training": "fit" } + "estimator_methods": { "training": "fit" }, + "sklearnex_context": { "use_raw_input": true } }, "data": { "split_kwargs": { "test_size": 0.0001 } @@ -12,8 +13,8 @@ }, "synthetic data": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, - { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000000, "n_features": 10, "centers": 1 } }, + { "source": "make_blobs", 
"generation_kwargs": { "n_samples": 1000000, "n_features": 1000, "centers": 1 } } ] } }, @@ -22,7 +23,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale 2k parameters", - "synthetic data", + "synthetic data", "spmd basicstats parameters" ] } diff --git a/configs/spmd/large_scale/covariance_strong.json b/configs/spmd/large_scale/covariance_strong.json index b8424d92..8e388801 100644 --- a/configs/spmd/large_scale/covariance_strong.json +++ b/configs/spmd/large_scale/covariance_strong.json @@ -4,7 +4,8 @@ "spmd basicstats parameters": { "algorithm": { "estimator": "EmpiricalCovariance", - "estimator_methods": { "training": "fit" } + "estimator_methods": { "training": "fit" }, + "sklearnex_context": { "use_raw_input": true } }, "data": { "split_kwargs": { "test_size": 0.0001 } @@ -20,8 +21,8 @@ "covariance": { "SETS": [ "sklearnex spmd implementation", - "large scale strong 2k parameters", - "synthetic data", + "large scale strong <=64 parameters", + "synthetic data", "spmd basicstats parameters" ] } diff --git a/configs/spmd/large_scale/dbscan.json b/configs/spmd/large_scale/dbscan.json index e4996c9e..bf60b7cc 100644 --- a/configs/spmd/large_scale/dbscan.json +++ b/configs/spmd/large_scale/dbscan.json @@ -9,7 +9,8 @@ }, "estimator_params" : { "eps": 10, "min_samples": 5 - } + }, + "sklearnex_context": { "use_raw_input": true } }, "data": { "dtype": "float64" @@ -17,7 +18,7 @@ }, "synthetic dataset": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 100, "centers": 10 } } + { "source": "make_blobs", "generation_kwargs": { "n_samples": 40000, "n_features": 100, "centers": 10 } } ] } }, diff --git a/configs/spmd/large_scale/dbscan_strong.json b/configs/spmd/large_scale/dbscan_strong.json index 04fb9016..5e7ab322 100644 --- a/configs/spmd/large_scale/dbscan_strong.json +++ b/configs/spmd/large_scale/dbscan_strong.json @@ -3,13 +3,14 @@ "PARAMETERS_SETS": { "spmd dbscan parameters": { "algorithm": { - "estimator": "DBSCAN", - "estimator_methods": { - "training": "fit" + "estimator": "DBSCAN", + "estimator_methods": { + "training": "fit" }, "estimator_params" : { - "eps": 10, "min_samples": 5 - } + "eps": 15, "min_samples": 50 + }, + "sklearnex_context": { "use_raw_input": true } }, "data": { "dtype": "float64" @@ -17,7 +18,7 @@ }, "synthetic dataset": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 500000, "n_features": 100, "centers": 10 } } + { "source": "make_blobs", "generation_kwargs": { "n_samples": 4000000, "n_features": 100, "centers": 10 } } ] } }, @@ -27,7 +28,7 @@ "common dbscan parameters", "synthetic dataset", "sklearnex spmd implementation", - "large scale strong <=64 parameters", + "large scale strong <=64 parameters", "spmd dbscan parameters" ] } diff --git a/configs/spmd/large_scale/forest_max_samples.json b/configs/spmd/large_scale/forest_max_samples.json new file mode 100644 index 00000000..95affb16 --- /dev/null +++ b/configs/spmd/large_scale/forest_max_samples.json @@ -0,0 +1,28 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd forest classification parameters": { + "algorithm": { + "estimator": "RandomForestClassifier", + "estimator_methods": { "training": "fit" }, + "estimator_params": { "n_estimators": 20, "max_depth": 10 }, + "sklearnex_context": { "use_raw_input": true } + } + }, + "synthetic data": { + "data": [ + { "source": "make_classification", "split_kwargs": { "train_size": 1000000, "test_size": 1000 }, "generation_kwargs": { 
"n_samples": 1001000, "n_features": 100, "n_classes": 2 } } + ] + } + }, + "TEMPLATES": { + "forestCls": { + "SETS": [ + "sklearnex spmd implementation", + "large scale 32 parameters", + "synthetic data", + "spmd forest classification parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/forest.json b/configs/spmd/large_scale/forest_no_max_samples.json similarity index 58% rename from configs/spmd/large_scale/forest.json rename to configs/spmd/large_scale/forest_no_max_samples.json index b4402442..c371371b 100644 --- a/configs/spmd/large_scale/forest.json +++ b/configs/spmd/large_scale/forest_no_max_samples.json @@ -4,14 +4,13 @@ "spmd forest classification parameters": { "algorithm": { "estimator": "RandomForestClassifier", - "estimator_methods": { "training": "fit" }, - "estimator_params": { "n_estimators": 20, "max_depth": 4 } + "estimator_params": { "n_estimators": 100, "max_depth": 7 }, + "sklearnex_context": { "use_raw_input": true } } }, "synthetic data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 501000, "n_features": 10, "n_classes": 2 } }, - { "source": "make_classification", "split_kwargs": { "train_size": 10000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 11000, "n_features": 1000, "n_classes": 2 } } + { "source": "make_classification", "split_kwargs": { "train_size": 1000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 1001000, "n_features": 100, "n_classes": 2 }, "n_informative": "[SPECIAL_VALUE]0.5" } ] } }, @@ -20,7 +19,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale 2k parameters", - "synthetic data", + "synthetic data", "spmd forest classification parameters" ] } diff --git a/configs/spmd/large_scale/forest_strong.json b/configs/spmd/large_scale/forest_strong.json index 23b982f5..653c70dc 100644 --- a/configs/spmd/large_scale/forest_strong.json +++ b/configs/spmd/large_scale/forest_strong.json @@ -4,13 +4,14 @@ "spmd forest classification parameters": { "algorithm": { "estimator": "RandomForestClassifier", - "estimator_methods": { "training": "fit" }, - "estimator_params": { "n_estimators": 20, "max_depth": 4 } + "estimator_methods": { "training": "fit" }, + "estimator_params": { "n_estimators": 100, "max_depth": 8 }, + "sklearnex_context": {"use_raw_input": true} } }, "synthetic data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 10000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 10001000, "n_features": 100, "n_classes": 2 } } + { "source": "make_classification", "split_kwargs": { "train_size": 20000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 21000, "n_features": 200, "n_classes": 2 } } ] } }, @@ -18,8 +19,8 @@ "forestCls": { "SETS": [ "sklearnex spmd implementation", - "large scale strong 2k parameters", - "synthetic data", + "large scale strong <=64 parameters", + "synthetic data", "spmd forest classification parameters" ] } diff --git a/configs/spmd/large_scale/incremental.json b/configs/spmd/large_scale/incremental.json new file mode 100644 index 00000000..195074ee --- /dev/null +++ b/configs/spmd/large_scale/incremental.json @@ -0,0 +1,77 @@ +{ "INCLUDE": [ ], + "PARAMETERS_SETS": { + "common incremental raw gpu params": { + "algorithm": { + "device": "gpu", + "sklearnex_context": { "use_raw_input": true } + }, + "data": { + "format":"dpctl", + "order": "C" + } + }, + "statistical batches and data": [ + { "algorithm": { "num_batches": { "training": 
[1,2,6,12]} }, "data": { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000000, "n_features": 10, "centers": 1 } } }, + { "algorithm": { "num_batches": { "training": 2} }, "data": { "source": "make_blobs", "generation_kwargs": { "n_samples": 50000000, "n_features": 10, "centers": 1 } } }, + { "algorithm": { "num_batches": { "training": 6} }, "data": { "source": "make_blobs", "generation_kwargs": { "n_samples": 16666667, "n_features": 10, "centers": 1 } } }, + { "algorithm": { "num_batches": { "training": 12} }, "data": { "source": "make_blobs", "generation_kwargs": { "n_samples": 8333333, "n_features": 10, "centers": 1 } } }, + { "algorithm": { "num_batches": { "training": [1,2,6,12]} }, "data": { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 1000, "centers": 1 } } }, + { "algorithm": { "num_batches": { "training": 2} }, "data": { "source": "make_blobs", "generation_kwargs": { "n_samples": 500000, "n_features": 1000, "centers": 1 } } }, + { "algorithm": { "num_batches": { "training": 6} }, "data": { "source": "make_blobs", "generation_kwargs": { "n_samples": 166667, "n_features": 1000, "centers": 1 } } }, + { "algorithm": { "num_batches": { "training": 12} }, "data": { "source": "make_blobs", "generation_kwargs": { "n_samples": 83333, "n_features": 1000, "centers": 1 } } } + ], + "regression batches and data": [ + { "algorithm": { "num_batches": { "training": [1,2,6,12]} }, "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 100005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 100000000, "test_size": 5000 } } }, + { "algorithm": { "num_batches": { "training": 2} }, "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 100005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 50000000, "test_size": 5000 } } }, + { "algorithm": { "num_batches": { "training": 6} }, "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 100005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 16666667, "test_size": 5000 } } }, + { "algorithm": { "num_batches": { "training": 12} }, "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 100005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 8333333, "test_size": 5000 } } }, + { "algorithm": { "num_batches": { "training": [1,2,6,12]} }, "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 3005000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 3000000, "test_size": 5000 } } }, + { "algorithm": { "num_batches": { "training": 2} }, "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 3005000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 1500000, "test_size": 5000 } } }, + { "algorithm": { "num_batches": { "training": 6} }, "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 3005000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 500000, "test_size": 5000 } } }, + { "algorithm": { "num_batches": { "training": 12} }, "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 3005000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 250000, "test_size": 5000 } } } + ], + "covariance": { + "algorithm": { + "estimator": "IncrementalEmpiricalCovariance", + "library": "sklearnex", + "estimator_methods": {"training": "partial_fit"} + }, + "data": { + "split_kwargs": { "test_size": 
0.0001 } + } + }, + "basic_statistics": { + "algorithm": { + "estimator": "IncrementalBasicStatistics", + "library": "sklearnex", + "estimator_methods": {"training": "partial_fit"} + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "linear_regression": { + "algorithm": { + "estimator": "IncrementalLinearRegression", + "library": "sklearnex", + "estimator_methods": {"training": "partial_fit"} + } + }, + "pca": { + "algorithm": { + "estimator": "IncrementalPCA", + "library": "sklearnex.preview", + "estimator_methods": {"training": "partial_fit"} + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + } + }, + "TEMPLATES": { + "basic_statistics": { "SETS": ["common incremental raw gpu params", "basic_statistics", "statistical batches and data"] }, + "covariance": { "SETS": ["common incremental raw gpu params", "covariance", "statistical batches and data"] }, + "linear_regression": { "SETS": ["common incremental raw gpu params", "linear_regression", "regression batches and data"] }, + "pca": { "SETS": ["common incremental raw gpu params", "pca", "statistical batches and data"] } + } +} diff --git a/configs/spmd/large_scale/incremental/basic_stats.json b/configs/spmd/large_scale/incremental/basic_stats.json deleted file mode 100644 index ca9e3eb9..00000000 --- a/configs/spmd/large_scale/incremental/basic_stats.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "INCLUDE": ["../../../common/sklearn.json", "../../../spmd/stats_covariance.json", "../large_scale.json"], - "PARAMETERS_SETS": { - "spmd basicstats parameters": { - "algorithm": { - "estimator": "IncrementalBasicStatistics", - "estimator_methods": { "training": "fit" }, - "num_batches": {"training": 10} - }, - "data": { - "split_kwargs": { "test_size": 0.0001 } - } - }, - "synthetic data": { - "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } - ] - } - }, - "TEMPLATES": { - "basicstats": { - "SETS": [ - "sklearnex spmd implementation", - "large scale 32 parameters", - "synthetic data", - "spmd basicstats parameters" - ] - } - } -} diff --git a/configs/spmd/large_scale/incremental/covariance.json b/configs/spmd/large_scale/incremental/covariance.json deleted file mode 100644 index 04fcd76b..00000000 --- a/configs/spmd/large_scale/incremental/covariance.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "INCLUDE": ["../../../common/sklearn.json", "../../../spmd/stats_covariance.json", "../large_scale.json"], - "PARAMETERS_SETS": { - "spmd covariance parameters": { - "algorithm": { - "estimator": "IncrementalEmpiricalCovariance", - "estimator_methods": { "training": "fit" }, - "num_batches": {"training": 10} - }, - "data": { - "split_kwargs": { "test_size": 0.0001 } - } - }, - "synthetic data": { - "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } - ] - } - }, - "TEMPLATES": { - "covariance": { - "SETS": [ - "sklearnex spmd implementation", - "large scale 32 parameters", - "synthetic data", - "spmd covariance parameters" - ] - } - } -} diff --git a/configs/spmd/large_scale/incremental/linear_model.json b/configs/spmd/large_scale/incremental/linear_model.json deleted file mode 100644 index a483f613..00000000 --- a/configs/spmd/large_scale/incremental/linear_model.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "INCLUDE": ["../../../common/sklearn.json", "../../../regular/linear_model.json", "../large_scale.json"], - "PARAMETERS_SETS": { - "spmd linear parameters": { - "algorithm": { - "estimator": 
"IncrementalLinearRegression", - "estimator_methods": { "training": "fit" }, - "num_batches": {"training": 10} - } - }, - "synthetic data": { - "data": [ - { "source": "make_regression", "generation_kwargs": { "n_samples": 305000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 300000, "test_size": 5000 } } - ] - } - }, - "TEMPLATES": { - "linreg": { - "SETS": [ - "sklearnex spmd implementation", - "large scale 32 parameters", - "synthetic data", - "spmd linear parameters" - ] - } - } -} diff --git a/configs/spmd/large_scale/incremental/pca.json b/configs/spmd/large_scale/incremental/pca.json deleted file mode 100644 index 11fa5125..00000000 --- a/configs/spmd/large_scale/incremental/pca.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "INCLUDE": ["../../../common/sklearn.json", "../../../regular/pca.json", "../large_scale.json"], - "PARAMETERS_SETS": { - "spmd pca parameters": { - "algorithm": { - "estimator": "IncrementalPCA", - "estimator_methods": { "training": "fit", "inference": "" }, - "num_batches": {"training": 10} - }, - "data": { - "split_kwargs": { "test_size": 0.0001 } - } - }, - "synthetic data": { - "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } - ] - } - }, - "TEMPLATES": { - "linreg": { - "SETS": [ - "sklearnex spmd implementation", - "large scale 32 parameters", - "synthetic data", - "spmd pca parameters" - ] - } - } -} diff --git a/configs/spmd/large_scale/knn_strong.json b/configs/spmd/large_scale/knn_strong.json index d202f6e4..36daf3f1 100644 --- a/configs/spmd/large_scale/knn_strong.json +++ b/configs/spmd/large_scale/knn_strong.json @@ -3,23 +3,24 @@ "PARAMETERS_SETS": { "spmd knn cls parameters": { "algorithm": { - "estimator": "KNeighborsClassifier", + "estimator": "KNeighborsClassifier", "estimator_params": { "algorithm": "brute", "metric": "minkowski", "p": 2, "weights": "uniform", - "n_neighbors": 5 + "n_neighbors": 100 }, - "estimator_methods": { - "training": "fit", - "inference": "predict" - } + "estimator_methods": { + "training": "fit", + "inference": "predict" + }, + "sklearnex_context": { "use_raw_input": true } } }, "synthetic classification data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000000 }, "generation_kwargs": { "n_samples": 1500000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + { "source": "make_classification", "split_kwargs": { "train_size": 3000000, "test_size": 2000000 }, "generation_kwargs": { "n_samples": 5000000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } ] } }, @@ -28,7 +29,7 @@ "SETS": [ "synthetic classification data", "sklearnex spmd implementation", - "large scale strong <=64 parameters", + "large scale strong <=64 parameters", "spmd knn cls parameters" ] } diff --git a/configs/spmd/large_scale/knn_tier1.json b/configs/spmd/large_scale/knn_tier1.json new file mode 100644 index 00000000..c230cc4e --- /dev/null +++ b/configs/spmd/large_scale/knn_tier1.json @@ -0,0 +1,35 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd knn cls parameters": { + "algorithm": { + "estimator": "KNeighborsClassifier", + "estimator_params": { + "algorithm": "brute", + "metric": "minkowski", + "p": 2, + "weights": "uniform" + }, + "estimator_methods": { + "training": "fit", + "inference": "predict" + }, + "sklearnex_context": { "use_raw_input": true } + } + }, + "synthetic classification data": [ 
+ { "data": { "source": "make_classification", "split_kwargs": { "train_size": 100000, "test_size": 100000}, "generation_kwargs": { "n_samples": 2000000, "n_features": 10, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, "algorithm": { "estimator_params": { "n_neighbors": 5 } } }, + { "data": { "source": "make_classification", "split_kwargs": { "train_size": 100000, "test_size": 10000}, "generation_kwargs": { "n_samples": 2000000, "n_features": 1000, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, "algorithm": { "estimator_params": { "n_neighbors": 100 } } } + ] + }, + "TEMPLATES": { + "knn classifier": { + "SETS": [ + "synthetic classification data", + "sklearnex spmd implementation", + "large scale 32 parameters", + "spmd knn cls parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/knn.json b/configs/spmd/large_scale/knn_tier2.json similarity index 55% rename from configs/spmd/large_scale/knn.json rename to configs/spmd/large_scale/knn_tier2.json index b68b94af..ff0032e2 100644 --- a/configs/spmd/large_scale/knn.json +++ b/configs/spmd/large_scale/knn_tier2.json @@ -3,23 +3,24 @@ "PARAMETERS_SETS": { "spmd knn cls parameters": { "algorithm": { - "estimator": "KNeighborsClassifier", + "estimator": "KNeighborsClassifier", "estimator_params": { "algorithm": "brute", "metric": "minkowski", "p": 2, "weights": "uniform", - "n_neighbors": 5 + "n_neighbors": 5 }, - "estimator_methods": { - "training": "fit", - "inference": "predict" - } - } + "estimator_methods": { + "training": "fit", + "inference": "predict" + }, + "sklearnex_context": { "use_raw_input": true } + } }, "synthetic classification data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 5000 }, "generation_kwargs": { "n_samples": 5005000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + { "source": "make_classification", "split_kwargs": { "train_size": 100, "test_size": 100}, "generation_kwargs": { "n_samples": 200, "n_features": 10, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } ] } }, @@ -28,7 +29,7 @@ "SETS": [ "synthetic classification data", "sklearnex spmd implementation", - "large scale 2k parameters", + "large scale 2k parameters", "spmd knn cls parameters" ] } diff --git a/configs/spmd/large_scale/linreg.json b/configs/spmd/large_scale/linreg.json index ea45a52c..7c7fb035 100644 --- a/configs/spmd/large_scale/linreg.json +++ b/configs/spmd/large_scale/linreg.json @@ -4,13 +4,14 @@ "spmd linear parameters": { "algorithm": { "estimator": "LinearRegression", - "estimator_methods": { "training": "fit" } + "estimator_methods": { "training": "fit" }, + "sklearnex_context": { "use_raw_input": true } } }, "synthetic data": { "data": [ - { "source": "make_regression", "generation_kwargs": { "n_samples": 30005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 30000000, "test_size": 5000 } }, - { "source": "make_regression", "generation_kwargs": { "n_samples": 305000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 300000, "test_size": 5000 } } + { "source": "make_regression", "generation_kwargs": { "n_samples": 100005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 100000000, "test_size": 5000 } }, + { "source": "make_regression", "generation_kwargs": { "n_samples": 3005000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 3000000, "test_size": 5000 } } ] } }, @@ -19,7 +20,7 @@ "SETS": [ "sklearnex spmd implementation", 
"large scale 2k parameters", - "synthetic data", + "synthetic data", "spmd linear parameters" ] } diff --git a/configs/spmd/large_scale/linreg_strong.json b/configs/spmd/large_scale/linreg_strong.json index 629bf544..ac5a6c7a 100644 --- a/configs/spmd/large_scale/linreg_strong.json +++ b/configs/spmd/large_scale/linreg_strong.json @@ -4,7 +4,8 @@ "spmd linear parameters": { "algorithm": { "estimator": "LinearRegression", - "estimator_methods": { "training": "fit" } + "estimator_methods": { "training": "fit" }, + "sklearnex_context": { "use_raw_input": true } } }, "synthetic data": { @@ -17,8 +18,8 @@ "linreg": { "SETS": [ "sklearnex spmd implementation", - "large scale strong 2k parameters", - "synthetic data", + "large scale strong <=64 parameters", + "synthetic data", "spmd linear parameters" ] } diff --git a/configs/spmd/large_scale/logreg.json b/configs/spmd/large_scale/logreg.json index 326f2580..b7b4b998 100644 --- a/configs/spmd/large_scale/logreg.json +++ b/configs/spmd/large_scale/logreg.json @@ -2,15 +2,16 @@ "INCLUDE": ["../../common/sklearn.json", "../logreg.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd logreg2 parameters": { - "algorithm":{ - "estimator": "LogisticRegression", + "algorithm":{ + "estimator": "LogisticRegression", "estimator_methods": { "inference": "predict" }, - "estimator_params": { "max_iter": 20 } + "estimator_params": { "max_iter": 10 }, + "sklearnex_context": { "use_raw_input": true } } - }, + }, "synthetic data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 10, "n_classes": 2 } }, + { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 10, "n_classes": 2 } }, { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 501000, "n_features": 1000, "n_classes": 2, "n_informative": 40, "n_clusters_per_class": 3, "flip_y": 0.05 } } ] } @@ -21,8 +22,8 @@ "sklearnex spmd implementation", "large scale 2k parameters", "spmd logreg parameters", - "synthetic data", - "spmd logreg2 parameters" + "synthetic data", + "spmd logreg2 parameters" ] } } diff --git a/configs/spmd/large_scale/logreg_strong.json b/configs/spmd/large_scale/logreg_strong.json index 0b79ba9d..219840ea 100644 --- a/configs/spmd/large_scale/logreg_strong.json +++ b/configs/spmd/large_scale/logreg_strong.json @@ -2,16 +2,16 @@ "INCLUDE": ["../../common/sklearn.json", "../logreg.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd logreg2 parameters": { - "algorithm":{ - "estimator": "LogisticRegression", + "algorithm":{ + "estimator": "LogisticRegression", "estimator_methods": { "inference": "predict" }, - "estimator_params": { "max_iter": 16 } + "estimator_params": { "max_iter": 16 }, + "sklearnex_context": { "use_raw_input": true } } - }, + }, "synthetic data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 10000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 10001000, "n_features": 100, "n_classes": 2 } }, - { "source": "make_classification", "split_kwargs": { "train_size": 12000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 12001000, "n_features": 200, "n_classes": 2, "n_informative": 40, "n_clusters_per_class": 3, "flip_y": 0.05 } } + { "source": "make_classification", "split_kwargs": { "train_size": 12000000, "test_size": 1000 
}, "generation_kwargs": { "n_samples": 12001000, "n_features": 200, "n_classes": 2, "n_informative": 40, "n_clusters_per_class": 3, "flip_y": 0.05 } } ] } }, @@ -19,10 +19,10 @@ "logreg": { "SETS": [ "sklearnex spmd implementation", - "large scale strong 2k parameters", + "large scale strong 64 parameters", "spmd logreg parameters", - "synthetic data", - "spmd logreg2 parameters" + "synthetic data", + "spmd logreg2 parameters" ] } } diff --git a/configs/spmd/large_scale/pca.json b/configs/spmd/large_scale/pca.json index d0ee879a..ce56bd8a 100644 --- a/configs/spmd/large_scale/pca.json +++ b/configs/spmd/large_scale/pca.json @@ -4,7 +4,8 @@ "spmd pca parameters": { "algorithm": { "estimator": "PCA", - "estimator_methods": { "training": "fit", "inference": "" } + "estimator_methods": { "training": "fit", "inference": "" }, + "sklearnex_context": { "use_raw_input": true } }, "data": { "split_kwargs": { "test_size": 0.0001 } @@ -12,18 +13,18 @@ }, "synthetic data": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, - { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000000, "n_features": 10, "centers": 1 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 1000, "centers": 1 } } ] } }, "TEMPLATES": { "linreg": { "SETS": [ - "sklearnex spmd implementation", + "sklearnex spmd implementation", "large scale 2k parameters", "synthetic data", - "spmd pca parameters" + "spmd pca parameters" ] } } diff --git a/configs/spmd/large_scale/pca_strong.json b/configs/spmd/large_scale/pca_strong.json index 3cb33e72..70461ba7 100644 --- a/configs/spmd/large_scale/pca_strong.json +++ b/configs/spmd/large_scale/pca_strong.json @@ -4,7 +4,8 @@ "spmd pca parameters": { "algorithm": { "estimator": "PCA", - "estimator_methods": { "training": "fit", "inference": "" } + "estimator_methods": { "training": "fit", "inference": "" }, + "sklearnex_context": { "use_raw_input": true } }, "data": { "split_kwargs": { "test_size": 0.0001 } @@ -19,10 +20,10 @@ "TEMPLATES": { "linreg": { "SETS": [ - "sklearnex spmd implementation", - "large scale strong 2k parameters", + "sklearnex spmd implementation", + "large scale strong <=64 parameters", "synthetic data", - "spmd pca parameters" + "spmd pca parameters" ] } } diff --git a/configs/spmd/large_scale/spmd_for_online.json b/configs/spmd/large_scale/spmd_for_online.json deleted file mode 100644 index 2ef60f5b..00000000 --- a/configs/spmd/large_scale/spmd_for_online.json +++ /dev/null @@ -1,96 +0,0 @@ -{ - "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], - "PARAMETERS_SETS": { - "covariance data": { - "data": [ - { - "source": "make_blobs", - "generation_kwargs": { - "centers": 1, - "n_samples": 1000000, - "n_features": [10, 100] - }, - "split_kwargs": {"ignore": true} - } - ] - }, - "basic_statistics data": { - "data": { - "source": "make_blobs", - "generation_kwargs": { - "centers": 1, - "n_samples": 1000000, - "n_features": [10, 100] - }, - "split_kwargs": {"ignore": true} - } - }, - "linear_regression data": { - "data": { - "source": "make_regression", - "split_kwargs": {"train_size": 0.2, "test_size": 0.8}, - "generation_kwargs": { - "n_samples": 1000000, - "n_features": [10, 100], - "n_informative": 5, - "noise": 2.0 - } - } - }, - "pca data": { - "data": { - "source": "make_blobs", - "generation_kwargs": { - "centers": 1, - 
"n_samples": 1000000, - "n_features": [10, 100] - }, - "split_kwargs": {"ignore": true} - } - }, - "basic_statistics": { - "algorithm": [ - { - "estimator": "BasicStatistics", - "library": "sklearnex.spmd", - "estimator_methods": {"training": "fit"} - } - ] - }, - "covariance": { - "algorithm": [ - { - "estimator": "EmpiricalCovariance", - "library": "sklearnex.spmd", - "estimator_methods": {"training": "fit"} - } - ] - }, - "linear_regression": { - "algorithm": [ - { - "estimator": "LinearRegression", - "library": "sklearnex.spmd", - "estimator_methods": {"training": "fit"} - } - ] - }, - "pca": { - "algorithm": [ - { - "estimator": "PCA", - "library": "sklearnex.spmd", - "estimator_methods": {"training": "fit", "inference": ""} - } - ] - } - }, - "TEMPLATES": { - "basic_statistics": {"SETS": ["basic_statistics", "basic_statistics data", "sklearnex spmd implementation", "large scale full one node parameters"]}, - "covariance": {"SETS": ["covariance", "covariance data", "sklearnex spmd implementation", "large scale full one node parameters"]}, - "linear_regression": { - "SETS": ["linear_regression", "linear_regression data", "sklearnex spmd implementation", "large scale full one node parameters"] - }, - "pca": {"SETS": ["pca", "pca data", "sklearnex spmd implementation", "large scale full one node parameters"]} - } -} diff --git a/configs/spmd/large_scale/spmd_for_online_strong.json b/configs/spmd/large_scale/spmd_for_online_strong.json deleted file mode 100644 index 77a25075..00000000 --- a/configs/spmd/large_scale/spmd_for_online_strong.json +++ /dev/null @@ -1,60 +0,0 @@ -{ - "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], - "PARAMETERS_SETS": { - "covariance data": { - "data": [ - { - "source": "make_blobs", - "generation_kwargs": { - "centers": 1, - "n_samples": 12000000, - "n_features": [10, 100] - }, - "split_kwargs": {"ignore": true} - } - ] - }, - "basic_statistics data": { - "data": { - "source": "make_blobs", - "generation_kwargs": { - "centers": 1, - "n_samples": 12000000, - "n_features": [10, 100] - }, - "split_kwargs": {"ignore": true} - } - }, - "linear_regression data": { - "data": { - "source": "make_regression", - "split_kwargs": {"train_size": 0.2, "test_size": 0.8}, - "generation_kwargs": { - "n_samples": 12000000, - "n_features": [10, 100], - "n_informative": 5, - "noise": 2.0 - } - } - }, - "pca data": { - "data": { - "source": "make_blobs", - "generation_kwargs": { - "centers": 1, - "n_samples": 12000000, - "n_features": [10, 100] - }, - "split_kwargs": {"ignore": true} - } - } - }, - "TEMPLATES": { - "basic_statistics": {"SETS": ["basic_statistics data", "spmd default parameters", "sklearnex spmd implementation", "large scale strong full one node parameters"]}, - "covariance": {"SETS": ["covariance data", "spmd default parameters","sklearnex spmd implementation", "large scale strong full one node parameters"]}, - "linear_regression": { - "SETS": ["linear_regression data", "spmd default parameters", "sklearnex spmd implementation", "large scale strong full one node parameters"] - }, - "pca": {"SETS": ["pca data", "spmd default parameters", "sklearnex spmd implementation", "large scale strong full one node parameters"]} - } -} diff --git a/sklbench/benchmarks/custom_function.py b/sklbench/benchmarks/custom_function.py index 25abb900..34b223ed 100644 --- a/sklbench/benchmarks/custom_function.py +++ b/sklbench/benchmarks/custom_function.py @@ -64,9 +64,13 @@ def get_function_args(bench_case: BenchCase, x_train, y_train, x_test, y_test) - def 
measure_function_instance(bench_case, function_instance, args: Tuple, kwargs: Dict): metrics = dict() - metrics["time[ms]"], metrics["time std[ms]"], _ = measure_case( - bench_case, function_instance, *args, **kwargs - ) + ( + metrics["time[ms]"], + metrics["time std[ms]"], + metrics["first iter[ms]"], + metrics["box filter mean[ms]"], + metrics["box filter std[ms]"], + ) = measure_case(bench_case, function_instance, *args, **kwargs) return metrics diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index e57a9038..4164a10d 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -516,7 +516,11 @@ def main(bench_case: BenchCase, filters: List[BenchCase]): bench_case, "algorithm:estimator_params", dict() ) # logger.debug("estimator params: " + str(estimator_params)) - if "DBSCAN" in str(estimator_name): + if ( + "DBSCAN" in str(estimator_name) + and get_bench_case_value(bench_case, "data:distributed_split", None) + != "rank_based" + ): if "min_samples" in estimator_params: from mpi4py import MPI diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 86944ead..38b4fe3b 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -109,11 +109,11 @@ def split_and_transform_data(bench_case, data, data_description): y_train, y_test = None, None distributed_split = get_bench_case_value(bench_case, "data:distributed_split", None) - knn_split_train = ( - "KNeighbors" in get_bench_case_value(bench_case, "algorithm:estimator", "") - and int(get_bench_case_value(bench_case, "bench:mpi_params:n", 1)) > 1 - ) - if distributed_split == "rank_based" or knn_split_train: + # knn_split_train = ( + # "KNeighbors" in get_bench_case_value(bench_case, "algorithm:estimator", "") + # and int(get_bench_case_value(bench_case, "bench:mpi_params:n", 1)) > 1 + # ) + if distributed_split == "rank_based": from mpi4py import MPI comm = MPI.COMM_WORLD diff --git a/sklbench/utils/measurement.py b/sklbench/utils/measurement.py index bfabbdc0..3677e760 100644 --- a/sklbench/utils/measurement.py +++ b/sklbench/utils/measurement.py @@ -79,7 +79,7 @@ def measure_time( t0 = timeit.default_timer() func_return_value = func(*args, **kwargs) t1 = timeit.default_timer() - if hasattr(func.__self__, "_n_inner_iter"): + if hasattr(func, "__self__") and hasattr(func.__self__, "_n_inner_iter"): inners.append(func.__self__._n_inner_iter) iters.append(func.__self__.n_iter_) if enable_itt and itt_is_available: @@ -92,16 +92,20 @@ def measure_time( f"exceeded time limit ({time_limit} seconds)" ) break - from mpi4py import MPI - - if MPI.COMM_WORLD.Get_rank() == 0: - logger.debug( - "iters across n runs: " - + str(iters) - + ", inner iters across n runs: " - + str(inners) - ) - logger.debug(times) + + try: + from mpi4py import MPI + + if MPI.COMM_WORLD.Get_rank() == 0: + logger.debug( + "iters across n runs: " + + str(iters) + + ", inner iters across n runs: " + + str(inners) + ) + logger.debug(f"Runtime for all {n_runs} iterations: {times}") + except ModuleNotFoundError: + logger.debug(f"Runtime for all {n_runs} iterations: {times}") # mean, std = box_filter(times) # if std / mean > std_mean_ratio: # logger.warning( diff --git a/test-configuration-linux.yml b/test-configuration-linux.yml index 722d1008..a37769ce 100644 --- a/test-configuration-linux.yml +++ b/test-configuration-linux.yml @@ -45,11 +45,6 @@ steps: conda activate bench-env python -m sklbench --report -l DEBUG --report -c 
configs/sklearn_example.json displayName: Sklearn example run - - script: | - source /usr/share/miniconda/etc/profile.d/conda.sh - conda activate bench-env - python -m sklbench --report -l DEBUG --report -c configs/incremental.json - displayName: Incremental algorithms example run - script: | source /usr/share/miniconda/etc/profile.d/conda.sh conda activate bench-env diff --git a/test-configuration-win.yml b/test-configuration-win.yml index f3ac1595..a1eddaeb 100644 --- a/test-configuration-win.yml +++ b/test-configuration-win.yml @@ -43,10 +43,6 @@ steps: call activate bench-env python -m sklbench --report -l DEBUG --report -c configs/sklearn_example.json displayName: Sklearn example run - - script: | - call activate bench-env - python -m sklbench --report -l DEBUG --report -c configs/incremental.json - displayName: Incremental algorithms example run - script: | call activate bench-env python -m sklbench --report -l DEBUG --report -c configs/xgboost_example.json From f0fccdd263725f143ba827e30597e84d78c1352c Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Fri, 21 Mar 2025 15:02:15 -0700 Subject: [PATCH 101/110] Revert "Removed environment from diff." This reverts commit 15db7929f76696956a4fe0529e8addef35f95f74. --- sklbench/report/implementation.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklbench/report/implementation.py b/sklbench/report/implementation.py index cddb45f3..2bc3a05e 100644 --- a/sklbench/report/implementation.py +++ b/sklbench/report/implementation.py @@ -100,8 +100,7 @@ "batch_size", ] -#DIFFBY_COLUMNS = ["environment_name", "library", "format", "device"] -DIFFBY_COLUMNS = ["library", "format", "device"] +DIFFBY_COLUMNS = ["environment_name", "library", "format", "device"] def geomean_wrapper(a): From 4d675ecdfbb117750fc79ebaeb66a7feaa2444d5 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Fri, 21 Mar 2025 15:29:50 -0700 Subject: [PATCH 102/110] Removed extra code for sample_shift. --- sklbench/datasets/transformer.py | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 44476871..34b438af 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -117,34 +117,21 @@ def split_and_transform_data(bench_case, data, data_description): if distributed_split == "sample_shift": from mpi4py import MPI - comm = MPI.COMM_WORLD - rank = comm.Get_rank() - size = comm.Get_size() - - n_train = len(x_train) - n_test = len(x_test) - - train_start = 0 - train_end = n_train - test_start = 0 - test_end = n_test - + rank = MPI.COMM_WORLD.Get_rank() adjust_number = (math.sqrt(rank) * 0.003) + 1 if "y" in data: x_train, y_train = ( - x_train[train_start:train_end] * adjust_number, - y_train[train_start:train_end], + x_train * adjust_number, + y_train, ) x_test, y_test = ( - x_test[test_start:test_end] * adjust_number, - y_test[test_start:test_end], + x_test * adjust_number, + y_test, ) else: - x_train = x_train[train_start:train_end] - - x_test = x_test[test_start:test_end] * adjust_number + x_test = x_test * adjust_number elif distributed_split == "rank_based": from mpi4py import MPI From e8fbd0b3784b957d4c8555228063f544b381a5b5 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Fri, 21 Mar 2025 15:39:33 -0700 Subject: [PATCH 103/110] Changes for sample_shift. 
--- configs/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/configs/README.md b/configs/README.md index 8d3c5ac2..b1219124 100644 --- a/configs/README.md +++ b/configs/README.md @@ -105,6 +105,7 @@ Configs have the three highest parameter keys: | `data`:`order` | `F` | `C`, `F` | Data order to use in benchmark: contiguous(C) or Fortran. | | `data`:`dtype` | `float64` | | Data type to use in benchmark. | | `data`:`distributed_split` | None | None, `rank_based` | Split type used to distribute data between machines in distributed algorithm. `None` type means usage of all data without split on all machines. `rank_based` type splits the data equally between machines with split sequence based on rank id from MPI. | +| `data`: `sample_shift` | None | None, `rank_based` | Determines how data points are shifted based on MPI rank. `None` type means use all data without any shift across all machines. `sample_shift`: Shift each data point in a rank by \((\sqrt{\text{rank id}} \times 0.003) + 1\). | |
<h3>Algorithm parameters</h3>
|||| | `algorithm`:`library` | None | | Python module containing measured entity (class or function). | | `algorithm`:`device` | `default` | `default`, `cpu`, `gpu` | Device selected for computation. | From a7cea17bed5f6ad721a98b7d15ae6d64bb35c75f Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Fri, 21 Mar 2025 15:44:53 -0700 Subject: [PATCH 104/110] Updated sample shift. --- configs/README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/configs/README.md b/configs/README.md index b1219124..91b45b81 100644 --- a/configs/README.md +++ b/configs/README.md @@ -104,8 +104,7 @@ Configs have the three highest parameter keys: | `data`:`format` | `pandas` | `pandas`, `numpy`, `cudf` | Data format to use in benchmark. | | `data`:`order` | `F` | `C`, `F` | Data order to use in benchmark: contiguous(C) or Fortran. | | `data`:`dtype` | `float64` | | Data type to use in benchmark. | -| `data`:`distributed_split` | None | None, `rank_based` | Split type used to distribute data between machines in distributed algorithm. `None` type means usage of all data without split on all machines. `rank_based` type splits the data equally between machines with split sequence based on rank id from MPI. | -| `data`: `sample_shift` | None | None, `rank_based` | Determines how data points are shifted based on MPI rank. `None` type means use all data without any shift across all machines. `sample_shift`: Shift each data point in a rank by \((\sqrt{\text{rank id}} \times 0.003) + 1\). | +| `data`:`distributed_split` | None | None, `rank_based`, `sample_shift` | `rank_based` Split type used to distribute data between machines in distributed algorithm. `sample_shift`: Shift each data point in a rank by \((\sqrt{\text{rank id}} \times 0.003) + 1\). `None` type means usage of all data without split on all machines. `rank_based` type splits the data equally between machines with split sequence based on rank id from MPI. | |
<h3>Algorithm parameters</h3>
|||| | `algorithm`:`library` | None | | Python module containing measured entity (class or function). | | `algorithm`:`device` | `default` | `default`, `cpu`, `gpu` | Device selected for computation. | From f3c2757e87dd10a723ff8a241b6cdd78d16a3bd7 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Fri, 21 Mar 2025 15:47:14 -0700 Subject: [PATCH 105/110] Updated sample shift. --- configs/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/README.md b/configs/README.md index 91b45b81..3d16a6ce 100644 --- a/configs/README.md +++ b/configs/README.md @@ -104,7 +104,7 @@ Configs have the three highest parameter keys: | `data`:`format` | `pandas` | `pandas`, `numpy`, `cudf` | Data format to use in benchmark. | | `data`:`order` | `F` | `C`, `F` | Data order to use in benchmark: contiguous(C) or Fortran. | | `data`:`dtype` | `float64` | | Data type to use in benchmark. | -| `data`:`distributed_split` | None | None, `rank_based`, `sample_shift` | `rank_based` Split type used to distribute data between machines in distributed algorithm. `sample_shift`: Shift each data point in a rank by \((\sqrt{\text{rank id}} \times 0.003) + 1\). `None` type means usage of all data without split on all machines. `rank_based` type splits the data equally between machines with split sequence based on rank id from MPI. | +| `data`:`distributed_split` | None | None, `rank_based`, `sample_shift` | `rank_based` Split type used to distribute data between machines in distributed algorithm. `sample_shift`: Shift each data point in each rank by sqrt (rank id) * 0.003) + 1. `None` type means usage of all data without split on all machines. `rank_based` type splits the data equally between machines with split sequence based on rank id from MPI. | |
<h3>Algorithm parameters</h3>
|||| | `algorithm`:`library` | None | | Python module containing measured entity (class or function). | | `algorithm`:`device` | `default` | `default`, `cpu`, `gpu` | Device selected for computation. | From 2ae3c394547490c0c6782af591d6eec7ee7e8838 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Fri, 21 Mar 2025 15:52:40 -0700 Subject: [PATCH 106/110] Removed extra code. --- sklbench/datasets/transformer.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 34b438af..79a6d7a0 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -119,19 +119,7 @@ def split_and_transform_data(bench_case, data, data_description): rank = MPI.COMM_WORLD.Get_rank() adjust_number = (math.sqrt(rank) * 0.003) + 1 - - if "y" in data: - x_train, y_train = ( - x_train * adjust_number, - y_train, - ) - - x_test, y_test = ( - x_test * adjust_number, - y_test, - ) - else: - x_test = x_test * adjust_number + x_test = x_test * adjust_number elif distributed_split == "rank_based": from mpi4py import MPI From 39cc4f2de559d7f1b96b01f4f58931e0b3ef58a9 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Fri, 21 Mar 2025 16:07:06 -0700 Subject: [PATCH 107/110] Added comment for sample_shift. --- sklbench/datasets/transformer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 79a6d7a0..81bdf5fb 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -118,6 +118,10 @@ def split_and_transform_data(bench_case, data, data_description): from mpi4py import MPI rank = MPI.COMM_WORLD.Get_rank() + # This approach was chosen to shift the distribution of synthetic data on each rank + # for KMeans weak scaling tests. When testing with a large number of tiles, this method avoids duplication of data on each rank. + # For example, if there are 24,576 tiles being used, each data point in the 24,576th tile would be multiplied by 1.47. + # The factor 0.003 was chosen arbitrarily and can be fine-tuned for other datasets and algorithms if needed. adjust_number = (math.sqrt(rank) * 0.003) + 1 x_test = x_test * adjust_number From 3fc7c42dfc1ea52bb19bffb944bd675a8b1dd093 Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Fri, 21 Mar 2025 16:14:18 -0700 Subject: [PATCH 108/110] Added back in x_train in sample_shift. --- sklbench/datasets/transformer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 81bdf5fb..c63d3b20 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -124,6 +124,7 @@ def split_and_transform_data(bench_case, data, data_description): # The factor 0.003 was chosen arbitrarily and can be fine-tuned for other datasets and algorithms if needed. adjust_number = (math.sqrt(rank) * 0.003) + 1 x_test = x_test * adjust_number + x_train = x_train * adjust_number elif distributed_split == "rank_based": from mpi4py import MPI From 1bd5aa150735d97efeb8776355f1dccbd71fdf3c Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Fri, 21 Mar 2025 16:17:59 -0700 Subject: [PATCH 109/110] Updated description of sample_shift. 
--- configs/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/README.md b/configs/README.md index 3d16a6ce..e1cf8390 100644 --- a/configs/README.md +++ b/configs/README.md @@ -104,7 +104,7 @@ Configs have the three highest parameter keys: | `data`:`format` | `pandas` | `pandas`, `numpy`, `cudf` | Data format to use in benchmark. | | `data`:`order` | `F` | `C`, `F` | Data order to use in benchmark: contiguous(C) or Fortran. | | `data`:`dtype` | `float64` | | Data type to use in benchmark. | -| `data`:`distributed_split` | None | None, `rank_based`, `sample_shift` | `rank_based` Split type used to distribute data between machines in distributed algorithm. `sample_shift`: Shift each data point in each rank by sqrt (rank id) * 0.003) + 1. `None` type means usage of all data without split on all machines. `rank_based` type splits the data equally between machines with split sequence based on rank id from MPI. | +| `data`:`distributed_split` | None | None, `rank_based`, `sample_shift` | Split type used to distribute data between machines in distributed algorithm. `sample_shift`: Shift each data point in each rank by sqrt (rank id) * 0.003) + 1. `None` type means usage of all data without split on all machines. `rank_based` type splits the data equally between machines with split sequence based on rank id from MPI. | |
<h3>Algorithm parameters</h3>
|||| | `algorithm`:`library` | None | | Python module containing measured entity (class or function). | | `algorithm`:`device` | `default` | `default`, `cpu`, `gpu` | Device selected for computation. | From 06944c172997de5e5e1659cc973637bea8d8704b Mon Sep 17 00:00:00 2001 From: "Mcgrievy, Kathleen" Date: Fri, 21 Mar 2025 16:20:52 -0700 Subject: [PATCH 110/110] Added predict back in. --- configs/spmd/large_scale/kmeans_narrow_weak.json | 2 +- configs/spmd/large_scale/kmeans_wide_weak.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/spmd/large_scale/kmeans_narrow_weak.json b/configs/spmd/large_scale/kmeans_narrow_weak.json index 523aba01..69f0b6ac 100644 --- a/configs/spmd/large_scale/kmeans_narrow_weak.json +++ b/configs/spmd/large_scale/kmeans_narrow_weak.json @@ -10,7 +10,7 @@ "n_clusters": 10, "random_state": 42 }, - "estimator_methods": { "training": "fit", "inference": "" }, + "estimator_methods": { "training": "fit", "inference": "predict" }, "sklearnex_context": { "use_raw_input": true } } }, diff --git a/configs/spmd/large_scale/kmeans_wide_weak.json b/configs/spmd/large_scale/kmeans_wide_weak.json index 1c588d60..5520f10a 100644 --- a/configs/spmd/large_scale/kmeans_wide_weak.json +++ b/configs/spmd/large_scale/kmeans_wide_weak.json @@ -10,7 +10,7 @@ "n_clusters": 10, "random_state": 42 }, - "estimator_methods": { "training": "fit", "inference": "" }, + "estimator_methods": { "training": "fit", "inference": "predict" }, "sklearnex_context": { "use_raw_input": true } } },
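
Note for readers following the `sample_shift` work in patches 102-109: below is a minimal, self-contained sketch (not part of the patch series) of how the per-rank multiplier added to split_and_transform_data() in sklbench/datasets/transformer.py behaves. The formula (sqrt(rank) * 0.003) + 1 comes from the diffs above; the rank values, array sizes, and the sample_shift_multiplier helper name are illustrative assumptions only, and a real benchmark run obtains the rank from mpi4py's MPI.COMM_WORLD as the patched code does.

import math

import numpy as np


def sample_shift_multiplier(rank: int) -> float:
    # Same formula the patches add for distributed_split == "sample_shift":
    # the shift grows slowly with the MPI rank so each rank sees a slightly
    # different synthetic distribution instead of an exact duplicate.
    return math.sqrt(rank) * 0.003 + 1


# Tiny stand-ins for the synthetic blobs generated on one rank (sizes are arbitrary).
x_train = np.ones((4, 2))
x_test = np.ones((2, 2))

for rank in (0, 1, 64, 24576):
    adjust = sample_shift_multiplier(rank)
    shifted_train = x_train * adjust  # patch 108 scales the train split as well
    shifted_test = x_test * adjust
    print(f"rank {rank:>5}: multiplier ~ {adjust:.3f}, first value -> {shifted_test[0, 0]:.3f}")

# Expected output:
# rank     0: multiplier ~ 1.000, first value -> 1.000
# rank     1: multiplier ~ 1.003, first value -> 1.003
# rank    64: multiplier ~ 1.024, first value -> 1.024
# rank 24576: multiplier ~ 1.470, first value -> 1.470

At rank 24,576 the multiplier works out to roughly 1.47, which matches the worked example in the comment added by patch 107 ("Added comment for sample_shift.").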