diff --git a/configs/README.md b/configs/README.md
index 8d3c5ac2..e1cf8390 100644
--- a/configs/README.md
+++ b/configs/README.md
@@ -104,7 +104,7 @@ Configs have the three highest parameter keys:
| `data`:`format` | `pandas` | `pandas`, `numpy`, `cudf` | Data format to use in benchmark. |
| `data`:`order` | `F` | `C`, `F` | Data order to use in benchmark: contiguous(C) or Fortran. |
| `data`:`dtype` | `float64` | | Data type to use in benchmark. |
-| `data`:`distributed_split` | None | None, `rank_based` | Split type used to distribute data between machines in distributed algorithm. `None` type means usage of all data without split on all machines. `rank_based` type splits the data equally between machines with split sequence based on rank id from MPI. |
+| `data`:`distributed_split` | None | None, `rank_based`, `sample_shift` | Split type used to distribute data between machines in a distributed algorithm. `None` means all machines use the full dataset without splitting. `rank_based` splits the data equally between machines, with the split sequence based on the rank id from MPI. `sample_shift` shifts each data point on each rank by `sqrt(rank id) * 0.003 + 1` (see the sketch after this table). |
|**Algorithm parameters**||||
| `algorithm`:`library` | None | | Python module containing measured entity (class or function). |
| `algorithm`:`device` | `default` | `default`, `cpu`, `gpu` | Device selected for computation. |
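
A minimal sketch of the `sample_shift` rule documented above, assuming the shift is applied multiplicatively to every value held by a rank (the `shift_per_rank` helper below is illustrative, not part of the benchmark API):

```python
import math

import numpy as np


def shift_per_rank(x: np.ndarray, rank: int) -> np.ndarray:
    # Illustrative only: scale every sample on this MPI rank by
    # sqrt(rank id) * 0.003 + 1, so each rank works on a slightly
    # shifted copy of the same generated dataset instead of a split.
    return x * (math.sqrt(rank) * 0.003 + 1)


# Rank 0 keeps the data unchanged (factor 1.0); rank 4 scales it by 1.006.
local_x = shift_per_rank(np.ones((4, 2)), rank=4)
```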
diff --git a/configs/common/sklearn.json b/configs/common/sklearn.json
index d7b13188..43051093 100644
--- a/configs/common/sklearn.json
+++ b/configs/common/sklearn.json
@@ -12,6 +12,11 @@
{ "library": "sklearnex", "device": ["cpu", "gpu"] }
]
},
+ "sklearn-ex[gpu] implementations": {
+ "algorithm": [
+ { "library": "sklearnex", "device": ["gpu"] }
+ ]
+ },
"sklearn-ex[preview] implementations": {
"algorithm": [
{ "library": "sklearn", "device": "cpu" },
diff --git a/configs/regular/batch_for_online.json b/configs/regular/batch_for_online.json
new file mode 100644
index 00000000..973c4ed4
--- /dev/null
+++ b/configs/regular/batch_for_online.json
@@ -0,0 +1,85 @@
+{
+ "INCLUDE": ["../common/sklearn.json"],
+ "PARAMETERS_SETS": {
+ "common": {"bench": {"n_runs": 10}},
+ "basic_statistics data": {
+ "data": {
+ "source": "make_blobs",
+ "generation_kwargs": {
+ "centers": 1,
+ "n_samples": 12000000,
+ "n_features": [10, 100]
+ },
+ "split_kwargs": {"ignore": true}
+ }
+ },
+ "linear_regression data": {
+ "data": {
+ "source": "make_regression",
+ "split_kwargs": {"train_size": 0.2, "test_size": 0.8},
+ "generation_kwargs": {
+ "n_samples": 12000000,
+ "n_features": [10, 100],
+ "n_informative": 5,
+ "noise": 2.0
+ }
+ }
+ },
+ "pca data": {
+ "data": {
+ "source": "make_blobs",
+ "generation_kwargs": {
+ "centers": 1,
+ "n_samples": 12000000,
+ "n_features": [10, 100]
+ },
+ "split_kwargs": {"ignore": true}
+ }
+ },
+ "basic_statistics": {
+ "algorithm": [
+ {
+ "estimator": "BasicStatistics",
+ "library": "sklearnex.basic_statistics",
+ "estimator_methods": {"training": "fit"}
+ }
+ ]
+ },
+ "covariance": {
+ "algorithm": [
+ {
+ "estimator": "EmpiricalCovariance",
+ "library": "sklearnex.preview.covariance",
+ "estimator_methods": {"training": "fit"}
+ }
+ ]
+ },
+ "linear_regression": {
+ "algorithm": [
+ {
+ "estimator": "LinearRegression",
+ "library": "sklearnex.linear_model",
+ "estimator_methods": {"training": "fit"}
+ }
+ ]
+ },
+ "pca": {
+ "algorithm": [
+ {
+ "estimator": "PCA",
+ "library": "sklearnex.decomposition",
+ "estimator_methods": {"training": "fit"}
+ }
+ ]
+ }
+ },
+ "TEMPLATES": {
+ "basic_statistics": {"SETS": ["common", "basic_statistics", "basic_statistics data", "sklearn-ex[gpu] implementations"]},
+ "covariance": {"SETS": ["common", "basic_statistics data", "sklearn-ex[gpu] implementations", "covariance"]},
+ "linear_regression": {
+ "SETS": ["common", "linear_regression", "linear_regression data", "sklearn-ex[gpu] implementations"]
+ },
+ "pca": {"SETS": ["common", "pca", "pca data", "sklearn-ex[gpu] implementations"]}
+ }
+}
+
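
For readers new to these configs: `batch_for_online.json` above builds each TEMPLATE by merging the named `PARAMETERS_SETS` entries into a single benchmark case. A rough sketch of that resolution, assuming a plain recursive dict merge (the real framework additionally expands list-valued parameters such as `"n_features": [10, 100]` into separate cases):

```python
from functools import reduce


def deep_merge(base: dict, extra: dict) -> dict:
    # Later sets in the SETS list override or extend earlier ones, key by key.
    merged = dict(base)
    for key, value in extra.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_merge(merged[key], value)
        else:
            merged[key] = value
    return merged


parameters_sets = {
    "common": {"bench": {"n_runs": 10}},
    "pca": {"algorithm": {"estimator": "PCA", "library": "sklearnex.decomposition"}},
    "pca data": {"data": {"source": "make_blobs"}},
}
case = reduce(deep_merge, (parameters_sets[s] for s in ["common", "pca", "pca data"]))
# -> {"bench": {...}, "algorithm": {...}, "data": {...}}
```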
diff --git a/configs/regular/bf16/basic_statistics.json b/configs/regular/bf16/basic_statistics.json
new file mode 100644
index 00000000..671521ab
--- /dev/null
+++ b/configs/regular/bf16/basic_statistics.json
@@ -0,0 +1,27 @@
+{
+ "INCLUDE": ["../../common/sklearn.json"],
+ "PARAMETERS_SETS": {
+ "basic stats parameters": {
+ "algorithm": {
+ "estimator": "BasicStatistics"
+ },
+ "data": {
+ "dtype": ["float32"]
+ }
+ },
+ "synthetic data": {
+ "data": [
+ { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }
+ ]
+ }
+ },
+ "TEMPLATES": {
+ "basic_statistics": {
+ "SETS": [
+ "sklearn-ex[gpu] implementations",
+ "basic stats parameters",
+ "synthetic data"
+ ]
+ }
+ }
+}
diff --git a/configs/regular/bf16/covariance.json b/configs/regular/bf16/covariance.json
new file mode 100644
index 00000000..1cd6ef4a
--- /dev/null
+++ b/configs/regular/bf16/covariance.json
@@ -0,0 +1,28 @@
+{
+ "INCLUDE": ["../../common/sklearn.json"],
+ "PARAMETERS_SETS": {
+ "covariance parameters": {
+ "algorithm": {
+ "estimator": "EmpiricalCovariance",
+ "library": "sklearnex.preview.covariance"
+ },
+ "data": {
+ "dtype": ["float32"]
+ }
+ },
+ "synthetic data": {
+ "data": [
+ { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }
+ ]
+ }
+ },
+ "TEMPLATES": {
+ "covariance": {
+ "SETS": [
+ "sklearn-ex[gpu] implementations",
+ "covariance parameters",
+ "synthetic data"
+ ]
+ }
+ }
+}
diff --git a/configs/regular/bf16/dbscan.json b/configs/regular/bf16/dbscan.json
new file mode 100644
index 00000000..b91120e8
--- /dev/null
+++ b/configs/regular/bf16/dbscan.json
@@ -0,0 +1,41 @@
+{
+ "INCLUDE": ["../../common/sklearn.json"],
+ "PARAMETERS_SETS": {
+ "common dbscan parameters": {
+ "algorithm": {
+ "estimator": "DBSCAN",
+ "estimator_params": {
+ "eps": "[SPECIAL_VALUE]distances_quantile:0.01",
+ "min_samples": 5,
+ "metric": "euclidean"
+ }
+ },
+ "data": {
+ "dtype": ["float32"]
+ }
+ },
+ "sklearn dbscan parameters": {
+ "algorithm": {
+ "estimator_params": {
+ "algorithm": "brute",
+ "n_jobs": "[SPECIAL_VALUE]physical_cpus"
+ }
+ }
+ },
+ "synthetic dataset": {
+ "data": [
+ { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 10, "centers": 10 }, "algorithm": { "eps": 5, "min_samples": 5 } }
+ ]
+ }
+ },
+ "TEMPLATES": {
+ "sklearn dbscan": {
+ "SETS": [
+ "sklearn-ex[gpu] implementations",
+ "common dbscan parameters",
+ "sklearn dbscan parameters",
+ "synthetic dataset"
+ ]
+ }
+ }
+}
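
`"eps": "[SPECIAL_VALUE]distances_quantile:0.01"` asks the framework to derive `eps` from the data instead of hard-coding a radius. A hedged sketch of one way such a value can be computed, assuming `eps` is taken as the requested quantile of pairwise training distances on a subsample (the helper is illustrative, not the framework's implementation):

```python
import numpy as np
from sklearn.metrics import pairwise_distances


def eps_from_distances_quantile(x, quantile, subsample=1000, seed=42):
    # Illustrative: estimate the requested quantile of pairwise euclidean
    # distances on a random subsample and use it as the DBSCAN eps radius.
    rng = np.random.default_rng(seed)
    idx = rng.choice(x.shape[0], size=min(subsample, x.shape[0]), replace=False)
    dists = pairwise_distances(x[idx])
    upper = dists[np.triu_indices_from(dists, k=1)]  # unique pairs only
    return float(np.quantile(upper, quantile))
```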
diff --git a/configs/regular/bf16/forest.json b/configs/regular/bf16/forest.json
new file mode 100644
index 00000000..845b73a2
--- /dev/null
+++ b/configs/regular/bf16/forest.json
@@ -0,0 +1,34 @@
+{
+ "INCLUDE": ["../../common/sklearn.json"],
+ "PARAMETERS_SETS": {
+ "common forest params": {
+ "data": {
+ "dtype": ["float32"]
+ }
+ },
+ "forest classifier params": {
+ "algorithm": {"estimator": "RandomForestClassifier"},
+ "data": { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 501000, "n_features": 10, "n_classes": 2 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } } }
+ },
+ "forest regression params": {
+ "algorithm": {"estimator": "RandomForestRegressor"},
+ "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 501000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } }}
+ }
+ },
+ "TEMPLATES": {
+ "forest cls": {
+ "SETS": [
+ "sklearn-ex[gpu] implementations",
+ "common forest params",
+ "forest classifier params"
+ ]
+ },
+ "forest reg": {
+ "SETS": [
+ "sklearn-ex[gpu] implementations",
+ "common forest params",
+ "forest regression params"
+ ]
+ }
+ }
+}
diff --git a/configs/regular/bf16/kmeans.json b/configs/regular/bf16/kmeans.json
new file mode 100644
index 00000000..8a5323c5
--- /dev/null
+++ b/configs/regular/bf16/kmeans.json
@@ -0,0 +1,40 @@
+{
+ "INCLUDE": ["../../common/sklearn.json"],
+ "PARAMETERS_SETS": {
+ "common kmeans parameters": {
+ "algorithm": {
+ "estimator": "KMeans",
+ "estimator_params": {
+ "n_clusters": "[SPECIAL_VALUE]auto",
+ "n_init": 1,
+ "max_iter": 30,
+ "tol": 1e-3,
+ "random_state": 42
+ },
+ "estimator_methods": { "inference": "predict" }
+ },
+ "data": {
+ "dtype": ["float32"],
+ "preprocessing_kwargs": { "normalize": true }
+ }
+ },
+ "sklearn kmeans parameters": {
+ "algorithm": { "estimator_params": { "init": "k-means++", "algorithm": "lloyd" } }
+ },
+ "synthetic data": {
+ "data": [
+ { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 100, "centers": 100 }, "algorithm": { "n_clusters": 100, "max_iter": 100 } }
+ ]
+ }
+ },
+ "TEMPLATES": {
+ "sklearn kmeans": {
+ "SETS": [
+ "sklearn-ex[gpu] implementations",
+ "common kmeans parameters",
+ "sklearn kmeans parameters",
+ "synthetic data"
+ ]
+ }
+ }
+}
diff --git a/configs/regular/bf16/knn.json b/configs/regular/bf16/knn.json
new file mode 100644
index 00000000..fabf6d6d
--- /dev/null
+++ b/configs/regular/bf16/knn.json
@@ -0,0 +1,56 @@
+{
+ "INCLUDE": ["../../common/sklearn.json"],
+ "PARAMETERS_SETS": {
+ "common knn parameters": {
+ "algorithm": {
+ "estimator_params": {
+ "n_neighbors": [10, 100],
+ "weights": "uniform"
+ }
+ },
+ "data": {
+ "dtype": ["float32"],
+ "preprocessing_kwargs": { "normalize": true }
+ }
+ },
+ "sklearn knn parameters": {
+ "algorithm": { "estimator_params": { "n_jobs": "[SPECIAL_VALUE]physical_cpus" } }
+ },
+ "synthetic classification data": {
+ "algorithm": {
+ "estimator": "KNeighborsClassifier",
+ "estimator_params": { "algorithm": "brute", "metric": "minkowski", "p": [1, 2] }
+ },
+ "data": [
+ { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }
+ ]
+ },
+ "synthetic regression data": {
+ "algorithm": {
+ "estimator": "KNeighborsRegressor",
+ "estimator_params": { "algorithm": "brute", "metric": "minkowski", "p": [1, 2] }
+ },
+ "data": [
+ { "source": "make_regression", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 100, "noise":1.5 } }
+ ]
+ }
+ },
+ "TEMPLATES": {
+ "sklearn brute knn clsf": {
+ "SETS": [
+ "sklearn-ex[gpu] implementations",
+ "common knn parameters",
+ "sklearn knn parameters",
+ "synthetic classification data"
+ ]
+ },
+ "sklearn brute knn reg": {
+ "SETS": [
+ "sklearn-ex[gpu] implementations",
+ "common knn parameters",
+ "sklearn knn parameters",
+ "synthetic regression data"
+ ]
+ }
+ }
+}
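
`"n_informative": "[SPECIAL_VALUE]0.5"` passes a fraction rather than an absolute count. A small sketch of how such a fractional special value can be resolved before calling `make_classification`, assuming the fraction is interpreted relative to `n_features`:

```python
from sklearn.datasets import make_classification

generation_kwargs = {"n_samples": 10_000, "n_features": 100, "n_classes": 2}

# Assumed resolution of "[SPECIAL_VALUE]0.5": half of the features are
# informative, i.e. n_informative = int(0.5 * n_features) = 50.
generation_kwargs["n_informative"] = int(0.5 * generation_kwargs["n_features"])

x, y = make_classification(**generation_kwargs, random_state=42)
```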
diff --git a/configs/regular/bf16/linear_model.json b/configs/regular/bf16/linear_model.json
new file mode 100644
index 00000000..23aa49c0
--- /dev/null
+++ b/configs/regular/bf16/linear_model.json
@@ -0,0 +1,33 @@
+{
+ "INCLUDE": ["../../common/sklearn.json"],
+ "PARAMETERS_SETS": {
+ "synthetic data": {
+ "data": [
+ { "source": "make_regression", "generation_kwargs": { "n_samples": 3005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 3000000, "test_size": 5000 } }
+ ]
+ },
+ "common linear parameters": {
+ "algorithm": {
+ "estimator": "LinearRegression",
+ "estimator_params": { "fit_intercept": true, "copy_X": true }
+ },
+ "data": {
+ "dtype": ["float32"],
+ "order": "C"
+ }
+ },
+ "sklearn linear parameters": {
+ "estimator_params": { "n_jobs": "[SPECIAL_VALUE]physical_cpus" }
+ }
+ },
+ "TEMPLATES": {
+ "sklearn linear": {
+ "SETS": [
+ "sklearn-ex[gpu] implementations",
+ "common linear parameters",
+ "sklearn linear parameters",
+ "synthetic data"
+ ]
+ }
+ }
+}
diff --git a/configs/regular/bf16/logreg.json b/configs/regular/bf16/logreg.json
new file mode 100644
index 00000000..863d67f9
--- /dev/null
+++ b/configs/regular/bf16/logreg.json
@@ -0,0 +1,45 @@
+{
+ "INCLUDE": ["../../common/sklearn.json"],
+ "PARAMETERS_SETS": {
+ "common logreg parameters": {
+ "algorithm": {
+ "estimator": "LogisticRegression",
+ "estimator_methods": { "inference": "predict" },
+ "estimator_params": {
+ "penalty": "l2",
+ "tol": 1e-4,
+ "C": 1.0,
+ "l1_ratio": null,
+ "max_iter": 20
+ }
+ },
+ "data": {
+ "dtype": ["float32"]
+ }
+ },
+ "sklearn logreg parameters": {
+ "algorithm": {
+ "estimator_params": {
+ "solver": "newton-cg",
+ "n_jobs": "[SPECIAL_VALUE]physical_cpus",
+ "random_state": 42
+ }
+ }
+ },
+ "synthetic data": {
+ "data": [
+ { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 10, "n_classes": 2 } }
+ ]
+ }
+ },
+ "TEMPLATES": {
+ "sklearn logreg": {
+ "SETS": [
+ "sklearn-ex[gpu] implementations",
+ "common logreg parameters",
+ "sklearn logreg parameters",
+ "synthetic data"
+ ]
+ }
+ }
+}
diff --git a/configs/regular/bf16/pca.json b/configs/regular/bf16/pca.json
new file mode 100644
index 00000000..e5113261
--- /dev/null
+++ b/configs/regular/bf16/pca.json
@@ -0,0 +1,36 @@
+{
+ "INCLUDE": ["../../common/sklearn.json"],
+ "PARAMETERS_SETS": {
+ "pca parameters": {
+ "algorithm": {
+ "estimator": "PCA",
+ "estimator_params": {
+ "n_components": 3,
+ "copy": true,
+ "whiten": false,
+ "svd_solver": "covariance_eigh",
+ "tol": 0.0,
+ "iterated_power": 15,
+ "random_state": 42
+ }
+ },
+ "data": {
+ "dtype": ["float32"]
+ }
+ },
+ "synthetic data": {
+ "data": [
+ { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }
+ ]
+ }
+ },
+ "TEMPLATES": {
+ "sklearn pca": {
+ "SETS": [
+ "sklearn-ex[gpu] implementations",
+ "pca parameters",
+ "synthetic data"
+ ]
+ }
+ }
+}
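
`"svd_solver": "covariance_eigh"` (available in newer scikit-learn releases) fits PCA through an eigendecomposition of the feature covariance matrix rather than an SVD of the centered data, which pays off when `n_samples >> n_features` as in the 10M x 10 dataset above. A condensed sketch of the idea, not the library implementation:

```python
import numpy as np

rng = np.random.default_rng(42)
x = rng.normal(size=(10_000, 10))

# Covariance-based PCA: eigendecompose the small (n_features x n_features)
# covariance matrix instead of running an SVD over all 10k rows.
x_centered = x - x.mean(axis=0)
cov = x_centered.T @ x_centered / (x.shape[0] - 1)
eigvals, eigvecs = np.linalg.eigh(cov)

# eigh returns eigenvalues in ascending order; keep the top 3 components.
components = eigvecs[:, ::-1][:, :3].T
explained_variance = eigvals[::-1][:3]
```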
diff --git a/configs/regular/dbscan.json b/configs/regular/dbscan.json
index 71dcdc9b..711c15cd 100644
--- a/configs/regular/dbscan.json
+++ b/configs/regular/dbscan.json
@@ -58,19 +58,11 @@
"TEMPLATES": {
"sklearn dbscan": {
"SETS": [
- "sklearn-ex[cpu,gpu] implementations",
+ "sklearn-ex[gpu] implementations",
"common dbscan parameters",
"sklearn dbscan parameters",
"dbscan datasets"
]
- },
- "cuml dbscan": {
- "SETS": [
- "cuml implementation",
- "common dbscan parameters",
- "cuml dbscan parameters",
- "dbscan datasets"
- ]
}
}
}
diff --git a/configs/regular/ensemble.json b/configs/regular/ensemble.json
index 56e37e77..f01c1383 100644
--- a/configs/regular/ensemble.json
+++ b/configs/regular/ensemble.json
@@ -90,7 +90,7 @@
"TEMPLATES": {
"sklearn ensemble classification": {
"SETS": [
- "sklearn-ex[cpu,gpu] implementations",
+ "sklearn-ex[gpu] implementations",
"common ensemble params",
"sklearn ensemble classifier params",
"ensemble classification data"
@@ -98,27 +98,11 @@
},
"sklearn ensemble regression": {
"SETS": [
- "sklearn-ex[cpu,gpu] implementations",
+ "sklearn-ex[gpu] implementations",
"common ensemble params",
"sklearn ensemble regressor params",
"ensemble regression data"
]
- },
- "cuml ensemble classification": {
- "SETS": [
- "cuml implementation",
- "common ensemble params",
- "cuml ensemble classifier params",
- "ensemble classification data"
- ]
- },
- "cuml ensemble regression": {
- "SETS": [
- "cuml implementation",
- "common ensemble params",
- "cuml ensemble regressor params",
- "ensemble regression data"
- ]
}
}
}
diff --git a/configs/regular/kmeans.json b/configs/regular/kmeans.json
index bcb7026f..756e2bab 100644
--- a/configs/regular/kmeans.json
+++ b/configs/regular/kmeans.json
@@ -70,19 +70,11 @@
"TEMPLATES": {
"sklearn kmeans": {
"SETS": [
- "sklearn-ex[cpu,gpu] implementations",
+ "sklearn-ex[gpu] implementations",
"common kmeans parameters",
"sklearn kmeans parameters",
"kmeans datasets"
]
- },
- "cuml kmeans": {
- "SETS": [
- "cuml implementation",
- "common kmeans parameters",
- "cuml kmeans parameters",
- "kmeans datasets"
- ]
}
}
}
diff --git a/configs/regular/knn.json b/configs/regular/knn.json
index e1cd8a75..a69c6864 100644
--- a/configs/regular/knn.json
+++ b/configs/regular/knn.json
@@ -74,47 +74,17 @@
"TEMPLATES": {
"sklearn brute knn clsf": {
"SETS": [
- "sklearn-ex[cpu,gpu] implementations",
+ "sklearn-ex[gpu] implementations",
"common knn parameters",
"sklearn knn parameters",
"brute knn algorithm - classification data"
]
},
- "sklearn kd_tree knn clsf": {
- "SETS": [
- "sklearn-ex[cpu] implementations",
- "common knn parameters",
- "sklearn knn parameters",
- "kd_tree knn algorithm - classification data"
- ]
- },
"sklearn brute knn regr": {
"SETS": [
- "sklearn-ex[cpu,gpu] implementations",
- "common knn parameters",
- "sklearn knn parameters",
- "brute knn algorithm - regression data"
- ]
- },
- "sklearn kd_tree knn regr": {
- "SETS": [
- "sklearn-ex[cpu] implementations",
+ "sklearn-ex[gpu] implementations",
"common knn parameters",
"sklearn knn parameters",
- "kd_tree knn algorithm - regression data"
- ]
- },
- "cuml brute knn clsf": {
- "SETS": [
- "cuml implementation",
- "common knn parameters",
- "brute knn algorithm - classification data"
- ]
- },
- "cuml brute knn regr": {
- "SETS": [
- "cuml implementation",
- "common knn parameters",
"brute knn algorithm - regression data"
]
}
diff --git a/configs/regular/linear_model.json b/configs/regular/linear_model.json
index eb1b79ba..3040c82d 100644
--- a/configs/regular/linear_model.json
+++ b/configs/regular/linear_model.json
@@ -85,34 +85,12 @@
"TEMPLATES": {
"sklearn linear": {
"SETS": [
- "sklearn-ex[cpu,gpu] implementations",
+ "sklearn-ex[gpu] implementations",
"common linear parameters",
"sklearn linear parameters",
"regression datasets"
]
},
- "sklearn ridge": {
- "SETS": [
- "sklearn-ex[cpu] implementations",
- "common ridge parameters",
- "sklearn ridge parameters",
- "regression datasets"
- ]
- },
- "sklearn lasso": {
- "SETS": [
- "sklearn-ex[cpu] implementations",
- "common lasso parameters",
- "regression datasets"
- ]
- },
- "sklearn elasticnet": {
- "SETS": [
- "sklearn-ex[cpu] implementations",
- "common elasticnet parameters",
- "regression datasets"
- ]
- },
"cuml linear": {
"SETS": [
"cuml implementation",
@@ -120,30 +98,6 @@
"cuml L2 parameters",
"regression datasets"
]
- },
- "cuml ridge": {
- "SETS": [
- "cuml implementation",
- "common ridge parameters",
- "cuml L2 parameters",
- "regression datasets"
- ]
- },
- "cuml lasso": {
- "SETS": [
- "cuml implementation",
- "common lasso parameters",
- "cuml L1 parameters",
- "regression datasets"
- ]
- },
- "cuml elasticnet": {
- "SETS": [
- "cuml implementation",
- "common elasticnet parameters",
- "cuml L1 parameters",
- "regression datasets"
- ]
}
}
}
diff --git a/configs/regular/logreg.json b/configs/regular/logreg.json
index a94a7fcf..a8323b02 100644
--- a/configs/regular/logreg.json
+++ b/configs/regular/logreg.json
@@ -54,19 +54,11 @@
"TEMPLATES": {
"sklearn logreg": {
"SETS": [
- "sklearn-ex[cpu,gpu] implementations",
+ "sklearn-ex[gpu] implementations",
"common logreg parameters",
"sklearn logreg parameters",
"logreg datasets"
]
- },
- "cuml logreg": {
- "SETS": [
- "cuml implementation",
- "common logreg parameters",
- "cuml logreg parameters",
- "logreg datasets"
- ]
}
}
}
diff --git a/configs/regular/pca.json b/configs/regular/pca.json
index 582acc9e..e26d3f44 100644
--- a/configs/regular/pca.json
+++ b/configs/regular/pca.json
@@ -46,14 +46,7 @@
"TEMPLATES": {
"sklearn pca": {
"SETS": [
- "sklearn-ex[cpu,gpu] implementations",
- "pca parameters",
- "pca datasets"
- ]
- },
- "cuml pca": {
- "SETS": [
- "cuml implementation",
+ "sklearn-ex[gpu] implementations",
"pca parameters",
"pca datasets"
]
diff --git a/configs/spmd/large_scale/basic_stats.json b/configs/spmd/large_scale/basic_stats.json
new file mode 100644
index 00000000..f8f44e4e
--- /dev/null
+++ b/configs/spmd/large_scale/basic_stats.json
@@ -0,0 +1,31 @@
+{
+ "INCLUDE": ["../../common/sklearn.json", "large_scale.json"],
+ "PARAMETERS_SETS": {
+ "spmd basicstats parameters": {
+ "algorithm": {
+ "estimator": "BasicStatistics",
+ "estimator_methods": { "training": "fit" },
+ "sklearnex_context": { "use_raw_input": true }
+ },
+ "data": {
+ "split_kwargs": { "test_size": 0.0001 }
+ }
+ },
+ "synthetic data": {
+ "data": [
+ { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000000, "n_features": 10, "centers": 1 } },
+ { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 1000, "centers": 1 } }
+ ]
+ }
+ },
+ "TEMPLATES": {
+ "basicstats": {
+ "SETS": [
+ "sklearnex spmd implementation",
+ "large scale 2k parameters",
+ "synthetic data",
+ "spmd basicstats parameters"
+ ]
+ }
+ }
+}
diff --git a/configs/spmd/large_scale/basic_stats_strong.json b/configs/spmd/large_scale/basic_stats_strong.json
new file mode 100644
index 00000000..0c7c671e
--- /dev/null
+++ b/configs/spmd/large_scale/basic_stats_strong.json
@@ -0,0 +1,30 @@
+{
+ "INCLUDE": ["../../common/sklearn.json", "large_scale.json"],
+ "PARAMETERS_SETS": {
+ "spmd basicstats parameters": {
+ "algorithm": {
+ "estimator": "BasicStatistics",
+ "estimator_methods": { "training": "fit" },
+ "sklearnex_context": { "use_raw_input": true }
+ },
+ "data": {
+ "split_kwargs": { "test_size": 0.0001 }
+ }
+ },
+ "synthetic data": {
+ "data": [
+ { "source": "make_blobs", "generation_kwargs": { "n_samples": 25000000, "n_features": 100, "centers": 1 } }
+ ]
+ }
+ },
+ "TEMPLATES": {
+ "basicstats": {
+ "SETS": [
+ "sklearnex spmd implementation",
+ "large scale strong <=64 parameters",
+ "synthetic data",
+ "spmd basicstats parameters"
+ ]
+ }
+ }
+}
diff --git a/configs/spmd/large_scale/covariance.json b/configs/spmd/large_scale/covariance.json
new file mode 100644
index 00000000..7f4d6d7d
--- /dev/null
+++ b/configs/spmd/large_scale/covariance.json
@@ -0,0 +1,31 @@
+{
+ "INCLUDE": ["../../common/sklearn.json", "large_scale.json"],
+ "PARAMETERS_SETS": {
+ "spmd basicstats parameters": {
+ "algorithm": {
+ "estimator": "EmpiricalCovariance",
+ "estimator_methods": { "training": "fit" },
+ "sklearnex_context": { "use_raw_input": true }
+ },
+ "data": {
+ "split_kwargs": { "test_size": 0.0001 }
+ }
+ },
+ "synthetic data": {
+ "data": [
+ { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000000, "n_features": 10, "centers": 1 } },
+ { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 1000, "centers": 1 } }
+ ]
+ }
+ },
+ "TEMPLATES": {
+ "covariance": {
+ "SETS": [
+ "sklearnex spmd implementation",
+ "large scale 2k parameters",
+ "synthetic data",
+ "spmd basicstats parameters"
+ ]
+ }
+ }
+}
diff --git a/configs/spmd/large_scale/covariance_strong.json b/configs/spmd/large_scale/covariance_strong.json
new file mode 100644
index 00000000..8e388801
--- /dev/null
+++ b/configs/spmd/large_scale/covariance_strong.json
@@ -0,0 +1,30 @@
+{
+ "INCLUDE": ["../../common/sklearn.json", "large_scale.json"],
+ "PARAMETERS_SETS": {
+ "spmd basicstats parameters": {
+ "algorithm": {
+ "estimator": "EmpiricalCovariance",
+ "estimator_methods": { "training": "fit" },
+ "sklearnex_context": { "use_raw_input": true }
+ },
+ "data": {
+ "split_kwargs": { "test_size": 0.0001 }
+ }
+ },
+ "synthetic data": {
+ "data": [
+ { "source": "make_blobs", "generation_kwargs": { "n_samples": 25000000, "n_features": 100, "centers": 1 } }
+ ]
+ }
+ },
+ "TEMPLATES": {
+ "covariance": {
+ "SETS": [
+ "sklearnex spmd implementation",
+ "large scale strong <=64 parameters",
+ "synthetic data",
+ "spmd basicstats parameters"
+ ]
+ }
+ }
+}
diff --git a/configs/spmd/large_scale/dbscan.json b/configs/spmd/large_scale/dbscan.json
new file mode 100644
index 00000000..bf60b7cc
--- /dev/null
+++ b/configs/spmd/large_scale/dbscan.json
@@ -0,0 +1,36 @@
+{
+ "INCLUDE": ["../../common/sklearn.json", "../../regular/dbscan.json", "large_scale.json"],
+ "PARAMETERS_SETS": {
+ "spmd dbscan parameters": {
+ "algorithm": {
+ "estimator": "DBSCAN",
+ "estimator_methods": {
+ "training": "fit"
+ },
+ "estimator_params" : {
+ "eps": 10, "min_samples": 5
+ },
+ "sklearnex_context": { "use_raw_input": true }
+ },
+ "data": {
+ "dtype": "float64"
+ }
+ },
+ "synthetic dataset": {
+ "data": [
+ { "source": "make_blobs", "generation_kwargs": { "n_samples": 40000, "n_features": 100, "centers": 10 } }
+ ]
+ }
+ },
+ "TEMPLATES": {
+ "dbscan": {
+ "SETS": [
+ "common dbscan parameters",
+ "synthetic dataset",
+ "sklearnex spmd implementation",
+ "large scale <=64 parameters",
+ "spmd dbscan parameters"
+ ]
+ }
+ }
+}
diff --git a/configs/spmd/large_scale/dbscan_strong.json b/configs/spmd/large_scale/dbscan_strong.json
new file mode 100644
index 00000000..5e7ab322
--- /dev/null
+++ b/configs/spmd/large_scale/dbscan_strong.json
@@ -0,0 +1,36 @@
+{
+ "INCLUDE": ["../../common/sklearn.json", "../../regular/dbscan.json", "large_scale.json"],
+ "PARAMETERS_SETS": {
+ "spmd dbscan parameters": {
+ "algorithm": {
+ "estimator": "DBSCAN",
+ "estimator_methods": {
+ "training": "fit"
+ },
+ "estimator_params" : {
+ "eps": 15, "min_samples": 50
+ },
+ "sklearnex_context": { "use_raw_input": true }
+ },
+ "data": {
+ "dtype": "float64"
+ }
+ },
+ "synthetic dataset": {
+ "data": [
+ { "source": "make_blobs", "generation_kwargs": { "n_samples": 4000000, "n_features": 100, "centers": 10 } }
+ ]
+ }
+ },
+ "TEMPLATES": {
+ "dbscan": {
+ "SETS": [
+ "common dbscan parameters",
+ "synthetic dataset",
+ "sklearnex spmd implementation",
+ "large scale strong <=64 parameters",
+ "spmd dbscan parameters"
+ ]
+ }
+ }
+}
diff --git a/configs/spmd/large_scale/forest_max_samples.json b/configs/spmd/large_scale/forest_max_samples.json
new file mode 100644
index 00000000..95affb16
--- /dev/null
+++ b/configs/spmd/large_scale/forest_max_samples.json
@@ -0,0 +1,28 @@
+{
+ "INCLUDE": ["../../common/sklearn.json", "large_scale.json"],
+ "PARAMETERS_SETS": {
+ "spmd forest classification parameters": {
+ "algorithm": {
+ "estimator": "RandomForestClassifier",
+ "estimator_methods": { "training": "fit" },
+ "estimator_params": { "n_estimators": 20, "max_depth": 10 },
+ "sklearnex_context": { "use_raw_input": true }
+ }
+ },
+ "synthetic data": {
+ "data": [
+ { "source": "make_classification", "split_kwargs": { "train_size": 1000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 1001000, "n_features": 100, "n_classes": 2 } }
+ ]
+ }
+ },
+ "TEMPLATES": {
+ "forestCls": {
+ "SETS": [
+ "sklearnex spmd implementation",
+ "large scale 32 parameters",
+ "synthetic data",
+ "spmd forest classification parameters"
+ ]
+ }
+ }
+}
diff --git a/configs/spmd/large_scale/forest_no_max_samples.json b/configs/spmd/large_scale/forest_no_max_samples.json
new file mode 100644
index 00000000..c371371b
--- /dev/null
+++ b/configs/spmd/large_scale/forest_no_max_samples.json
@@ -0,0 +1,27 @@
+{
+ "INCLUDE": ["../../common/sklearn.json", "large_scale.json"],
+ "PARAMETERS_SETS": {
+ "spmd forest classification parameters": {
+ "algorithm": {
+ "estimator": "RandomForestClassifier",
+ "estimator_params": { "n_estimators": 100, "max_depth": 7 },
+ "sklearnex_context": { "use_raw_input": true }
+ }
+ },
+ "synthetic data": {
+ "data": [
+ { "source": "make_classification", "split_kwargs": { "train_size": 1000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 1001000, "n_features": 100, "n_classes": 2 }, "n_informative": "[SPECIAL_VALUE]0.5" }
+ ]
+ }
+ },
+ "TEMPLATES": {
+ "forestCls": {
+ "SETS": [
+ "sklearnex spmd implementation",
+ "large scale 2k parameters",
+ "synthetic data",
+ "spmd forest classification parameters"
+ ]
+ }
+ }
+}
diff --git a/configs/spmd/large_scale/forest_strong.json b/configs/spmd/large_scale/forest_strong.json
new file mode 100644
index 00000000..653c70dc
--- /dev/null
+++ b/configs/spmd/large_scale/forest_strong.json
@@ -0,0 +1,28 @@
+{
+ "INCLUDE": ["../../common/sklearn.json", "large_scale.json"],
+ "PARAMETERS_SETS": {
+ "spmd forest classification parameters": {
+ "algorithm": {
+ "estimator": "RandomForestClassifier",
+ "estimator_methods": { "training": "fit" },
+ "estimator_params": { "n_estimators": 100, "max_depth": 8 },
+ "sklearnex_context": {"use_raw_input": true}
+ }
+ },
+ "synthetic data": {
+ "data": [
+ { "source": "make_classification", "split_kwargs": { "train_size": 20000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 21000, "n_features": 200, "n_classes": 2 } }
+ ]
+ }
+ },
+ "TEMPLATES": {
+ "forestCls": {
+ "SETS": [
+ "sklearnex spmd implementation",
+ "large scale strong <=64 parameters",
+ "synthetic data",
+ "spmd forest classification parameters"
+ ]
+ }
+ }
+}
diff --git a/configs/spmd/large_scale/incremental.json b/configs/spmd/large_scale/incremental.json
new file mode 100644
index 00000000..195074ee
--- /dev/null
+++ b/configs/spmd/large_scale/incremental.json
@@ -0,0 +1,77 @@
+{ "INCLUDE": [ ],
+ "PARAMETERS_SETS": {
+ "common incremental raw gpu params": {
+ "algorithm": {
+ "device": "gpu",
+ "sklearnex_context": { "use_raw_input": true }
+ },
+ "data": {
+ "format":"dpctl",
+ "order": "C"
+ }
+ },
+ "statistical batches and data": [
+ { "algorithm": { "num_batches": { "training": [1,2,6,12]} }, "data": { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000000, "n_features": 10, "centers": 1 } } },
+ { "algorithm": { "num_batches": { "training": 2} }, "data": { "source": "make_blobs", "generation_kwargs": { "n_samples": 50000000, "n_features": 10, "centers": 1 } } },
+ { "algorithm": { "num_batches": { "training": 6} }, "data": { "source": "make_blobs", "generation_kwargs": { "n_samples": 16666667, "n_features": 10, "centers": 1 } } },
+ { "algorithm": { "num_batches": { "training": 12} }, "data": { "source": "make_blobs", "generation_kwargs": { "n_samples": 8333333, "n_features": 10, "centers": 1 } } },
+ { "algorithm": { "num_batches": { "training": [1,2,6,12]} }, "data": { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 1000, "centers": 1 } } },
+ { "algorithm": { "num_batches": { "training": 2} }, "data": { "source": "make_blobs", "generation_kwargs": { "n_samples": 500000, "n_features": 1000, "centers": 1 } } },
+ { "algorithm": { "num_batches": { "training": 6} }, "data": { "source": "make_blobs", "generation_kwargs": { "n_samples": 166667, "n_features": 1000, "centers": 1 } } },
+ { "algorithm": { "num_batches": { "training": 12} }, "data": { "source": "make_blobs", "generation_kwargs": { "n_samples": 83333, "n_features": 1000, "centers": 1 } } }
+ ],
+ "regression batches and data": [
+ { "algorithm": { "num_batches": { "training": [1,2,6,12]} }, "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 100005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 100000000, "test_size": 5000 } } },
+ { "algorithm": { "num_batches": { "training": 2} }, "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 100005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 50000000, "test_size": 5000 } } },
+ { "algorithm": { "num_batches": { "training": 6} }, "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 100005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 16666667, "test_size": 5000 } } },
+ { "algorithm": { "num_batches": { "training": 12} }, "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 100005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 8333333, "test_size": 5000 } } },
+ { "algorithm": { "num_batches": { "training": [1,2,6,12]} }, "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 3005000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 3000000, "test_size": 5000 } } },
+ { "algorithm": { "num_batches": { "training": 2} }, "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 3005000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 1500000, "test_size": 5000 } } },
+ { "algorithm": { "num_batches": { "training": 6} }, "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 3005000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 500000, "test_size": 5000 } } },
+ { "algorithm": { "num_batches": { "training": 12} }, "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 3005000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 250000, "test_size": 5000 } } }
+ ],
+ "covariance": {
+ "algorithm": {
+ "estimator": "IncrementalEmpiricalCovariance",
+ "library": "sklearnex",
+ "estimator_methods": {"training": "partial_fit"}
+ },
+ "data": {
+ "split_kwargs": { "test_size": 0.0001 }
+ }
+ },
+ "basic_statistics": {
+ "algorithm": {
+ "estimator": "IncrementalBasicStatistics",
+ "library": "sklearnex",
+ "estimator_methods": {"training": "partial_fit"}
+ },
+ "data": {
+ "split_kwargs": { "test_size": 0.0001 }
+ }
+ },
+ "linear_regression": {
+ "algorithm": {
+ "estimator": "IncrementalLinearRegression",
+ "library": "sklearnex",
+ "estimator_methods": {"training": "partial_fit"}
+ }
+ },
+ "pca": {
+ "algorithm": {
+ "estimator": "IncrementalPCA",
+ "library": "sklearnex.preview",
+ "estimator_methods": {"training": "partial_fit"}
+ },
+ "data": {
+ "split_kwargs": { "test_size": 0.0001 }
+ }
+ }
+ },
+ "TEMPLATES": {
+ "basic_statistics": { "SETS": ["common incremental raw gpu params", "basic_statistics", "statistical batches and data"] },
+ "covariance": { "SETS": ["common incremental raw gpu params", "covariance", "statistical batches and data"] },
+ "linear_regression": { "SETS": ["common incremental raw gpu params", "linear_regression", "regression batches and data"] },
+ "pca": { "SETS": ["common incremental raw gpu params", "pca", "statistical batches and data"] }
+ }
+}
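
Reading the batch pairings above: the first entry of each series sweeps every batch count over a fixed dataset, while the companion entries shrink the dataset as the batch count grows, so `num_batches * n_samples` stays near the same total (about 1e8 rows for the 10-feature statistical series). A quick arithmetic check:

```python
# (num_batches, n_samples) pairs from the 10-feature statistical series:
series = [(2, 50_000_000), (6, 16_666_667), (12, 8_333_333)]
for num_batches, n_samples in series:
    rows_per_batch = n_samples // num_batches  # rows fed per partial_fit call
    print(num_batches, n_samples, num_batches * n_samples, rows_per_batch)
# num_batches * n_samples stays at ~1e8 in every pairing
```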
diff --git a/configs/spmd/large_scale/kmeans_narrow_weak.json b/configs/spmd/large_scale/kmeans_narrow_weak.json
new file mode 100644
index 00000000..69f0b6ac
--- /dev/null
+++ b/configs/spmd/large_scale/kmeans_narrow_weak.json
@@ -0,0 +1,33 @@
+{
+ "INCLUDE": ["../../common/sklearn.json", "large_scale.json"],
+ "PARAMETERS_SETS": {
+ "spmd kmeans parameters": {
+ "algorithm": {
+ "estimator": "KMeans",
+ "estimator_params": {
+ "algorithm": "lloyd",
+ "max_iter": 20,
+ "n_clusters": 10,
+ "random_state": 42
+ },
+ "estimator_methods": { "training": "fit", "inference": "predict" },
+ "sklearnex_context": { "use_raw_input": true }
+ }
+ },
+ "synthetic data": {
+ "data": [
+ { "source": "make_blobs", "generation_kwargs": { "n_samples": 2000000, "n_features": 100, "centers": 2000, "cluster_std": 3, "center_box": 100.0}}
+ ]
+ }
+ },
+ "TEMPLATES": {
+ "kmeans": {
+ "SETS": [
+ "synthetic data",
+ "sklearnex spmd implementation",
+ "large scale 2k parameters sample shift",
+ "spmd kmeans parameters"
+ ]
+ }
+ }
+}
diff --git a/configs/spmd/large_scale/kmeans_strong.json b/configs/spmd/large_scale/kmeans_strong.json
new file mode 100644
index 00000000..90a1ea3f
--- /dev/null
+++ b/configs/spmd/large_scale/kmeans_strong.json
@@ -0,0 +1,33 @@
+{
+ "INCLUDE": ["../../common/sklearn.json", "large_scale.json"],
+ "PARAMETERS_SETS": {
+ "spmd kmeans parameters": {
+ "algorithm": {
+ "estimator": "KMeans",
+ "estimator_params": {
+ "algorithm": "lloyd",
+ "max_iter": 20,
+ "n_clusters": 100
+ },
+ "estimator_methods": { "training": "fit", "inference": "predict" },
+ "sklearnex_context": { "use_raw_input": true }
+ }
+ },
+ "synthetic data": {
+ "data": [
+ { "source": "make_blobs", "generation_kwargs": { "n_samples": 25000000, "n_features": 100, "centers": 100 }}
+ ]
+ }
+ },
+ "TEMPLATES": {
+ "kmeans": {
+ "SETS": [
+ "synthetic data",
+ "sklearnex spmd implementation",
+ "large scale strong <=64 parameters",
+ "spmd kmeans parameters"
+ ]
+ }
+ }
+}
+
diff --git a/configs/spmd/large_scale/kmeans_wide_weak.json b/configs/spmd/large_scale/kmeans_wide_weak.json
new file mode 100644
index 00000000..5520f10a
--- /dev/null
+++ b/configs/spmd/large_scale/kmeans_wide_weak.json
@@ -0,0 +1,34 @@
+{
+ "INCLUDE": ["../../common/sklearn.json", "large_scale.json"],
+ "PARAMETERS_SETS": {
+ "spmd kmeans parameters": {
+ "algorithm": {
+ "estimator": "KMeans",
+ "estimator_params": {
+ "algorithm": "lloyd",
+ "max_iter": 20,
+ "n_clusters": 10,
+ "random_state": 42
+ },
+ "estimator_methods": { "training": "fit", "inference": "predict" },
+ "sklearnex_context": { "use_raw_input": true }
+ }
+ },
+ "synthetic data": {
+ "data": [
+ { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 1000, "centers": 2000}}
+ ]
+ }
+ },
+ "TEMPLATES": {
+ "kmeans": {
+ "SETS": [
+ "synthetic data",
+ "sklearnex spmd implementation",
+ "large scale 2k parameters",
+ "spmd kmeans parameters"
+ ]
+ }
+ }
+}
+
diff --git a/configs/spmd/large_scale/knn_strong.json b/configs/spmd/large_scale/knn_strong.json
new file mode 100644
index 00000000..36daf3f1
--- /dev/null
+++ b/configs/spmd/large_scale/knn_strong.json
@@ -0,0 +1,37 @@
+{
+ "INCLUDE": ["../../common/sklearn.json", "large_scale.json"],
+ "PARAMETERS_SETS": {
+ "spmd knn cls parameters": {
+ "algorithm": {
+ "estimator": "KNeighborsClassifier",
+ "estimator_params": {
+ "algorithm": "brute",
+ "metric": "minkowski",
+ "p": 2,
+ "weights": "uniform",
+ "n_neighbors": 100
+ },
+ "estimator_methods": {
+ "training": "fit",
+ "inference": "predict"
+ },
+ "sklearnex_context": { "use_raw_input": true }
+ }
+ },
+ "synthetic classification data": {
+ "data": [
+ { "source": "make_classification", "split_kwargs": { "train_size": 3000000, "test_size": 2000000 }, "generation_kwargs": { "n_samples": 5000000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }
+ ]
+ }
+ },
+ "TEMPLATES": {
+ "knn classifier": {
+ "SETS": [
+ "synthetic classification data",
+ "sklearnex spmd implementation",
+ "large scale strong <=64 parameters",
+ "spmd knn cls parameters"
+ ]
+ }
+ }
+}
diff --git a/configs/spmd/large_scale/knn_tier1.json b/configs/spmd/large_scale/knn_tier1.json
new file mode 100644
index 00000000..c230cc4e
--- /dev/null
+++ b/configs/spmd/large_scale/knn_tier1.json
@@ -0,0 +1,35 @@
+{
+ "INCLUDE": ["../../common/sklearn.json", "large_scale.json"],
+ "PARAMETERS_SETS": {
+ "spmd knn cls parameters": {
+ "algorithm": {
+ "estimator": "KNeighborsClassifier",
+ "estimator_params": {
+ "algorithm": "brute",
+ "metric": "minkowski",
+ "p": 2,
+ "weights": "uniform"
+ },
+ "estimator_methods": {
+ "training": "fit",
+ "inference": "predict"
+ },
+ "sklearnex_context": { "use_raw_input": true }
+ }
+ },
+ "synthetic classification data": [
+ { "data": { "source": "make_classification", "split_kwargs": { "train_size": 100000, "test_size": 100000}, "generation_kwargs": { "n_samples": 2000000, "n_features": 10, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, "algorithm": { "estimator_params": { "n_neighbors": 5 } } },
+ { "data": { "source": "make_classification", "split_kwargs": { "train_size": 100000, "test_size": 10000}, "generation_kwargs": { "n_samples": 2000000, "n_features": 1000, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, "algorithm": { "estimator_params": { "n_neighbors": 100 } } }
+ ]
+ },
+ "TEMPLATES": {
+ "knn classifier": {
+ "SETS": [
+ "synthetic classification data",
+ "sklearnex spmd implementation",
+ "large scale 32 parameters",
+ "spmd knn cls parameters"
+ ]
+ }
+ }
+}
diff --git a/configs/spmd/large_scale/knn_tier2.json b/configs/spmd/large_scale/knn_tier2.json
new file mode 100644
index 00000000..ff0032e2
--- /dev/null
+++ b/configs/spmd/large_scale/knn_tier2.json
@@ -0,0 +1,37 @@
+{
+ "INCLUDE": ["../../common/sklearn.json", "large_scale.json"],
+ "PARAMETERS_SETS": {
+ "spmd knn cls parameters": {
+ "algorithm": {
+ "estimator": "KNeighborsClassifier",
+ "estimator_params": {
+ "algorithm": "brute",
+ "metric": "minkowski",
+ "p": 2,
+ "weights": "uniform",
+ "n_neighbors": 5
+ },
+ "estimator_methods": {
+ "training": "fit",
+ "inference": "predict"
+ },
+ "sklearnex_context": { "use_raw_input": true }
+ }
+ },
+ "synthetic classification data": {
+ "data": [
+ { "source": "make_classification", "split_kwargs": { "train_size": 100, "test_size": 100}, "generation_kwargs": { "n_samples": 200, "n_features": 10, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }
+ ]
+ }
+ },
+ "TEMPLATES": {
+ "knn classifier": {
+ "SETS": [
+ "synthetic classification data",
+ "sklearnex spmd implementation",
+ "large scale 2k parameters",
+ "spmd knn cls parameters"
+ ]
+ }
+ }
+}
diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json
new file mode 100644
index 00000000..28626dc9
--- /dev/null
+++ b/configs/spmd/large_scale/large_scale.json
@@ -0,0 +1,85 @@
+{
+ "PARAMETERS_SETS": {
+ "large scale default parameters": {
+ "data": {
+ "dtype": "float64",
+ "distributed_split": "None"
+ },
+ "bench": {
+ "mpi_params": {"n": [1,2,6,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" }
+ }
+ },
+ "large scale strong parameters": {
+ "data": {
+ "dtype": "float64",
+ "distributed_split": "rank_based"
+ },
+ "bench": {
+ "mpi_params": {"n": [1,2,6,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" }
+ }
+ },
+ "large scale 2k parameters": {
+ "data": {
+ "dtype": "float64",
+ "distributed_split": "None"
+ },
+ "bench": {
+ "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" }
+ }
+ },
+ "large scale 2k parameters sample shift": {
+ "data": {
+ "dtype": "float64",
+ "distributed_split": "sample_shift"
+ },
+ "bench": {
+ "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" }
+ }
+ },
+ "large scale 32 parameters": {
+ "data": {
+ "dtype": "float64",
+ "distributed_split": "None"
+ },
+ "bench": {
+ "mpi_params": {"n": [1,2,6,12,24,48,96,192,384], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" }
+ }
+ },
+ "large scale <=64 parameters": {
+ "data": {
+ "dtype": "float64",
+ "distributed_split": "None"
+ },
+ "bench": {
+ "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" }
+ }
+ },
+ "large scale strong 2k parameters": {
+ "data": {
+ "dtype": "float64",
+ "distributed_split": "rank_based"
+ },
+ "bench": {
+ "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" }
+ }
+ },
+ "large scale strong <=64 parameters": {
+ "data": {
+ "dtype": "float64",
+ "distributed_split": "rank_based"
+ },
+ "bench": {
+ "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" }
+ }
+ },
+ "large scale impi parameters": {
+ "data": {
+ "dtype": "float64",
+ "distributed_split": "no"
+ },
+ "bench": {
+ "mpi_params": {"n": [1,2,4,6,9,12], "ppn": 12}
+ }
+ }
+ }
+}
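
The difference between the weak- and strong-scaling sets above is the `distributed_split` value: per the README table, `rank_based` divides one fixed dataset across ranks (strong scaling), while `None` leaves every rank with the full generated dataset (weak scaling, constant work per rank as ranks grow). A sketch of the resulting per-rank row counts; the helper name is illustrative:

```python
def rows_per_rank(total_rows, n_ranks, distributed_split):
    # "rank_based": one dataset is split evenly across ranks -> strong scaling.
    # "None": every rank keeps the full dataset -> weak scaling.
    if distributed_split == "rank_based":
        return total_rows // n_ranks
    return total_rows


assert rows_per_rank(25_000_000, 64, "rank_based") == 390_625
assert rows_per_rank(25_000_000, 64, "None") == 25_000_000
```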
diff --git a/configs/spmd/large_scale/linreg.json b/configs/spmd/large_scale/linreg.json
new file mode 100644
index 00000000..7c7fb035
--- /dev/null
+++ b/configs/spmd/large_scale/linreg.json
@@ -0,0 +1,28 @@
+{
+ "INCLUDE": ["../../common/sklearn.json", "large_scale.json"],
+ "PARAMETERS_SETS": {
+ "spmd linear parameters": {
+ "algorithm": {
+ "estimator": "LinearRegression",
+ "estimator_methods": { "training": "fit" },
+ "sklearnex_context": { "use_raw_input": true }
+ }
+ },
+ "synthetic data": {
+ "data": [
+ { "source": "make_regression", "generation_kwargs": { "n_samples": 100005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 100000000, "test_size": 5000 } },
+ { "source": "make_regression", "generation_kwargs": { "n_samples": 3005000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 3000000, "test_size": 5000 } }
+ ]
+ }
+ },
+ "TEMPLATES": {
+ "linreg": {
+ "SETS": [
+ "sklearnex spmd implementation",
+ "large scale 2k parameters",
+ "synthetic data",
+ "spmd linear parameters"
+ ]
+ }
+ }
+}
diff --git a/configs/spmd/large_scale/linreg_strong.json b/configs/spmd/large_scale/linreg_strong.json
new file mode 100644
index 00000000..ac5a6c7a
--- /dev/null
+++ b/configs/spmd/large_scale/linreg_strong.json
@@ -0,0 +1,27 @@
+{
+ "INCLUDE": ["../../common/sklearn.json", "large_scale.json"],
+ "PARAMETERS_SETS": {
+ "spmd linear parameters": {
+ "algorithm": {
+ "estimator": "LinearRegression",
+ "estimator_methods": { "training": "fit" },
+ "sklearnex_context": { "use_raw_input": true }
+ }
+ },
+ "synthetic data": {
+ "data": [
+ { "source": "make_regression", "generation_kwargs": { "n_samples": 25005000, "n_features": 100, "noise": 1.25 }, "split_kwargs": { "train_size": 25000000, "test_size": 5000 } }
+ ]
+ }
+ },
+ "TEMPLATES": {
+ "linreg": {
+ "SETS": [
+ "sklearnex spmd implementation",
+ "large scale strong <=64 parameters",
+ "synthetic data",
+ "spmd linear parameters"
+ ]
+ }
+ }
+}
diff --git a/configs/spmd/large_scale/logreg.json b/configs/spmd/large_scale/logreg.json
new file mode 100644
index 00000000..b7b4b998
--- /dev/null
+++ b/configs/spmd/large_scale/logreg.json
@@ -0,0 +1,30 @@
+{
+ "INCLUDE": ["../../common/sklearn.json", "../logreg.json", "large_scale.json"],
+ "PARAMETERS_SETS": {
+ "spmd logreg2 parameters": {
+ "algorithm":{
+ "estimator": "LogisticRegression",
+ "estimator_methods": { "inference": "predict" },
+ "estimator_params": { "max_iter": 10 },
+ "sklearnex_context": { "use_raw_input": true }
+ }
+ },
+ "synthetic data": {
+ "data": [
+ { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 10, "n_classes": 2 } },
+ { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 501000, "n_features": 1000, "n_classes": 2, "n_informative": 40, "n_clusters_per_class": 3, "flip_y": 0.05 } }
+ ]
+ }
+ },
+ "TEMPLATES": {
+ "logreg": {
+ "SETS": [
+ "sklearnex spmd implementation",
+ "large scale 2k parameters",
+ "spmd logreg parameters",
+ "synthetic data",
+ "spmd logreg2 parameters"
+ ]
+ }
+ }
+}
diff --git a/configs/spmd/large_scale/logreg_strong.json b/configs/spmd/large_scale/logreg_strong.json
new file mode 100644
index 00000000..219840ea
--- /dev/null
+++ b/configs/spmd/large_scale/logreg_strong.json
@@ -0,0 +1,29 @@
+{
+ "INCLUDE": ["../../common/sklearn.json", "../logreg.json", "large_scale.json"],
+ "PARAMETERS_SETS": {
+ "spmd logreg2 parameters": {
+ "algorithm":{
+ "estimator": "LogisticRegression",
+ "estimator_methods": { "inference": "predict" },
+ "estimator_params": { "max_iter": 16 },
+ "sklearnex_context": { "use_raw_input": true }
+ }
+ },
+ "synthetic data": {
+ "data": [
+ { "source": "make_classification", "split_kwargs": { "train_size": 12000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 12001000, "n_features": 200, "n_classes": 2, "n_informative": 40, "n_clusters_per_class": 3, "flip_y": 0.05 } }
+ ]
+ }
+ },
+ "TEMPLATES": {
+ "logreg": {
+ "SETS": [
+ "sklearnex spmd implementation",
+ "large scale strong 64 parameters",
+ "spmd logreg parameters",
+ "synthetic data",
+ "spmd logreg2 parameters"
+ ]
+ }
+ }
+}
diff --git a/configs/spmd/large_scale/pca.json b/configs/spmd/large_scale/pca.json
new file mode 100644
index 00000000..ce56bd8a
--- /dev/null
+++ b/configs/spmd/large_scale/pca.json
@@ -0,0 +1,31 @@
+{
+ "INCLUDE": ["../../common/sklearn.json", "large_scale.json"],
+ "PARAMETERS_SETS": {
+ "spmd pca parameters": {
+ "algorithm": {
+ "estimator": "PCA",
+ "estimator_methods": { "training": "fit", "inference": "" },
+ "sklearnex_context": { "use_raw_input": true }
+ },
+ "data": {
+ "split_kwargs": { "test_size": 0.0001 }
+ }
+ },
+ "synthetic data": {
+ "data": [
+ { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000000, "n_features": 10, "centers": 1 } },
+ { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 1000, "centers": 1 } }
+ ]
+ }
+ },
+ "TEMPLATES": {
+ "linreg": {
+ "SETS": [
+ "sklearnex spmd implementation",
+ "large scale 2k parameters",
+ "synthetic data",
+ "spmd pca parameters"
+ ]
+ }
+ }
+}
diff --git a/configs/spmd/large_scale/pca_strong.json b/configs/spmd/large_scale/pca_strong.json
new file mode 100644
index 00000000..70461ba7
--- /dev/null
+++ b/configs/spmd/large_scale/pca_strong.json
@@ -0,0 +1,30 @@
+{
+ "INCLUDE": ["../../common/sklearn.json", "large_scale.json"],
+ "PARAMETERS_SETS": {
+ "spmd pca parameters": {
+ "algorithm": {
+ "estimator": "PCA",
+ "estimator_methods": { "training": "fit", "inference": "" },
+ "sklearnex_context": { "use_raw_input": true }
+ },
+ "data": {
+ "split_kwargs": { "test_size": 0.0001 }
+ }
+ },
+ "synthetic data": {
+ "data": [
+ { "source": "make_blobs", "generation_kwargs": { "n_samples": 25000000, "n_features": 100, "centers": 1 } }
+ ]
+ }
+ },
+ "TEMPLATES": {
+ "linreg": {
+ "SETS": [
+ "sklearnex spmd implementation",
+ "large scale strong <=64 parameters",
+ "synthetic data",
+ "spmd pca parameters"
+ ]
+ }
+ }
+}
diff --git a/sklbench/benchmarks/custom_function.py b/sklbench/benchmarks/custom_function.py
index 25abb900..34b223ed 100644
--- a/sklbench/benchmarks/custom_function.py
+++ b/sklbench/benchmarks/custom_function.py
@@ -64,9 +64,13 @@ def get_function_args(bench_case: BenchCase, x_train, y_train, x_test, y_test) -
def measure_function_instance(bench_case, function_instance, args: Tuple, kwargs: Dict):
metrics = dict()
- metrics["time[ms]"], metrics["time std[ms]"], _ = measure_case(
- bench_case, function_instance, *args, **kwargs
- )
+ (
+ metrics["time[ms]"],
+ metrics["time std[ms]"],
+ metrics["first iter[ms]"],
+ metrics["box filter mean[ms]"],
+ metrics["box filter std[ms]"],
+ ) = measure_case(bench_case, function_instance, *args, **kwargs)
return metrics
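
`measure_case` now returns five values per method: mean time, its std, the first-iteration time, and a box-filtered mean/std. The filter itself lives in the measurement utilities, which are not part of this diff; a hedged sketch of what an IQR-based box filter over the timing samples could look like (the 1.5*IQR whiskers are an assumption, not the confirmed implementation):

```python
import numpy as np


def box_filter_stats(times_ms):
    # Assumed Tukey-style box filter: keep samples inside
    # [Q1 - 1.5*IQR, Q3 + 1.5*IQR], then report mean/std of the survivors
    # so a few warm-up or interference outliers don't skew the result.
    times_ms = np.asarray(times_ms, dtype=float)
    q1, q3 = np.percentile(times_ms, [25, 75])
    iqr = q3 - q1
    kept = times_ms[(times_ms >= q1 - 1.5 * iqr) & (times_ms <= q3 + 1.5 * iqr)]
    return float(kept.mean()), float(kept.std())
```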
diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py
index f9c0a75e..819f5fb5 100644
--- a/sklbench/benchmarks/sklearn_estimator.py
+++ b/sklbench/benchmarks/sklearn_estimator.py
@@ -66,15 +66,15 @@ def get_estimator(library_name: str, estimator_name: str):
f"Using first {classes_map[estimator_name][0]}."
)
estimator = classes_map[estimator_name][0]
- if not issubclass(estimator, BaseEstimator):
- logger.info(f"{estimator} estimator is not derived from sklearn's BaseEstimator")
+ # if not issubclass(estimator, BaseEstimator):
+ # logger.info(f"{estimator} estimator is not derived from sklearn's BaseEstimator")
return estimator
def get_estimator_methods(bench_case: BenchCase) -> Dict[str, List[str]]:
# default estimator methods
estimator_methods = {
- "training": ["fit"],
+ "training": ["partial_fit", "fit"],
"inference": ["predict", "predict_proba", "transform"],
}
for stage in estimator_methods.keys():
@@ -134,6 +134,9 @@ def get_subset_metrics_of_estimator(
and isinstance(iterations[0], Union[Numeric, NumpyNumeric].__args__)
):
metrics.update({"iterations": int(iterations[0])})
+ if hasattr(estimator_instance, "_n_inner_iter"):
+ inner_iters = estimator_instance._n_inner_iter
+ metrics.update({"inner_iters": int(inner_iters)})
if task == "classification":
y_pred = convert_to_numpy(estimator_instance.predict(x))
metrics.update(
@@ -142,7 +145,7 @@ def get_subset_metrics_of_estimator(
"balanced accuracy": float(balanced_accuracy_score(y_compat, y_pred)),
}
)
- if hasattr(estimator_instance, "predict_proba") and not (
+ """if hasattr(estimator_instance, "predict_proba") and not (
hasattr(estimator_instance, "probability")
and getattr(estimator_instance, "probability") == False
):
@@ -162,7 +165,7 @@ def get_subset_metrics_of_estimator(
),
"logloss": float(log_loss(y_compat, y_pred_proba)),
}
- )
+ )"""
elif task == "regression":
y_pred = convert_to_numpy(estimator_instance.predict(x))
metrics.update(
@@ -188,19 +191,6 @@ def get_subset_metrics_of_estimator(
}
)
elif task == "clustering":
- if hasattr(estimator_instance, "inertia_"):
- # compute inertia manually using distances to cluster centers
- # provided by KMeans.transform
- metrics.update(
- {
- "inertia": float(
- np.power(
- convert_to_numpy(estimator_instance.transform(x)).min(axis=1),
- 2,
- ).sum()
- )
- }
- )
if hasattr(estimator_instance, "predict"):
y_pred = convert_to_numpy(estimator_instance.predict(x))
metrics.update(
@@ -334,34 +324,43 @@ def verify_patching(stream: io.StringIO, function_name) -> bool:
return acceleration_lines > 0 and fallback_lines == 0
-def create_online_function(method_instance, data_args, batch_size):
- n_batches = data_args[0].shape[0] // batch_size
+def create_online_function(
+ estimator_instance, method_instance, data_args, num_batches, batch_size
+):
if "y" in list(inspect.signature(method_instance).parameters):
def ndarray_function(x, y):
- for i in range(n_batches):
+ for i in range(num_batches):
method_instance(
x[i * batch_size : (i + 1) * batch_size],
y[i * batch_size : (i + 1) * batch_size],
)
+ if hasattr(estimator_instance, "_onedal_finalize_fit"):
+ estimator_instance._onedal_finalize_fit()
def dataframe_function(x, y):
- for i in range(n_batches):
+ for i in range(num_batches):
method_instance(
x.iloc[i * batch_size : (i + 1) * batch_size],
y.iloc[i * batch_size : (i + 1) * batch_size],
)
+ if hasattr(estimator_instance, "_onedal_finalize_fit"):
+ estimator_instance._onedal_finalize_fit()
else:
def ndarray_function(x):
- for i in range(n_batches):
+ for i in range(num_batches):
method_instance(x[i * batch_size : (i + 1) * batch_size])
+ if hasattr(estimator_instance, "_onedal_finalize_fit"):
+ estimator_instance._onedal_finalize_fit()
def dataframe_function(x):
- for i in range(n_batches):
+ for i in range(num_batches):
method_instance(x.iloc[i * batch_size : (i + 1) * batch_size])
+ if hasattr(estimator_instance, "_onedal_finalize_fit"):
+ estimator_instance._onedal_finalize_fit()
if "ndarray" in str(type(data_args[0])):
return ndarray_function
@@ -414,12 +413,28 @@ def measure_sklearn_estimator(
data_args = (x_train,)
else:
data_args = (x_test,)
- batch_size = get_bench_case_value(
- bench_case, f"algorithm:batch_size:{stage}"
- )
- if batch_size is not None:
+
+ if method == "partial_fit":
+ num_batches = get_bench_case_value(bench_case, "data:num_batches")
+ batch_size = get_bench_case_value(bench_case, "data:batch_size")
+
+ if batch_size is None:
+ if num_batches is None:
+ num_batches = 5
+ batch_size = (
+ data_args[0].shape[0] + num_batches - 1
+ ) // num_batches
+ if num_batches is None:
+ num_batches = (
+ data_args[0].shape[0] + batch_size - 1
+ ) // batch_size
+
method_instance = create_online_function(
- method_instance, data_args, batch_size
+ estimator_instance,
+ method_instance,
+ data_args,
+ num_batches,
+ batch_size,
)
# daal4py model builders enabling branch
if enable_modelbuilders and stage == "inference":
@@ -429,17 +444,14 @@ def measure_sklearn_estimator(
estimator_instance.get_booster()
)
method_instance = getattr(daal_model, method)
-
metrics[method] = dict()
(
metrics[method]["time[ms]"],
metrics[method]["time std[ms]"],
- _,
+ metrics[method]["first iter[ms]"],
+ metrics[method]["box filter mean[ms]"],
+ metrics[method]["box filter std[ms]"],
) = measure_case(bench_case, method_instance, *data_args)
- if batch_size is not None:
- metrics[method]["throughput[samples/ms]"] = (
- (data_args[0].shape[0] // batch_size) * batch_size
- ) / metrics[method]["time[ms]"]
if ensure_sklearnex_patching:
full_method_name = f"{estimator_class.__name__}.{method}"
sklearnex_logging_stream.seek(0)
@@ -490,7 +502,18 @@ def main(bench_case: BenchCase, filters: List[BenchCase]):
estimator_params = get_bench_case_value(
bench_case, "algorithm:estimator_params", dict()
)
+ # logger.debug("estimator params: " + str(estimator_params))
+ if (
+ "DBSCAN" in str(estimator_name)
+ and get_bench_case_value(bench_case, "data:distributed_split", None)
+ != "rank_based"
+ ):
+ if "min_samples" in estimator_params:
+ from mpi4py import MPI
+ estimator_params["min_samples"] = (
+ MPI.COMM_WORLD.Get_size() * estimator_params["min_samples"]
+ )
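+    # e.g. min_samples = 5 with 16 MPI ranks becomes 80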
# get estimator methods for measurement
estimator_methods = get_estimator_methods(bench_case)
@@ -521,12 +544,12 @@ def main(bench_case: BenchCase, filters: List[BenchCase]):
result_template = enrich_result(result_template, bench_case)
if "assume_finite" in context_params:
result_template["assume_finite"] = context_params["assume_finite"]
- if hasattr(estimator_instance, "get_params"):
- estimator_params = estimator_instance.get_params()
# note: "handle" is not JSON-serializable
if "handle" in estimator_params:
del estimator_params["handle"]
- logger.debug(f"Estimator parameters:\n{custom_format(estimator_params)}")
+ # logger.debug(f"Estimator parameters:\n{custom_format(estimator_params)}")
result_template.update(estimator_params)
data_descs = {
diff --git a/sklbench/datasets/__init__.py b/sklbench/datasets/__init__.py
index 093875c4..d4bddca1 100644
--- a/sklbench/datasets/__init__.py
+++ b/sklbench/datasets/__init__.py
@@ -67,6 +67,11 @@ def load_data(bench_case: BenchCase) -> Tuple[Dict, Dict]:
generation_kwargs = get_bench_case_value(
bench_case, "data:generation_kwargs", dict()
)
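+        # configs pass `center_box` as a single scalar bound; expand it into
+        # the symmetric (low, high) tuple expected by sklearn's make_blobs,
+        # e.g. "center_box": 100.0 -> center_box=(-100.0, 100.0)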
+ if "center_box" in generation_kwargs:
+ generation_kwargs["center_box"] = (
+ -1 * generation_kwargs["center_box"],
+ generation_kwargs["center_box"],
+ )
return load_sklearn_synthetic_data(
function_name=source,
input_kwargs=generation_kwargs,
diff --git a/sklbench/datasets/common.py b/sklbench/datasets/common.py
index e7ed0160..28b62fe6 100644
--- a/sklbench/datasets/common.py
+++ b/sklbench/datasets/common.py
@@ -136,11 +136,11 @@ def cache_wrapper(**kwargs):
data_name = kwargs["data_name"]
data_cache = kwargs["data_cache"]
if len(get_filenames_by_prefix(data_cache, data_name)) > 0:
- logger.info(f'Loading "{data_name}" dataset from cache files')
+ # logger.info(f'Loading "{data_name}" dataset from cache files')
data = load_data_from_cache(data_cache, data_name)
data_desc = load_data_description(data_cache, data_name)
else:
- logger.info(f'Loading "{data_name}" dataset from scratch')
+ # logger.info(f'Loading "{data_name}" dataset from scratch')
data, data_desc = function(**kwargs)
save_data_to_cache(data, data_cache, data_name)
save_data_description(data_desc, data_cache, data_name)
diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py
index d2e63e9e..c63d3b20 100644
--- a/sklbench/datasets/transformer.py
+++ b/sklbench/datasets/transformer.py
@@ -14,6 +14,7 @@
# limitations under the License.
# ===============================================================================
+import math
import os
import numpy as np
@@ -109,7 +110,23 @@ def split_and_transform_data(bench_case, data, data_description):
y_train, y_test = None, None
distributed_split = get_bench_case_value(bench_case, "data:distributed_split", None)
- if distributed_split == "rank_based":
+ if distributed_split == "sample_shift":
+ from mpi4py import MPI
+
+ rank = MPI.COMM_WORLD.Get_rank()
+        # This shifts the distribution of the synthetic data on each rank for
+        # KMeans weak-scaling tests; unlike plain replication, it avoids
+        # duplicating identical data on every rank when testing with a large
+        # number of tiles. For example, with 24,576 tiles each data point in
+        # the 24,576th tile is multiplied by roughly 1.47. The factor 0.003
+        # was chosen arbitrarily and can be tuned for other datasets and
+        # algorithms if needed.
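+        # e.g. rank 0 leaves its data unchanged (factor 1.0), while rank 100
+        # scales it by sqrt(100) * 0.003 + 1 = 1.03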
+ adjust_number = (math.sqrt(rank) * 0.003) + 1
+ x_test = x_test * adjust_number
+ x_train = x_train * adjust_number
+
+ elif distributed_split == "rank_based":
from mpi4py import MPI
comm = MPI.COMM_WORLD
@@ -129,10 +146,12 @@ def split_and_transform_data(bench_case, data, data_description):
x_train[train_start:train_end],
y_train[train_start:train_end],
)
- x_test, y_test = x_test[test_start:test_end], y_test[test_start:test_end]
+ if distributed_split == "rank_based":
+ x_test, y_test = x_test[test_start:test_end], y_test[test_start:test_end]
else:
x_train = x_train[train_start:train_end]
- x_test = x_test[test_start:test_end]
+ if distributed_split == "rank_based":
+ x_test = x_test[test_start:test_end]
device = get_bench_case_value(bench_case, "algorithm:device", None)
common_data_format = get_bench_case_value(bench_case, "data:format", "pandas")
@@ -178,7 +197,7 @@ def split_and_transform_data(bench_case, data, data_description):
"format": data_format,
"order": data_order,
"dtype": data_dtype,
- "samples": converted_data.shape[0],
+ "samples (per rank)": converted_data.shape[0],
}
if len(converted_data.shape) == 2 and converted_data.shape[1] > 1:
data_description[subset_name]["features"] = converted_data.shape[1]
diff --git a/sklbench/report/implementation.py b/sklbench/report/implementation.py
index 28fa2bb0..2bc3a05e 100644
--- a/sklbench/report/implementation.py
+++ b/sklbench/report/implementation.py
@@ -16,7 +16,7 @@
import argparse
import json
-from typing import Dict, List
+from typing import Dict, Hashable, List
import openpyxl as xl
import pandas as pd
@@ -32,6 +32,9 @@
METRICS = {
"lower is better": [
"time[ms]",
+ "first iter[ms]",
+ "box filter mean[ms]",
+ "box filter std[ms]",
"iterations",
# classification
"logloss",
@@ -239,6 +242,7 @@ def get_result_tables_as_df(
bench_cases = pd.DataFrame(
[flatten_dict(bench_case) for bench_case in results["bench_cases"]]
)
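+    # flattened bench cases may contain unhashable cells (lists/dicts) that
+    # break pandas grouping, so stringify them, e.g. [10, 100] -> "[10, 100]"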
+ bench_cases = bench_cases.map(lambda x: str(x) if not isinstance(x, Hashable) else x)
if compatibility_mode:
bench_cases = transform_results_to_compatible(bench_cases)
@@ -248,7 +252,7 @@ def get_result_tables_as_df(
bench_cases.drop(columns=[column], inplace=True)
diffby_columns.remove(column)
- return split_df_by_columns(bench_cases, splitby_columns)
+ return split_df_by_columns(bench_cases, splitby_columns, False)
def get_summary_from_df(df: pd.DataFrame, df_name: str) -> pd.DataFrame:
@@ -258,7 +262,10 @@ def get_summary_from_df(df: pd.DataFrame, df_name: str) -> pd.DataFrame:
# only relative improvements are included in summary currently
if len(column) > 1 and column[1] == f"{metric_name} relative improvement":
metric_columns.append(column)
- summary = df[metric_columns].aggregate(geomean_wrapper, axis=0).to_frame().T
+ if metric_columns:
+ summary = df[metric_columns].aggregate(geomean_wrapper, axis=0).to_frame().T
+ else:
+        # keep one row so that the index assignment below does not fail
+        # with a length mismatch on the empty frame
+        summary = pd.DataFrame(index=[df_name])
summary.index = pd.Index([df_name])
return summary
diff --git a/sklbench/runner/commands_helper.py b/sklbench/runner/commands_helper.py
index 09e61369..aace5643 100644
--- a/sklbench/runner/commands_helper.py
+++ b/sklbench/runner/commands_helper.py
@@ -45,6 +45,10 @@ def generate_benchmark_command(
mpi_prefix = "mpirun"
for mpi_param_name, mpi_param_value in mpi_params.items():
mpi_prefix += f" -{mpi_param_name} {mpi_param_value}"
+ if mpi_param_name == "-hostfile":
+ import os
+
+ mpi_prefix += os.environ.get("PBS_NODEFILE")
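+        # e.g. mpi_params = {"n": 8, "-hostfile": ""} under PBS expands to
+        #     "mpirun -n 8 --hostfile $PBS_NODEFILE <benchmark command>"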
command_prefix = f"{mpi_prefix} {command_prefix}"
# 3. Intel(R) VTune* profiling command prefix
vtune_profiling = get_bench_case_value(bench_case, "bench:vtune_profiling")
diff --git a/sklbench/utils/logger.py b/sklbench/utils/logger.py
index 90940630..250c5fa6 100644
--- a/sklbench/utils/logger.py
+++ b/sklbench/utils/logger.py
@@ -19,7 +19,7 @@
logger = logging.Logger("sklbench")
logging_channel = logging.StreamHandler()
-logging_formatter = logging.Formatter("%(levelname)s:%(name)s: %(message)s")
+logging_formatter = logging.Formatter("%(asctime)s - %(levelname)s:%(name)s: %(message)s")
logging_channel.setFormatter(logging_formatter)
logger.addHandler(logging_channel)
diff --git a/sklbench/utils/measurement.py b/sklbench/utils/measurement.py
index 989daefd..3677e760 100644
--- a/sklbench/utils/measurement.py
+++ b/sklbench/utils/measurement.py
@@ -40,6 +40,22 @@ def box_filter(timing, left=0.2, right=0.8):
return np.mean(result) * 1000, np.std(result) * 1000
+def large_scale_measurements(timing):
+    # ndarray conversion lets the boolean masking below work on plain lists
+    timing = np.asarray(timing)
+    # the first (warm-up) run is reported separately and excluded from the
+    # mean/std over the remaining runs
+    first_iter = timing[0] * 1000
+    mean = np.mean(timing[1:]) * 1000
+    stdev = np.std(timing[1:]) * 1000
+    # box filter: drop outliers outside Tukey's fences (1.5 * IQR)
+    Q1, Q3 = np.percentile(timing, [25, 75])
+    IQR = Q3 - Q1
+    lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
+
+    filtered_times = timing[(timing >= lower) & (timing <= upper)]
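+    # e.g. timings [1.0, 1.1, 1.2, 5.0] s give Q1 = 1.075 and Q3 = 2.15; the
+    # upper fence is 2.15 + 1.5 * 1.075 = 3.76 s, so the 5.0 s outlier is dropped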
+
+ box_filter_mean = np.mean(filtered_times) * 1000 if filtered_times.size > 0 else 0
+ box_filter_stdev = np.std(filtered_times) * 1000 if filtered_times.size > 0 else 0
+ return mean, stdev, first_iter, box_filter_mean, box_filter_stdev
+
+
def measure_time(
func,
*args,
@@ -56,12 +72,16 @@ def measure_time(
)
times = []
func_return_value = None
+ inners, iters = [], []
while len(times) < n_runs:
if enable_itt and itt_is_available:
itt.resume()
t0 = timeit.default_timer()
func_return_value = func(*args, **kwargs)
t1 = timeit.default_timer()
+ if hasattr(func, "__self__") and hasattr(func.__self__, "_n_inner_iter"):
+ inners.append(func.__self__._n_inner_iter)
+ iters.append(func.__self__.n_iter_)
if enable_itt and itt_is_available:
itt.pause()
times.append(t1 - t0)
@@ -72,13 +92,27 @@ def measure_time(
f"exceeded time limit ({time_limit} seconds)"
)
break
- mean, std = box_filter(times)
- if std / mean > std_mean_ratio:
- logger.warning(
- f'Measured "std / mean" time ratio of "{str(func)}" function is higher '
- f"than threshold ({round(std / mean, 3)} vs. {std_mean_ratio})"
- )
- return mean, std, func_return_value
+
+    try:
+        from mpi4py import MPI
+
+        # under MPI, log measurement details from the root rank only
+        log_details = MPI.COMM_WORLD.Get_rank() == 0
+    except ModuleNotFoundError:
+        log_details = True
+    if log_details:
+        logger.debug(
+            "iters across n runs: "
+            + str(iters)
+            + ", inner iters across n runs: "
+            + str(inners)
+        )
+        logger.debug(f"Runtime of all {len(times)} measured runs: {times}")
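+    # note: unlike the old box_filter path, the wrapped function's return value
+    # is no longer propagated; callers now unpack five timing metrics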
+ return large_scale_measurements(times)
# wrapper to get measurement params from benchmarking case