diff --git a/configs/README.md b/configs/README.md
index 8d3c5ac2..e1cf8390 100644
--- a/configs/README.md
+++ b/configs/README.md
@@ -104,7 +104,7 @@ Configs have the three highest parameter keys:
| `data`:`format` | `pandas` | `pandas`, `numpy`, `cudf` | Data format to use in benchmark. |
| `data`:`order` | `F` | `C`, `F` | Data order to use in benchmark: contiguous(C) or Fortran. |
| `data`:`dtype` | `float64` | | Data type to use in benchmark. |
-| `data`:`distributed_split` | None | None, `rank_based` | Split type used to distribute data between machines in distributed algorithm. `None` type means usage of all data without split on all machines. `rank_based` type splits the data equally between machines with split sequence based on rank id from MPI. |
+| `data`:`distributed_split` | None | None, `rank_based`, `sample_shift` | Split type used to distribute data between machines in distributed algorithm. `sample_shift`: Multiplies each data point on each rank by (sqrt(rank id) * 0.003) + 1, shifting the data distribution per rank. `None` type means usage of all data without split on all machines. `rank_based` type splits the data equally between machines with split sequence based on rank id from MPI. |
|
Algorithm parameters
||||
| `algorithm`:`library` | None | | Python module containing measured entity (class or function). |
| `algorithm`:`device` | `default` | `default`, `cpu`, `gpu` | Device selected for computation. |
diff --git a/configs/spmd/large_scale/kmeans_narrow_weak.json b/configs/spmd/large_scale/kmeans_narrow_weak.json
new file mode 100644
index 00000000..69f0b6ac
--- /dev/null
+++ b/configs/spmd/large_scale/kmeans_narrow_weak.json
@@ -0,0 +1,33 @@
+{
+ "INCLUDE": ["../../common/sklearn.json", "large_scale.json"],
+ "PARAMETERS_SETS": {
+ "spmd kmeans parameters": {
+ "algorithm": {
+ "estimator": "KMeans",
+ "estimator_params": {
+ "algorithm": "lloyd",
+ "max_iter": 20,
+ "n_clusters": 10,
+ "random_state": 42
+ },
+ "estimator_methods": { "training": "fit", "inference": "predict" },
+ "sklearnex_context": { "use_raw_input": true }
+ }
+ },
+ "synthetic data": {
+ "data": [
+ { "source": "make_blobs", "generation_kwargs": { "n_samples": 2000000, "n_features": 100, "centers": 2000, "cluster_std": 3, "center_box": 100.0}}
+ ]
+ }
+ },
+ "TEMPLATES": {
+ "kmeans": {
+ "SETS": [
+ "synthetic data",
+ "sklearnex spmd implementation",
+ "large scale 2k parameters sample shift",
+ "spmd kmeans parameters"
+ ]
+ }
+ }
+}
diff --git a/configs/spmd/large_scale/kmeans_strong.json b/configs/spmd/large_scale/kmeans_strong.json
index 87fb7fac..90a1ea3f 100644
--- a/configs/spmd/large_scale/kmeans_strong.json
+++ b/configs/spmd/large_scale/kmeans_strong.json
@@ -5,16 +5,17 @@
"algorithm": {
"estimator": "KMeans",
"estimator_params": {
- "algorithm": "lloyd"
+ "algorithm": "lloyd",
+ "max_iter": 20,
+ "n_clusters": 100
},
- "estimator_methods": { "training": "fit", "inference": "predict" }
+ "estimator_methods": { "training": "fit", "inference": "predict" },
+ "sklearnex_context": { "use_raw_input": true }
}
- },
- "synthetic data": {
+ },
+ "synthetic data": {
"data": [
- { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } },
- { "source": "make_blobs", "generation_kwargs": { "n_samples": 30000, "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } },
- { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 100, "centers": 100 }, "algorithm": { "n_clusters": 100, "max_iter": 100 } }
+ { "source": "make_blobs", "generation_kwargs": { "n_samples": 25000000, "n_features": 100, "centers": 100 }}
]
}
},
@@ -29,3 +30,4 @@
}
}
}
+
diff --git a/configs/spmd/large_scale/kmeans.json b/configs/spmd/large_scale/kmeans_wide_weak.json
similarity index 59%
rename from configs/spmd/large_scale/kmeans.json
rename to configs/spmd/large_scale/kmeans_wide_weak.json
index 1140823d..5520f10a 100644
--- a/configs/spmd/large_scale/kmeans.json
+++ b/configs/spmd/large_scale/kmeans_wide_weak.json
@@ -5,15 +5,18 @@
"algorithm": {
"estimator": "KMeans",
"estimator_params": {
- "algorithm": "lloyd"
+ "algorithm": "lloyd",
+ "max_iter": 20,
+ "n_clusters": 10,
+ "random_state": 42
},
- "estimator_methods": { "training": "fit", "inference": "predict" }
+ "estimator_methods": { "training": "fit", "inference": "predict" },
+ "sklearnex_context": { "use_raw_input": true }
}
- },
- "synthetic data": {
+ },
+ "synthetic data": {
"data": [
- { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } },
- { "source": "make_blobs", "generation_kwargs": { "n_samples": 30000, "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }
+ { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 1000, "centers": 2000}}
]
}
},
@@ -28,3 +31,4 @@
}
}
}
+
diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json
index 4e4c9d0c..28626dc9 100644
--- a/configs/spmd/large_scale/large_scale.json
+++ b/configs/spmd/large_scale/large_scale.json
@@ -27,6 +27,15 @@
"mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" }
}
},
+ "large scale 2k parameters sample shift": {
+ "data": {
+ "dtype": "float64",
+ "distributed_split": "sample_shift"
+ },
+ "bench": {
+ "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" }
+ }
+ },
"large scale 32 parameters": {
"data": {
"dtype": "float64",
diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py
index 4164a10d..819f5fb5 100644
--- a/sklbench/benchmarks/sklearn_estimator.py
+++ b/sklbench/benchmarks/sklearn_estimator.py
@@ -191,19 +191,6 @@ def get_subset_metrics_of_estimator(
}
)
elif task == "clustering":
- if hasattr(estimator_instance, "inertia_"):
- # compute inertia manually using distances to cluster centers
- # provided by KMeans.transform
- metrics.update(
- {
- "inertia": float(
- np.power(
- convert_to_numpy(estimator_instance.transform(x)).min(axis=1),
- 2,
- ).sum()
- )
- }
- )
if hasattr(estimator_instance, "predict"):
y_pred = convert_to_numpy(estimator_instance.predict(x))
metrics.update(
diff --git a/sklbench/datasets/__init__.py b/sklbench/datasets/__init__.py
index 093875c4..d4bddca1 100644
--- a/sklbench/datasets/__init__.py
+++ b/sklbench/datasets/__init__.py
@@ -67,6 +67,11 @@ def load_data(bench_case: BenchCase) -> Tuple[Dict, Dict]:
generation_kwargs = get_bench_case_value(
bench_case, "data:generation_kwargs", dict()
)
+ if "center_box" in generation_kwargs:
+ generation_kwargs["center_box"] = (
+ -1 * generation_kwargs["center_box"],
+ generation_kwargs["center_box"],
+ )
return load_sklearn_synthetic_data(
function_name=source,
input_kwargs=generation_kwargs,
diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py
index 38b4fe3b..c63d3b20 100644
--- a/sklbench/datasets/transformer.py
+++ b/sklbench/datasets/transformer.py
@@ -14,6 +14,7 @@
# limitations under the License.
# ===============================================================================
+import math
import os
import numpy as np
@@ -113,7 +114,19 @@ def split_and_transform_data(bench_case, data, data_description):
# "KNeighbors" in get_bench_case_value(bench_case, "algorithm:estimator", "")
# and int(get_bench_case_value(bench_case, "bench:mpi_params:n", 1)) > 1
# )
- if distributed_split == "rank_based":
+ if distributed_split == "sample_shift":
+ from mpi4py import MPI
+
+ rank = MPI.COMM_WORLD.Get_rank()
+ # This approach was chosen to shift the distribution of synthetic data on each rank
+ # for KMeans weak scaling tests. When testing with a large number of tiles, this method avoids duplication of data on each rank.
+ # For example, if there are 24,576 tiles being used, each data point in the 24,576th tile would be multiplied by 1.47.
+ # The factor 0.003 was chosen arbitrarily and can be fine-tuned for other datasets and algorithms if needed.
+ adjust_number = (math.sqrt(rank) * 0.003) + 1
+ x_test = x_test * adjust_number
+ x_train = x_train * adjust_number
+
+ elif distributed_split == "rank_based":
from mpi4py import MPI
comm = MPI.COMM_WORLD