Skip to content

Commit 2edb597

Browse files
Authored Mar 22, 2025
Merge pull request #174 from IntelPython/dev/large_scale_kmeans
[Merge only onto large-scale] Large scale Kmeans changes.
2 parents 30b0b80 + 06944c1 commit 2edb597

File tree

8 files changed

+81
-28
lines changed

8 files changed

+81
-28
lines changed
 

‎configs/README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ Configs have the three highest parameter keys:
104104
| `data`:`format` | `pandas` | `pandas`, `numpy`, `cudf` | Data format to use in benchmark. |
105105
| `data`:`order` | `F` | `C`, `F` | Data order to use in benchmark: contiguous(C) or Fortran. |
106106
| `data`:`dtype` | `float64` | | Data type to use in benchmark. |
107-
| `data`:`distributed_split` | None | None, `rank_based` | Split type used to distribute data between machines in distributed algorithm. `None` type means usage of all data without split on all machines. `rank_based` type splits the data equally between machines with split sequence based on rank id from MPI. |
107+
| `data`:`distributed_split` | None | None, `rank_based`, `sample_shift` | Split type used to distribute data between machines in distributed algorithm. `sample_shift`: Shift each data point in each rank by (sqrt(rank id) * 0.003) + 1. `None` type means usage of all data without split on all machines. `rank_based` type splits the data equally between machines with split sequence based on rank id from MPI. |
108108
|<h3>Algorithm parameters</h3>||||
109109
| `algorithm`:`library` | None | | Python module containing measured entity (class or function). |
110110
| `algorithm`:`device` | `default` | `default`, `cpu`, `gpu` | Device selected for computation. |
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
{
2+
"INCLUDE": ["../../common/sklearn.json", "large_scale.json"],
3+
"PARAMETERS_SETS": {
4+
"spmd kmeans parameters": {
5+
"algorithm": {
6+
"estimator": "KMeans",
7+
"estimator_params": {
8+
"algorithm": "lloyd",
9+
"max_iter": 20,
10+
"n_clusters": 10,
11+
"random_state": 42
12+
},
13+
"estimator_methods": { "training": "fit", "inference": "predict" },
14+
"sklearnex_context": { "use_raw_input": true }
15+
}
16+
},
17+
"synthetic data": {
18+
"data": [
19+
{ "source": "make_blobs", "generation_kwargs": { "n_samples": 2000000, "n_features": 100, "centers": 2000, "cluster_std": 3, "center_box": 100.0}}
20+
]
21+
}
22+
},
23+
"TEMPLATES": {
24+
"kmeans": {
25+
"SETS": [
26+
"synthetic data",
27+
"sklearnex spmd implementation",
28+
"large scale 2k parameters sample shift",
29+
"spmd kmeans parameters"
30+
]
31+
}
32+
}
33+
}

‎configs/spmd/large_scale/kmeans_strong.json

+9-7
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,17 @@
55
"algorithm": {
66
"estimator": "KMeans",
77
"estimator_params": {
8-
"algorithm": "lloyd"
8+
"algorithm": "lloyd",
9+
"max_iter": 20,
10+
"n_clusters": 100
911
},
10-
"estimator_methods": { "training": "fit", "inference": "predict" }
12+
"estimator_methods": { "training": "fit", "inference": "predict" },
13+
"sklearnex_context": { "use_raw_input": true }
1114
}
12-
},
13-
"synthetic data": {
15+
},
16+
"synthetic data": {
1417
"data": [
15-
{ "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } },
16-
{ "source": "make_blobs", "generation_kwargs": { "n_samples": 30000, "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } },
17-
{ "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 100, "centers": 100 }, "algorithm": { "n_clusters": 100, "max_iter": 100 } }
18+
{ "source": "make_blobs", "generation_kwargs": { "n_samples": 25000000, "n_features": 100, "centers": 100 }}
1819
]
1920
}
2021
},
@@ -29,3 +30,4 @@
2930
}
3031
}
3132
}
33+

‎configs/spmd/large_scale/kmeans.json renamed to ‎configs/spmd/large_scale/kmeans_wide_weak.json

+10-6
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,18 @@
55
"algorithm": {
66
"estimator": "KMeans",
77
"estimator_params": {
8-
"algorithm": "lloyd"
8+
"algorithm": "lloyd",
9+
"max_iter": 20,
10+
"n_clusters": 10,
11+
"random_state": 42
912
},
10-
"estimator_methods": { "training": "fit", "inference": "predict" }
13+
"estimator_methods": { "training": "fit", "inference": "predict" },
14+
"sklearnex_context": { "use_raw_input": true }
1115
}
12-
},
13-
"synthetic data": {
16+
},
17+
"synthetic data": {
1418
"data": [
15-
{ "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } },
16-
{ "source": "make_blobs", "generation_kwargs": { "n_samples": 30000, "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }
19+
{ "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 1000, "centers": 2000}}
1720
]
1821
}
1922
},
@@ -28,3 +31,4 @@
2831
}
2932
}
3033
}
34+

‎configs/spmd/large_scale/large_scale.json

+9
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,15 @@
2727
"mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" }
2828
}
2929
},
30+
"large scale 2k parameters sample shift": {
31+
"data": {
32+
"dtype": "float64",
33+
"distributed_split": "sample_shift"
34+
},
35+
"bench": {
36+
"mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" }
37+
}
38+
},
3039
"large scale 32 parameters": {
3140
"data": {
3241
"dtype": "float64",

‎sklbench/benchmarks/sklearn_estimator.py

-13
Original file line numberDiff line numberDiff line change
@@ -191,19 +191,6 @@ def get_subset_metrics_of_estimator(
191191
}
192192
)
193193
elif task == "clustering":
194-
if hasattr(estimator_instance, "inertia_"):
195-
# compute inertia manually using distances to cluster centers
196-
# provided by KMeans.transform
197-
metrics.update(
198-
{
199-
"inertia": float(
200-
np.power(
201-
convert_to_numpy(estimator_instance.transform(x)).min(axis=1),
202-
2,
203-
).sum()
204-
)
205-
}
206-
)
207194
if hasattr(estimator_instance, "predict"):
208195
y_pred = convert_to_numpy(estimator_instance.predict(x))
209196
metrics.update(

‎sklbench/datasets/__init__.py

+5
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,11 @@ def load_data(bench_case: BenchCase) -> Tuple[Dict, Dict]:
6767
generation_kwargs = get_bench_case_value(
6868
bench_case, "data:generation_kwargs", dict()
6969
)
70+
if "center_box" in generation_kwargs:
71+
generation_kwargs["center_box"] = (
72+
-1 * generation_kwargs["center_box"],
73+
generation_kwargs["center_box"],
74+
)
7075
return load_sklearn_synthetic_data(
7176
function_name=source,
7277
input_kwargs=generation_kwargs,

‎sklbench/datasets/transformer.py

+14-1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
# limitations under the License.
1515
# ===============================================================================
1616

17+
import math
1718
import os
1819

1920
import numpy as np
@@ -113,7 +114,19 @@ def split_and_transform_data(bench_case, data, data_description):
113114
# "KNeighbors" in get_bench_case_value(bench_case, "algorithm:estimator", "")
114115
# and int(get_bench_case_value(bench_case, "bench:mpi_params:n", 1)) > 1
115116
# )
116-
if distributed_split == "rank_based":
117+
if distributed_split == "sample_shift":
118+
from mpi4py import MPI
119+
120+
rank = MPI.COMM_WORLD.Get_rank()
121+
# This approach was chosen to shift the distribution of synthetic data on each rank
122+
# for KMeans weak scaling tests. When testing with a large number of tiles, this method avoids duplication of data on each rank.
123+
# For example, if there are 24,576 tiles being used, each data point in the 24,576th tile would be multiplied by 1.47.
124+
# The factor 0.003 was chosen arbitrarily and can be fine-tuned for other datasets and algorithms if needed.
125+
adjust_number = (math.sqrt(rank) * 0.003) + 1
126+
x_test = x_test * adjust_number
127+
x_train = x_train * adjust_number
128+
129+
elif distributed_split == "rank_based":
117130
from mpi4py import MPI
118131

119132
comm = MPI.COMM_WORLD

0 commit comments

Comments (0)
Please sign in to comment.