diff --git a/configs/README.md b/configs/README.md index 8d3c5ac2..e1cf8390 100644 --- a/configs/README.md +++ b/configs/README.md @@ -104,7 +104,7 @@ Configs have the three highest parameter keys: | `data`:`format` | `pandas` | `pandas`, `numpy`, `cudf` | Data format to use in benchmark. | | `data`:`order` | `F` | `C`, `F` | Data order to use in benchmark: contiguous(C) or Fortran. | | `data`:`dtype` | `float64` | | Data type to use in benchmark. | -| `data`:`distributed_split` | None | None, `rank_based` | Split type used to distribute data between machines in distributed algorithm. `None` type means usage of all data without split on all machines. `rank_based` type splits the data equally between machines with split sequence based on rank id from MPI. | +| `data`:`distributed_split` | None | None, `rank_based`, `sample_shift` | Split type used to distribute data between machines in distributed algorithm. `sample_shift`: Shift each data point in each rank by (sqrt(rank id) * 0.003) + 1. `None` type means usage of all data without split on all machines. `rank_based` type splits the data equally between machines with split sequence based on rank id from MPI. | |

Algorithm parameters

|||| | `algorithm`:`library` | None | | Python module containing measured entity (class or function). | | `algorithm`:`device` | `default` | `default`, `cpu`, `gpu` | Device selected for computation. | diff --git a/configs/spmd/large_scale/kmeans_narrow_weak.json b/configs/spmd/large_scale/kmeans_narrow_weak.json new file mode 100644 index 00000000..69f0b6ac --- /dev/null +++ b/configs/spmd/large_scale/kmeans_narrow_weak.json @@ -0,0 +1,33 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd kmeans parameters": { + "algorithm": { + "estimator": "KMeans", + "estimator_params": { + "algorithm": "lloyd", + "max_iter": 20, + "n_clusters": 10, + "random_state": 42 + }, + "estimator_methods": { "training": "fit", "inference": "predict" }, + "sklearnex_context": { "use_raw_input": true } + } + }, + "synthetic data": { + "data": [ + { "source": "make_blobs", "generation_kwargs": { "n_samples": 2000000, "n_features": 100, "centers": 2000, "cluster_std": 3, "center_box": 100.0}} + ] + } + }, + "TEMPLATES": { + "kmeans": { + "SETS": [ + "synthetic data", + "sklearnex spmd implementation", + "large scale 2k parameters sample shift", + "spmd kmeans parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/kmeans_strong.json b/configs/spmd/large_scale/kmeans_strong.json index 87fb7fac..90a1ea3f 100644 --- a/configs/spmd/large_scale/kmeans_strong.json +++ b/configs/spmd/large_scale/kmeans_strong.json @@ -5,16 +5,17 @@ "algorithm": { "estimator": "KMeans", "estimator_params": { - "algorithm": "lloyd" + "algorithm": "lloyd", + "max_iter": 20, + "n_clusters": 100 }, - "estimator_methods": { "training": "fit", "inference": "predict" } + "estimator_methods": { "training": "fit", "inference": "predict" }, + "sklearnex_context": { "use_raw_input": true } } - }, - "synthetic data": { + }, + "synthetic data": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, 
"algorithm": { "n_clusters": 10, "max_iter": 10 } }, - { "source": "make_blobs", "generation_kwargs": { "n_samples": 30000, "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, - { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 100, "centers": 100 }, "algorithm": { "n_clusters": 100, "max_iter": 100 } } + { "source": "make_blobs", "generation_kwargs": { "n_samples": 25000000, "n_features": 100, "centers": 100 }} ] } }, @@ -29,3 +30,4 @@ } } } + diff --git a/configs/spmd/large_scale/kmeans.json b/configs/spmd/large_scale/kmeans_wide_weak.json similarity index 59% rename from configs/spmd/large_scale/kmeans.json rename to configs/spmd/large_scale/kmeans_wide_weak.json index 1140823d..5520f10a 100644 --- a/configs/spmd/large_scale/kmeans.json +++ b/configs/spmd/large_scale/kmeans_wide_weak.json @@ -5,15 +5,18 @@ "algorithm": { "estimator": "KMeans", "estimator_params": { - "algorithm": "lloyd" + "algorithm": "lloyd", + "max_iter": 20, + "n_clusters": 10, + "random_state": 42 }, - "estimator_methods": { "training": "fit", "inference": "predict" } + "estimator_methods": { "training": "fit", "inference": "predict" }, + "sklearnex_context": { "use_raw_input": true } } - }, - "synthetic data": { + }, + "synthetic data": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }, - { "source": "make_blobs", "generation_kwargs": { "n_samples": 30000, "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } } + { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 1000, "centers": 2000}} ] } }, @@ -28,3 +31,4 @@ } } } + diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json index 4e4c9d0c..28626dc9 100644 --- a/configs/spmd/large_scale/large_scale.json +++ 
b/configs/spmd/large_scale/large_scale.json @@ -27,6 +27,15 @@ "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } } }, + "large scale 2k parameters sample shift": { + "data": { + "dtype": "float64", + "distributed_split": "sample_shift" + }, + "bench": { + "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" } + } + }, "large scale 32 parameters": { "data": { "dtype": "float64", diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index 4164a10d..819f5fb5 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -191,19 +191,6 @@ def get_subset_metrics_of_estimator( } ) elif task == "clustering": - if hasattr(estimator_instance, "inertia_"): - # compute inertia manually using distances to cluster centers - # provided by KMeans.transform - metrics.update( - { - "inertia": float( - np.power( - convert_to_numpy(estimator_instance.transform(x)).min(axis=1), - 2, - ).sum() - ) - } - ) if hasattr(estimator_instance, "predict"): y_pred = convert_to_numpy(estimator_instance.predict(x)) metrics.update( diff --git a/sklbench/datasets/__init__.py b/sklbench/datasets/__init__.py index 093875c4..d4bddca1 100644 --- a/sklbench/datasets/__init__.py +++ b/sklbench/datasets/__init__.py @@ -67,6 +67,11 @@ def load_data(bench_case: BenchCase) -> Tuple[Dict, Dict]: generation_kwargs = get_bench_case_value( bench_case, "data:generation_kwargs", dict() ) + if 
"center_box" in generation_kwargs: + generation_kwargs["center_box"] = ( + -1 * generation_kwargs["center_box"], + generation_kwargs["center_box"], + ) return load_sklearn_synthetic_data( function_name=source, input_kwargs=generation_kwargs, diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 38b4fe3b..c63d3b20 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -14,6 +14,7 @@ # limitations under the License. # =============================================================================== +import math import os import numpy as np @@ -113,7 +114,19 @@ def split_and_transform_data(bench_case, data, data_description): # "KNeighbors" in get_bench_case_value(bench_case, "algorithm:estimator", "") # and int(get_bench_case_value(bench_case, "bench:mpi_params:n", 1)) > 1 # ) - if distributed_split == "rank_based": + if distributed_split == "sample_shift": + from mpi4py import MPI + + rank = MPI.COMM_WORLD.Get_rank() + # This approach was chosen to shift the distribution of synthetic data on each rank + # for KMeans weak scaling tests. When testing with a large number of tiles, this method avoids duplication of data on each rank. + # For example, if there are 24,576 tiles being used, each data point in the 24,576th tile would be multiplied by 1.47. + # The factor 0.003 was chosen arbitrarily and can be fine-tuned for other datasets and algorithms if needed. + adjust_number = (math.sqrt(rank) * 0.003) + 1 + x_test = x_test * adjust_number + x_train = x_train * adjust_number + + elif distributed_split == "rank_based": from mpi4py import MPI comm = MPI.COMM_WORLD