Skip to content

Commit 2edb597

Browse files
Authored Mar 22, 2025
Merge pull request #174 from IntelPython/dev/large_scale_kmeans
[Merge only onto large-scale] Large scale Kmeans changes.
2 parents 30b0b80 + 06944c1 commit 2edb597

File tree

8 files changed

+81
-28
lines changed

8 files changed

+81
-28
lines changed
 

‎configs/README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ Configs have the three highest parameter keys:
104104
| `data`:`format` | `pandas` | `pandas`, `numpy`, `cudf` | Data format to use in benchmark. |
105105
| `data`:`order` | `F` | `C`, `F` | Data order to use in benchmark: contiguous(C) or Fortran. |
106106
| `data`:`dtype` | `float64` | | Data type to use in benchmark. |
107-
| `data`:`distributed_split` | None | None, `rank_based` | Split type used to distribute data between machines in distributed algorithm. `None` type means usage of all data without split on all machines. `rank_based` type splits the data equally between machines with split sequence based on rank id from MPI. |
107+
| `data`:`distributed_split` | None | None, `rank_based`, `sample_shift` | Split type used to distribute data between machines in distributed algorithm. `sample_shift`: Shift each data point in each rank by (sqrt(rank id) * 0.003) + 1. `None` type means usage of all data without split on all machines. `rank_based` type splits the data equally between machines with split sequence based on rank id from MPI. |
108108
|<h3>Algorithm parameters</h3>||||
109109
| `algorithm`:`library` | None | | Python module containing measured entity (class or function). |
110110
| `algorithm`:`device` | `default` | `default`, `cpu`, `gpu` | Device selected for computation. |
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
{
2+
"INCLUDE": ["../../common/sklearn.json", "large_scale.json"],
3+
"PARAMETERS_SETS": {
4+
"spmd kmeans parameters": {
5+
"algorithm": {
6+
"estimator": "KMeans",
7+
"estimator_params": {
8+
"algorithm": "lloyd",
9+
"max_iter": 20,
10+
"n_clusters": 10,
11+
"random_state": 42
12+
},
13+
"estimator_methods": { "training": "fit", "inference": "predict" },
14+
"sklearnex_context": { "use_raw_input": true }
15+
}
16+
},
17+
"synthetic data": {
18+
"data": [
19+
{ "source": "make_blobs", "generation_kwargs": { "n_samples": 2000000, "n_features": 100, "centers": 2000, "cluster_std": 3, "center_box": 100.0}}
20+
]
21+
}
22+
},
23+
"TEMPLATES": {
24+
"kmeans": {
25+
"SETS": [
26+
"synthetic data",
27+
"sklearnex spmd implementation",
28+
"large scale 2k parameters sample shift",
29+
"spmd kmeans parameters"
30+
]
31+
}
32+
}
33+
}

‎configs/spmd/large_scale/kmeans_strong.json

+9-7
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,17 @@
55
"algorithm": {
66
"estimator": "KMeans",
77
"estimator_params": {
8-
"algorithm": "lloyd"
8+
"algorithm": "lloyd",
9+
"max_iter": 20,
10+
"n_clusters": 100
911
},
10-
"estimator_methods": { "training": "fit", "inference": "predict" }
12+
"estimator_methods": { "training": "fit", "inference": "predict" },
13+
"sklearnex_context": { "use_raw_input": true }
1114
}
12-
},
13-
"synthetic data": {
15+
},
16+
"synthetic data": {
1417
"data": [
15-
{ "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } },
16-
{ "source": "make_blobs", "generation_kwargs": { "n_samples": 30000, "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } },
17-
{ "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 100, "centers": 100 }, "algorithm": { "n_clusters": 100, "max_iter": 100 } }
18+
{ "source": "make_blobs", "generation_kwargs": { "n_samples": 25000000, "n_features": 100, "centers": 100 }}
1819
]
1920
}
2021
},
@@ -29,3 +30,4 @@
2930
}
3031
}
3132
}
33+

‎configs/spmd/large_scale/kmeans.json renamed to ‎configs/spmd/large_scale/kmeans_wide_weak.json

+10-6
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,18 @@
55
"algorithm": {
66
"estimator": "KMeans",
77
"estimator_params": {
8-
"algorithm": "lloyd"
8+
"algorithm": "lloyd",
9+
"max_iter": 20,
10+
"n_clusters": 10,
11+
"random_state": 42
912
},
10-
"estimator_methods": { "training": "fit", "inference": "predict" }
13+
"estimator_methods": { "training": "fit", "inference": "predict" },
14+
"sklearnex_context": { "use_raw_input": true }
1115
}
12-
},
13-
"synthetic data": {
16+
},
17+
"synthetic data": {
1418
"data": [
15-
{ "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } },
16-
{ "source": "make_blobs", "generation_kwargs": { "n_samples": 30000, "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }
19+
{ "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 1000, "centers": 2000}}
1720
]
1821
}
1922
},
@@ -28,3 +31,4 @@
2831
}
2932
}
3033
}
34+

‎configs/spmd/large_scale/large_scale.json

+9
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,15 @@
2727
"mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" }
2828
}
2929
},
30+
"large scale 2k parameters sample shift": {
31+
"data": {
32+
"dtype": "float64",
33+
"distributed_split": "sample_shift"
34+
},
35+
"bench": {
36+
"mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" }
37+
}
38+
},
3039
"large scale 32 parameters": {
3140
"data": {
3241
"dtype": "float64",

‎sklbench/benchmarks/sklearn_estimator.py

-13
Original file line numberDiff line numberDiff line change
@@ -191,19 +191,6 @@ def get_subset_metrics_of_estimator(
191191
}
192192
)
193193
elif task == "clustering":
194-
if hasattr(estimator_instance, "inertia_"):
195-
# compute inertia manually using distances to cluster centers
196-
# provided by KMeans.transform
197-
metrics.update(
198-
{
199-
"inertia": float(
200-
np.power(
201-
convert_to_numpy(estimator_instance.transform(x)).min(axis=1),
202-
2,
203-
).sum()
204-
)
205-
}
206-
)
207194
if hasattr(estimator_instance, "predict"):
208195
y_pred = convert_to_numpy(estimator_instance.predict(x))
209196
metrics.update(

‎sklbench/datasets/__init__.py

+5
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,11 @@ def load_data(bench_case: BenchCase) -> Tuple[Dict, Dict]:
6767
generation_kwargs = get_bench_case_value(
6868
bench_case, "data:generation_kwargs", dict()
6969
)
70+
if "center_box" in generation_kwargs:
71+
generation_kwargs["center_box"] = (
72+
-1 * generation_kwargs["center_box"],
73+
generation_kwargs["center_box"],
74+
)
7075
return load_sklearn_synthetic_data(
7176
function_name=source,
7277
input_kwargs=generation_kwargs,

‎sklbench/datasets/transformer.py

+14-1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
# limitations under the License.
1515
# ===============================================================================
1616

17+
import math
1718
import os
1819

1920
import numpy as np
@@ -113,7 +114,19 @@ def split_and_transform_data(bench_case, data, data_description):
113114
# "KNeighbors" in get_bench_case_value(bench_case, "algorithm:estimator", "")
114115
# and int(get_bench_case_value(bench_case, "bench:mpi_params:n", 1)) > 1
115116
# )
116-
if distributed_split == "rank_based":
117+
if distributed_split == "sample_shift":
118+
from mpi4py import MPI
119+
120+
rank = MPI.COMM_WORLD.Get_rank()
121+
# This approach was chosen to shift the distribution of synthetic data on each rank
122+
# for KMeans weak scaling tests. When testing with a large number of tiles, this method avoids duplication of data on each rank.
123+
# For example, if there are 24,576 tiles being used, each data point in the 24,576th tile would be multiplied by 1.47.
124+
# The factor 0.003 was chosen arbitrarily and can be fine-tuned for other datasets and algorithms if needed.
125+
adjust_number = (math.sqrt(rank) * 0.003) + 1
126+
x_test = x_test * adjust_number
127+
x_train = x_train * adjust_number
128+
129+
elif distributed_split == "rank_based":
117130
from mpi4py import MPI
118131

119132
comm = MPI.COMM_WORLD

0 commit comments

Comments (0)
Please sign in to comment.