initial alignment of configs to final results

ethanglaser · ethanglaser · commit fc290111b079 · 2025-03-18T17:30:58.000-07:00
diff --git a/configs/spmd/large_scale/basic_stats.json b/configs/spmd/large_scale/basic_stats.json
@@ -4,16 +4,17 @@
         "spmd basicstats parameters": {
             "algorithm": {
                 "estimator": "BasicStatistics",
-                "estimator_methods": { "training": "fit" }
+                "estimator_methods": { "training": "fit" },
+                "sklearnex_context": { "use_raw_input": true }
             },
-	    "data": {
-		"split_kwargs": { "test_size": 0.0001 }
-	    }
+            "data": {
+                "split_kwargs": { "test_size": 0.0001 }
+            }
         },
         "synthetic data": {
             "data": [
-                { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000,  "n_features": 10, "centers": 1 } },
-		{ "source": "make_blobs", "generation_kwargs": { "n_samples": 100000,  "n_features": 1000, "centers": 1 } }
+                { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000000,  "n_features": 10, "centers": 1 } },
+                { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000,  "n_features": 1000, "centers": 1 } }
             ]
         }
     },
@@ -22,7 +23,7 @@
             "SETS": [
                 "sklearnex spmd implementation",
                 "large scale 2k parameters",
-		"synthetic data",
+                "synthetic data",
                 "spmd basicstats parameters"
             ]
         }
diff --git a/configs/spmd/large_scale/basic_stats_strong.json b/configs/spmd/large_scale/basic_stats_strong.json
@@ -4,11 +4,12 @@
         "spmd basicstats parameters": {
             "algorithm": {
                 "estimator": "BasicStatistics",
-                "estimator_methods": { "training": "fit" }
+                "estimator_methods": { "training": "fit" },
+                "sklearnex_context": { "use_raw_input": true }
             },
-	    "data": {
-		"split_kwargs": { "test_size": 0.0001 }
-	    }
+            "data": {
+                "split_kwargs": { "test_size": 0.0001 }
+            }
         },
         "synthetic data": {
             "data": [
@@ -20,8 +21,8 @@
         "basicstats": {
             "SETS": [
                 "sklearnex spmd implementation",
-                "large scale strong 2k parameters",
-		"synthetic data",
+                "large scale strong <=64 parameters",
+                "synthetic data",
                 "spmd basicstats parameters"
             ]
         }
diff --git a/configs/spmd/large_scale/covariance.json b/configs/spmd/large_scale/covariance.json
@@ -4,16 +4,17 @@
         "spmd basicstats parameters": {
             "algorithm": {
                 "estimator": "EmpiricalCovariance",
-                "estimator_methods": { "training": "fit" }
+                "estimator_methods": { "training": "fit" },
+                "sklearnex_context": { "use_raw_input": true }
             },
             "data": {
                 "split_kwargs": { "test_size": 0.0001 }
             }
         },
         "synthetic data": {
             "data": [
-                { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000,  "n_features": 10, "centers": 1 } },
-                { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000,  "n_features": 1000, "centers": 1 } }
+                { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000000,  "n_features": 10, "centers": 1 } },
+                { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000,  "n_features": 1000, "centers": 1 } }
             ]
         }
     },
@@ -22,7 +23,7 @@
             "SETS": [
                 "sklearnex spmd implementation",
                 "large scale 2k parameters",
-		"synthetic data",
+                "synthetic data",
                 "spmd basicstats parameters"
             ]
         }
diff --git a/configs/spmd/large_scale/covariance_strong.json b/configs/spmd/large_scale/covariance_strong.json
@@ -4,7 +4,8 @@
         "spmd basicstats parameters": {
             "algorithm": {
                 "estimator": "EmpiricalCovariance",
-                "estimator_methods": { "training": "fit" }
+                "estimator_methods": { "training": "fit" },
+                "sklearnex_context": { "use_raw_input": true }
             },
             "data": {
                 "split_kwargs": { "test_size": 0.0001 }
@@ -20,8 +21,8 @@
         "covariance": {
             "SETS": [
                 "sklearnex spmd implementation",
-                "large scale strong 2k parameters",
-		"synthetic data",
+                "large scale strong <=64 parameters",
+                "synthetic data",
                 "spmd basicstats parameters"
             ]
         }
diff --git a/configs/spmd/large_scale/dbscan.json b/configs/spmd/large_scale/dbscan.json
@@ -9,15 +9,16 @@
 		},
 		"estimator_params" : {
 			"eps": 10, "min_samples": 5
-		}
+		},
+		"sklearnex_context": { "use_raw_input": true }
 	    },
 	    "data": {
 		"dtype": "float64"
 	    }
 	},
 	"synthetic dataset": {
             "data": [
-                { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000,  "n_features": 100, "centers": 10 } }
+                { "source": "make_blobs", "generation_kwargs": { "n_samples": 40000,  "n_features": 100, "centers": 10 } }
             ]
 	}
     },
diff --git a/configs/spmd/large_scale/dbscan_strong.json b/configs/spmd/large_scale/dbscan_strong.json
@@ -3,21 +3,22 @@
     "PARAMETERS_SETS": {
         "spmd dbscan parameters": {
 	    "algorithm": {
-		"estimator": "DBSCAN",
-		"estimator_methods": {
-		    "training": "fit"
+                "estimator": "DBSCAN",
+                "estimator_methods": {
+                    "training": "fit"
 		},
                 "estimator_params" : {
-                        "eps": 10, "min_samples": 5
-                }
+                        "eps": 15, "min_samples": 50
+                },
+                "sklearnex_context": { "use_raw_input": true }
 	    },
 	    "data": {
 		"dtype": "float64"
 	    }
 	},
 	"synthetic dataset": {
             "data": [
-                { "source": "make_blobs", "generation_kwargs": { "n_samples": 500000,  "n_features": 100, "centers": 10 } }
+                { "source": "make_blobs", "generation_kwargs": { "n_samples": 4000000,  "n_features": 100, "centers": 10 } }
             ]
 	}
     },
@@ -27,7 +28,7 @@
                 "common dbscan parameters",
                 "synthetic dataset",
                 "sklearnex spmd implementation",
-		"large scale strong <=64 parameters",
+                "large scale strong <=64 parameters",
                 "spmd dbscan parameters"
             ]
         }
diff --git a/configs/spmd/large_scale/forest_max_samples.json b/configs/spmd/large_scale/forest_max_samples.json
@@ -0,0 +1,28 @@
+{
+    "INCLUDE": ["../../common/sklearn.json", "large_scale.json"],
+    "PARAMETERS_SETS": {
+        "spmd forest classification parameters": {
+            "algorithm": {
+                "estimator": "RandomForestClassifier",
+                "estimator_methods": { "training": "fit" },
+                "estimator_params": { "n_estimators": 20, "max_depth": 10 },
+                "sklearnex_context": { "use_raw_input": true }
+            }
+        },
+        "synthetic data": {
+            "data": [
+                { "source": "make_classification", "split_kwargs": { "train_size": 1000000, "test_size": 1000 },    "generation_kwargs": {  "n_samples": 1001000, "n_features": 100, "n_classes": 2 } }
+            ]
+        }
+    },
+    "TEMPLATES": {
+        "forestCls": {
+            "SETS": [
+                "sklearnex spmd implementation",
+                "large scale 32 parameters",
+                "synthetic data",
+                "spmd forest classification parameters"
+            ]
+        }
+    }
+}
diff --git a/configs/spmd/large_scale/forest_no_max_samples.json b/configs/spmd/large_scale/forest_no_max_samples.json
@@ -4,14 +4,13 @@
         "spmd forest classification parameters": {
             "algorithm": {
                 "estimator": "RandomForestClassifier",
-		"estimator_methods": { "training": "fit" },
-		"estimator_params": { "n_estimators": 20, "max_depth": 4 }
+                "estimator_params": { "n_estimators": 100, "max_depth": 7 },
+                "sklearnex_context": { "use_raw_input": true }
             }
         },
         "synthetic data": {
             "data": [
-                { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000 },    "generation_kwargs": {  "n_samples": 501000, "n_features": 10, "n_classes": 2 } },
-                { "source": "make_classification", "split_kwargs": { "train_size": 10000, "test_size": 1000 },    "generation_kwargs": {  "n_samples": 11000, "n_features": 1000, "n_classes": 2 } }
+                { "source": "make_classification", "split_kwargs": { "train_size": 1000000, "test_size": 1000 },    "generation_kwargs": {  "n_samples": 1001000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }
             ]
         }
     },
@@ -20,7 +19,7 @@
             "SETS": [
                 "sklearnex spmd implementation",
                 "large scale 2k parameters",
-		"synthetic data",
+                "synthetic data",
                 "spmd forest classification parameters"
             ]
         }
diff --git a/configs/spmd/large_scale/forest_strong.json b/configs/spmd/large_scale/forest_strong.json
@@ -4,22 +4,23 @@
         "spmd forest classification parameters": {
             "algorithm": {
                 "estimator": "RandomForestClassifier",
-		"estimator_methods": { "training": "fit" },
-		"estimator_params": { "n_estimators": 20, "max_depth": 4 }
+                "estimator_methods": { "training": "fit" },
+                "estimator_params": { "n_estimators": 100, "max_depth": 8 },
+                "sklearnex_context": {"use_raw_input": true}
             }
         },
         "synthetic data": {
             "data": [
-                { "source": "make_classification", "split_kwargs": { "train_size": 10000000, "test_size": 1000 },    "generation_kwargs": {  "n_samples": 10001000, "n_features": 100, "n_classes": 2 } }
+                { "source": "make_classification", "split_kwargs": { "train_size": 20000, "test_size": 1000 },    "generation_kwargs": {  "n_samples": 21000, "n_features": 200, "n_classes": 2 } }
             ]
         }
     },
     "TEMPLATES": {
         "forestCls": {
             "SETS": [
                 "sklearnex spmd implementation",
-                "large scale strong 2k parameters",
-		"synthetic data",
+                "large scale strong <=64 parameters",
+                "synthetic data",
                 "spmd forest classification parameters"
             ]
         }
diff --git a/configs/spmd/large_scale/knn_strong.json b/configs/spmd/large_scale/knn_strong.json
@@ -3,23 +3,24 @@
     "PARAMETERS_SETS": {
         "spmd knn cls parameters": {
             "algorithm": {
-		"estimator": "KNeighborsClassifier",
+                "estimator": "KNeighborsClassifier",
                 "estimator_params": {
                     "algorithm": "brute",
                     "metric": "minkowski",
                     "p": 2,
                     "weights": "uniform",
-		    "n_neighbors": 5
+                    "n_neighbors": 100
                 },
-		"estimator_methods": {
-			"training": "fit",
-			"inference": "predict"
-		}
+                "estimator_methods": {
+                    "training": "fit",
+                    "inference": "predict"
+                },
+                "sklearnex_context": { "use_raw_input": true }
 	    }
         },
         "synthetic classification data": {
             "data": [
-		{ "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000000 },   "generation_kwargs": {  "n_samples": 1500000,  "n_features": 100, "n_classes": 2,  "n_informative": "[SPECIAL_VALUE]0.5" } }
+                { "source": "make_classification", "split_kwargs": { "train_size": 3000000, "test_size": 2000000 },   "generation_kwargs": {  "n_samples": 5000000,  "n_features": 100, "n_classes": 2,  "n_informative": "[SPECIAL_VALUE]0.5" } }
             ]
         }	
     },
@@ -28,7 +29,7 @@
             "SETS": [
                 "synthetic classification data",
                 "sklearnex spmd implementation",
-		"large scale strong <=64 parameters",
+                "large scale strong <=64 parameters",
                 "spmd knn cls parameters"
             ]
         }
diff --git a/configs/spmd/large_scale/knn_tier1.json b/configs/spmd/large_scale/knn_tier1.json
@@ -0,0 +1,35 @@
+{
+    "INCLUDE": ["../../common/sklearn.json", "large_scale.json"],
+    "PARAMETERS_SETS": {
+        "spmd knn cls parameters": {
+            "algorithm": {
+                "estimator": "KNeighborsClassifier",
+                "estimator_params": {
+                    "algorithm": "brute",
+                    "metric": "minkowski",
+                    "p": 2,
+                    "weights": "uniform"
+                },
+                "estimator_methods": {
+                    "training": "fit",
+                    "inference": "predict"
+                },
+                "sklearnex_context": { "use_raw_input": true }
+            }
+        },
+        "synthetic classification data": [
+            { "data": { "source": "make_classification", "split_kwargs": { "train_size": 100000, "test_size": 100000},   "generation_kwargs": {  "n_samples": 2000000,  "n_features": 10, "n_classes": 2,  "n_informative": "[SPECIAL_VALUE]0.5" } }, "algorithm": { "estimator_params": { "n_neighbors": 5 } } },
+            { "data": { "source": "make_classification", "split_kwargs": { "train_size": 100000, "test_size": 10000},   "generation_kwargs": {  "n_samples": 2000000,  "n_features": 1000, "n_classes": 2,  "n_informative": "[SPECIAL_VALUE]0.5" } }, "algorithm": { "estimator_params": { "n_neighbors": 100 } } }
+        ]
+    },
+    "TEMPLATES": {
+        "knn classifier": {
+            "SETS": [
+                "synthetic classification data",
+                "sklearnex spmd implementation",
+                "large scale 32 parameters",
+                "spmd knn cls parameters"
+            ]
+        }
+    }
+}
diff --git a/configs/spmd/large_scale/knn_tier2.json b/configs/spmd/large_scale/knn_tier2.json
@@ -3,23 +3,24 @@
     "PARAMETERS_SETS": {
         "spmd knn cls parameters": {
             "algorithm": {
-		"estimator": "KNeighborsClassifier",
+                "estimator": "KNeighborsClassifier",
                 "estimator_params": {
                     "algorithm": "brute",
                     "metric": "minkowski",
                     "p": 2,
                     "weights": "uniform",
-		    "n_neighbors": 5
+                    "n_neighbors": 5
                 },
-		"estimator_methods": {
-			"training": "fit",
-			"inference": "predict"
-		}
-	    }
+                "estimator_methods": {
+                    "training": "fit",
+                    "inference": "predict"
+                },
+                "sklearnex_context": { "use_raw_input": true }
+            }
         },
         "synthetic classification data": {
             "data": [
-		{ "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 5000 },   "generation_kwargs": {  "n_samples": 5005000,  "n_features": 100, "n_classes": 2,  "n_informative": "[SPECIAL_VALUE]0.5" } }
+                { "source": "make_classification", "split_kwargs": { "train_size": 100, "test_size": 100},   "generation_kwargs": {  "n_samples": 200,  "n_features": 10, "n_classes": 2,  "n_informative": "[SPECIAL_VALUE]0.5" } }
             ]
         }	
     },
@@ -28,7 +29,7 @@
             "SETS": [
                 "synthetic classification data",
                 "sklearnex spmd implementation",
-		"large scale 2k parameters",
+                "large scale 2k parameters",
                 "spmd knn cls parameters"
             ]
         }
diff --git a/configs/spmd/large_scale/linreg.json b/configs/spmd/large_scale/linreg.json
@@ -4,13 +4,14 @@
         "spmd linear parameters": {
             "algorithm": {
                 "estimator": "LinearRegression",
-                "estimator_methods": { "training": "fit" }
+                "estimator_methods": { "training": "fit" },
+                "sklearnex_context": { "use_raw_input": true }
             }
         },
         "synthetic data": {
             "data": [
-                { "source": "make_regression", "generation_kwargs": { "n_samples": 30005000,  "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 30000000, "test_size": 5000 } },
-		        { "source": "make_regression", "generation_kwargs": { "n_samples": 305000,  "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 300000, "test_size": 5000 } }
+                { "source": "make_regression", "generation_kwargs": { "n_samples": 100005000,  "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 100000000, "test_size": 5000 } },
+                { "source": "make_regression", "generation_kwargs": { "n_samples": 3005000,  "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 3000000, "test_size": 5000 } }
             ]
         }
     },
@@ -19,7 +20,7 @@
             "SETS": [
                 "sklearnex spmd implementation",
                 "large scale 2k parameters",
-		"synthetic data",
+                "synthetic data",
                 "spmd linear parameters"
             ]
         }
diff --git a/configs/spmd/large_scale/linreg_strong.json b/configs/spmd/large_scale/linreg_strong.json
diff --git a/configs/spmd/large_scale/logreg.json b/configs/spmd/large_scale/logreg.json
diff --git a/configs/spmd/large_scale/logreg_strong.json b/configs/spmd/large_scale/logreg_strong.json
diff --git a/configs/spmd/large_scale/pca.json b/configs/spmd/large_scale/pca.json
diff --git a/configs/spmd/large_scale/pca_strong.json b/configs/spmd/large_scale/pca_strong.json

Original file line number	Diff line number	Diff line change
`@@ -4,16 +4,17 @@`
`4`	`4`	`"spmd basicstats parameters": {`
`5`	`5`	`"algorithm": {`
`6`	`6`	`"estimator": "EmpiricalCovariance",`
`7`		`- "estimator_methods": { "training": "fit" }`
	`7`	`+ "estimator_methods": { "training": "fit" },`
	`8`	`+ "sklearnex_context": { "use_raw_input": true }`
`8`	`9`	`},`
`9`	`10`	`"data": {`
`10`	`11`	`"split_kwargs": { "test_size": 0.0001 }`
`11`	`12`	`}`
`12`	`13`	`},`
`13`	`14`	`"synthetic data": {`
`14`	`15`	`"data": [`
`15`		`- { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } },`
`16`		`- { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } }`
	`16`	`+ { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000000, "n_features": 10, "centers": 1 } },`
	`17`	`+ { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 1000, "centers": 1 } }`
`17`	`18`	`]`
`18`	`19`	`}`
`19`	`20`	`},`
`@@ -22,7 +23,7 @@`
`22`	`23`	`"SETS": [`
`23`	`24`	`"sklearnex spmd implementation",`
`24`	`25`	`"large scale 2k parameters",`
`25`		`- "synthetic data",`
	`26`	`+ "synthetic data",`
`26`	`27`	`"spmd basicstats parameters"`
`27`	`28`	`]`
`28`	`29`	`}`
Original file line number	Diff line number	Diff line change
`@@ -9,15 +9,16 @@`
`9`	`9`	`},`
`10`	`10`	`"estimator_params" : {`
`11`	`11`	`"eps": 10, "min_samples": 5`
`12`		`- }`
	`12`	`+ },`
	`13`	`+ "sklearnex_context": { "use_raw_input": true }`
`13`	`14`	`},`
`14`	`15`	`"data": {`
`15`	`16`	`"dtype": "float64"`
`16`	`17`	`}`
`17`	`18`	`},`
`18`	`19`	`"synthetic dataset": {`
`19`	`20`	`"data": [`
`20`		`- { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 100, "centers": 10 } }`
	`21`	`+ { "source": "make_blobs", "generation_kwargs": { "n_samples": 40000, "n_features": 100, "centers": 10 } }`
`21`	`22`	`]`
`22`	`23`	`}`
`23`	`24`	`},`
Original file line number	Diff line number	Diff line change
`@@ -4,14 +4,13 @@`
`4`	`4`	`"spmd forest classification parameters": {`
`5`	`5`	`"algorithm": {`
`6`	`6`	`"estimator": "RandomForestClassifier",`
`7`		`- "estimator_methods": { "training": "fit" },`
`8`		`- "estimator_params": { "n_estimators": 20, "max_depth": 4 }`
	`7`	`+ "estimator_params": { "n_estimators": 100, "max_depth": 7 },`
	`8`	`+ "sklearnex_context": { "use_raw_input": true }`
`9`	`9`	`}`
`10`	`10`	`},`
`11`	`11`	`"synthetic data": {`
`12`	`12`	`"data": [`
`13`		`- { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 501000, "n_features": 10, "n_classes": 2 } },`
`14`		`- { "source": "make_classification", "split_kwargs": { "train_size": 10000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 11000, "n_features": 1000, "n_classes": 2 } }`
	`13`	`+ { "source": "make_classification", "split_kwargs": { "train_size": 1000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 1001000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }`
`15`	`14`	`]`
`16`	`15`	`}`
`17`	`16`	`},`
`@@ -20,7 +19,7 @@`
`20`	`19`	`"SETS": [`
`21`	`20`	`"sklearnex spmd implementation",`
`22`	`21`	`"large scale 2k parameters",`
`23`		`- "synthetic data",`
	`22`	`+ "synthetic data",`
`24`	`23`	`"spmd forest classification parameters"`
`25`	`24`	`]`
`26`	`25`	`}`
Original file line number	Diff line number	Diff line change
`@@ -4,22 +4,23 @@`
`4`	`4`	`"spmd forest classification parameters": {`
`5`	`5`	`"algorithm": {`
`6`	`6`	`"estimator": "RandomForestClassifier",`
`7`		`- "estimator_methods": { "training": "fit" },`
`8`		`- "estimator_params": { "n_estimators": 20, "max_depth": 4 }`
	`7`	`+ "estimator_methods": { "training": "fit" },`
	`8`	`+ "estimator_params": { "n_estimators": 100, "max_depth": 8 },`
	`9`	`+ "sklearnex_context": {"use_raw_input": true}`
`9`	`10`	`}`
`10`	`11`	`},`
`11`	`12`	`"synthetic data": {`
`12`	`13`	`"data": [`
`13`		`- { "source": "make_classification", "split_kwargs": { "train_size": 10000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 10001000, "n_features": 100, "n_classes": 2 } }`
	`14`	`+ { "source": "make_classification", "split_kwargs": { "train_size": 20000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 21000, "n_features": 200, "n_classes": 2 } }`
`14`	`15`	`]`
`15`	`16`	`}`
`16`	`17`	`},`
`17`	`18`	`"TEMPLATES": {`
`18`	`19`	`"forestCls": {`
`19`	`20`	`"SETS": [`
`20`	`21`	`"sklearnex spmd implementation",`
`21`		`- "large scale strong 2k parameters",`
`22`		`- "synthetic data",`
	`22`	`+ "large scale strong <=64 parameters",`
	`23`	`+ "synthetic data",`
`23`	`24`	`"spmd forest classification parameters"`
`24`	`25`	`]`
`25`	`26`	`}`