diff --git a/configs/incremental.json b/configs/incremental.json deleted file mode 100644 index e1f589a4..00000000 --- a/configs/incremental.json +++ /dev/null @@ -1,100 +0,0 @@ -{ "INCLUDE": ["./common/sklearn.json"], - "PARAMETERS_SETS": { - "common": {"bench": {"n_runs": 10, "time_limit": 60}}, - "covariance data": { - "data": [ - { - "source": "make_blobs", - "generation_kwargs": { - "centers": 1, - "n_samples": 12000000, - "n_features": [10, 100] - }, - "split_kwargs": {"ignore": true} - } - ] - }, - "basic_statistics data": { - "data": { - "source": "make_blobs", - "generation_kwargs": { - "centers": 1, - "n_samples": 12000000, - "n_features": [10, 100] - }, - "split_kwargs": {"ignore": true} - } - }, - "linear_regression data": { - "data": { - "source": "make_regression", - "split_kwargs": {"train_size": 0.2, "test_size": 0.8}, - "generation_kwargs": { - "n_samples": 12000000, - "n_features": [10, 100], - "n_informative": 5, - "noise": 2.0 - } - } - }, - "pca data": { - "data": { - "source": "make_blobs", - "generation_kwargs": { - "centers": 1, - "n_samples": 12000000, - "n_features": [10, 100] - }, - "split_kwargs": {"ignore": true} - } - }, - "covariance": { - "algorithm": [ - { - "estimator": "IncrementalEmpiricalCovariance", - "library": "sklearnex.covariance", - "estimator_methods": {"training": "partial_fit"}, - "num_batches": {"training": 12} - } - ] - }, - "basic_statistics": { - "algorithm": [ - { - "estimator": "IncrementalBasicStatistics", - "library": "sklearnex.basic_statistics", - "estimator_methods": {"training": "partial_fit"}, - "num_batches": {"training": 12} - } - ] - }, - "linear_regression": { - "algorithm": [ - { - "estimator": "IncrementalLinearRegression", - "library": "sklearnex.linear_model", - "estimator_methods": {"training": "partial_fit"}, - "num_batches": {"training": 12} - } - ] - }, - "pca": { - "algorithm": [ - { - "estimator": "IncrementalPCA", - "library": "sklearnex.preview.decomposition", - "estimator_methods": {"training": "partial_fit"}, - "num_batches": {"training": 12} - } - ] - } - }, - "TEMPLATES": { - "basic_statistics": {"SETS": ["common", "basic_statistics", "basic_statistics data", "sklearn-ex[gpu] implementations"]}, - "covariance": {"SETS": ["common", "covariance", "covariance data", "sklearn-ex[gpu] implementations"]}, - "linear_regression": { - "SETS": ["common", "linear_regression", "linear_regression data", "sklearn-ex[gpu] implementations"] - }, - "pca": {"SETS": ["common", "pca", "pca data", "sklearn-ex[gpu] implementations"]} - } -} diff --git a/configs/spmd/large_scale/basic_stats.json b/configs/spmd/large_scale/basic_stats.json index d6c2c4d2..f8f44e4e 100644 --- a/configs/spmd/large_scale/basic_stats.json +++ b/configs/spmd/large_scale/basic_stats.json @@ -4,16 +4,17 @@ "spmd basicstats parameters": { "algorithm": { "estimator": "BasicStatistics", - "estimator_methods": { "training": "fit" } + "estimator_methods": { "training": "fit" }, + "sklearnex_context": { "use_raw_input": true } }, - "data": { - "split_kwargs": { "test_size": 0.0001 } - } + "data": { + "split_kwargs": { "test_size": 0.0001 } + } }, "synthetic data": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, - { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000000, "n_features": 10, "centers": 1 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 1000, "centers": 1 } } ] } }, @@ -22,7 +23,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale 2k parameters", - "synthetic data", + "synthetic data", "spmd basicstats parameters" ] } diff --git a/configs/spmd/large_scale/basic_stats_strong.json b/configs/spmd/large_scale/basic_stats_strong.json index b5b0ef69..0c7c671e 100644 --- a/configs/spmd/large_scale/basic_stats_strong.json +++ b/configs/spmd/large_scale/basic_stats_strong.json @@ -4,11 +4,12 @@ "spmd basicstats parameters": { "algorithm": { "estimator": "BasicStatistics", - "estimator_methods": { "training": "fit" } + "estimator_methods": { "training": "fit" }, + "sklearnex_context": { "use_raw_input": true } }, - "data": { - "split_kwargs": { "test_size": 0.0001 } - } + "data": { + "split_kwargs": { "test_size": 0.0001 } + } }, "synthetic data": { "data": [ @@ -20,8 +21,8 @@ "basicstats": { "SETS": [ "sklearnex spmd implementation", - "large scale strong 2k parameters", - "synthetic data", + "large scale strong <=64 parameters", + "synthetic data", "spmd basicstats parameters" ] } diff --git a/configs/spmd/large_scale/covariance.json b/configs/spmd/large_scale/covariance.json index 20da8d15..7f4d6d7d 100644 --- a/configs/spmd/large_scale/covariance.json +++ b/configs/spmd/large_scale/covariance.json @@ -4,7 +4,8 @@ "spmd basicstats parameters": { "algorithm": { "estimator": "EmpiricalCovariance", - "estimator_methods": { "training": "fit" } + "estimator_methods": { "training": "fit" }, + "sklearnex_context": { "use_raw_input": true } }, "data": { "split_kwargs": { "test_size": 0.0001 } @@ -12,8 +13,8 @@ }, "synthetic data": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, - { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000000, "n_features": 10, "centers": 1 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 1000, "centers": 1 } } ] } }, @@ -22,7 +23,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale 2k parameters", - "synthetic data", + "synthetic data", "spmd basicstats parameters" ] } diff --git a/configs/spmd/large_scale/covariance_strong.json b/configs/spmd/large_scale/covariance_strong.json index b8424d92..8e388801 100644 --- a/configs/spmd/large_scale/covariance_strong.json +++ b/configs/spmd/large_scale/covariance_strong.json @@ -4,7 +4,8 @@ "spmd basicstats parameters": { "algorithm": { "estimator": "EmpiricalCovariance", - "estimator_methods": { "training": "fit" } + "estimator_methods": { "training": "fit" }, + "sklearnex_context": { "use_raw_input": true } }, "data": { "split_kwargs": { "test_size": 0.0001 } @@ -20,8 +21,8 @@ "covariance": { "SETS": [ "sklearnex spmd implementation", - "large scale strong 2k parameters", - "synthetic data", + "large scale strong <=64 parameters", + "synthetic data", "spmd basicstats parameters" ] } diff --git a/configs/spmd/large_scale/dbscan.json b/configs/spmd/large_scale/dbscan.json index e4996c9e..bf60b7cc 100644 --- a/configs/spmd/large_scale/dbscan.json +++ b/configs/spmd/large_scale/dbscan.json @@ -9,7 +9,8 @@ }, "estimator_params" : { "eps": 10, "min_samples": 5 - } + }, + "sklearnex_context": { "use_raw_input": true } }, "data": { "dtype": "float64" @@ -17,7 +18,7 @@ }, "synthetic dataset": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 100, "centers": 10 } } + { "source": "make_blobs", "generation_kwargs": { "n_samples": 40000, "n_features": 100, "centers": 10 } } ] } }, diff --git a/configs/spmd/large_scale/dbscan_strong.json b/configs/spmd/large_scale/dbscan_strong.json index 04fb9016..5e7ab322 100644 --- a/configs/spmd/large_scale/dbscan_strong.json +++ b/configs/spmd/large_scale/dbscan_strong.json @@ -3,13 +3,14 @@ "PARAMETERS_SETS": { "spmd dbscan parameters": { "algorithm": { - "estimator": "DBSCAN", - "estimator_methods": { - "training": "fit" + "estimator": "DBSCAN", + "estimator_methods": { + "training": "fit" }, "estimator_params" : { - "eps": 10, "min_samples": 5 - } + "eps": 15, "min_samples": 50 + }, + "sklearnex_context": { "use_raw_input": true } }, "data": { "dtype": "float64" @@ -17,7 +18,7 @@ }, "synthetic dataset": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 500000, "n_features": 100, "centers": 10 } } + { "source": "make_blobs", "generation_kwargs": { "n_samples": 4000000, "n_features": 100, "centers": 10 } } ] } }, @@ -27,7 +28,7 @@ "common dbscan parameters", "synthetic dataset", "sklearnex spmd implementation", - "large scale strong <=64 parameters", + "large scale strong <=64 parameters", "spmd dbscan parameters" ] } diff --git a/configs/spmd/large_scale/forest_max_samples.json b/configs/spmd/large_scale/forest_max_samples.json new file mode 100644 index 00000000..95affb16 --- /dev/null +++ b/configs/spmd/large_scale/forest_max_samples.json @@ -0,0 +1,28 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd forest classification parameters": { + "algorithm": { + "estimator": "RandomForestClassifier", + "estimator_methods": { "training": "fit" }, + "estimator_params": { "n_estimators": 20, "max_depth": 10 }, + "sklearnex_context": { "use_raw_input": true } + } + }, + "synthetic data": { + "data": [ + { "source": "make_classification", "split_kwargs": { "train_size": 1000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 1001000, "n_features": 100, "n_classes": 2 } } + ] + } + }, + "TEMPLATES": { + "forestCls": { + "SETS": [ + "sklearnex spmd implementation", + "large scale 32 parameters", + "synthetic data", + "spmd forest classification parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/forest.json b/configs/spmd/large_scale/forest_no_max_samples.json similarity index 58% rename from configs/spmd/large_scale/forest.json rename to configs/spmd/large_scale/forest_no_max_samples.json index b4402442..c371371b 100644 --- a/configs/spmd/large_scale/forest.json +++ b/configs/spmd/large_scale/forest_no_max_samples.json @@ -4,14 +4,13 @@ "spmd forest classification parameters": { "algorithm": { "estimator": "RandomForestClassifier", - "estimator_methods": { "training": "fit" }, - "estimator_params": { "n_estimators": 20, "max_depth": 4 } + "estimator_params": { "n_estimators": 100, "max_depth": 7 }, + "sklearnex_context": { "use_raw_input": true } } }, "synthetic data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 501000, "n_features": 10, "n_classes": 2 } }, - { "source": "make_classification", "split_kwargs": { "train_size": 10000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 11000, "n_features": 1000, "n_classes": 2 } } + { "source": "make_classification", "split_kwargs": { "train_size": 1000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 1001000, "n_features": 100, "n_classes": 2 }, "n_informative": "[SPECIAL_VALUE]0.5" } ] } }, @@ -20,7 +19,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale 2k parameters", - "synthetic data", + "synthetic data", "spmd forest classification parameters" ] } diff --git a/configs/spmd/large_scale/forest_strong.json b/configs/spmd/large_scale/forest_strong.json index 23b982f5..653c70dc 100644 --- a/configs/spmd/large_scale/forest_strong.json +++ b/configs/spmd/large_scale/forest_strong.json @@ -4,13 +4,14 @@ "spmd forest classification parameters": { "algorithm": { "estimator": "RandomForestClassifier", - "estimator_methods": { "training": "fit" }, - "estimator_params": { "n_estimators": 20, "max_depth": 4 } + "estimator_methods": { "training": "fit" }, + "estimator_params": { "n_estimators": 100, "max_depth": 8 }, + "sklearnex_context": {"use_raw_input": true} } }, "synthetic data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 10000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 10001000, "n_features": 100, "n_classes": 2 } } + { "source": "make_classification", "split_kwargs": { "train_size": 20000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 21000, "n_features": 200, "n_classes": 2 } } ] } }, @@ -18,8 +19,8 @@ "forestCls": { "SETS": [ "sklearnex spmd implementation", - "large scale strong 2k parameters", - "synthetic data", + "large scale strong <=64 parameters", + "synthetic data", "spmd forest classification parameters" ] } diff --git a/configs/spmd/large_scale/incremental.json b/configs/spmd/large_scale/incremental.json new file mode 100644 index 00000000..195074ee --- /dev/null +++ b/configs/spmd/large_scale/incremental.json @@ -0,0 +1,77 @@ +{ "INCLUDE": [ ], + "PARAMETERS_SETS": { + "common incremental raw gpu params": { + "algorithm": { + "device": "gpu", + "sklearnex_context": { "use_raw_input": true } + }, + "data": { + "format":"dpctl", + "order": "C" + } + }, + "statistical batches and data": [ + { "algorithm": { "num_batches": { "training": [1,2,6,12]} }, "data": { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000000, "n_features": 10, "centers": 1 } } }, + { "algorithm": { "num_batches": { "training": 2} }, "data": { "source": "make_blobs", "generation_kwargs": { "n_samples": 50000000, "n_features": 10, "centers": 1 } } }, + { "algorithm": { "num_batches": { "training": 6} }, "data": { "source": "make_blobs", "generation_kwargs": { "n_samples": 16666667, "n_features": 10, "centers": 1 } } }, + { "algorithm": { "num_batches": { "training": 12} }, "data": { "source": "make_blobs", "generation_kwargs": { "n_samples": 8333333, "n_features": 10, "centers": 1 } } }, + { "algorithm": { "num_batches": { "training": [1,2,6,12]} }, "data": { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 1000, "centers": 1 } } }, + { "algorithm": { "num_batches": { "training": 2} }, "data": { "source": "make_blobs", "generation_kwargs": { "n_samples": 500000, "n_features": 1000, "centers": 1 } } }, + { "algorithm": { "num_batches": { "training": 6} }, "data": { "source": "make_blobs", "generation_kwargs": { "n_samples": 166667, "n_features": 1000, "centers": 1 } } }, + { "algorithm": { "num_batches": { "training": 12} }, "data": { "source": "make_blobs", "generation_kwargs": { "n_samples": 83333, "n_features": 1000, "centers": 1 } } } + ], + "regression batches and data": [ + { "algorithm": { "num_batches": { "training": [1,2,6,12]} }, "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 100005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 100000000, "test_size": 5000 } } }, + { "algorithm": { "num_batches": { "training": 2} }, "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 100005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 50000000, "test_size": 5000 } } }, + { "algorithm": { "num_batches": { "training": 6} }, "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 100005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 16666667, "test_size": 5000 } } }, + { "algorithm": { "num_batches": { "training": 12} }, "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 100005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 8333333, "test_size": 5000 } } }, + { "algorithm": { "num_batches": { "training": [1,2,6,12]} }, "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 3005000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 3000000, "test_size": 5000 } } }, + { "algorithm": { "num_batches": { "training": 2} }, "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 3005000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 1500000, "test_size": 5000 } } }, + { "algorithm": { "num_batches": { "training": 6} }, "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 3005000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 500000, "test_size": 5000 } } }, + { "algorithm": { "num_batches": { "training": 12} }, "data": { "source": "make_regression", "generation_kwargs": { "n_samples": 3005000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 250000, "test_size": 5000 } } } + ], + "covariance": { + "algorithm": { + "estimator": "IncrementalEmpiricalCovariance", + "library": "sklearnex", + "estimator_methods": {"training": "partial_fit"} + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "basic_statistics": { + "algorithm": { + "estimator": "IncrementalBasicStatistics", + "library": "sklearnex", + "estimator_methods": {"training": "partial_fit"} + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + }, + "linear_regression": { + "algorithm": { + "estimator": "IncrementalLinearRegression", + "library": "sklearnex", + "estimator_methods": {"training": "partial_fit"} + } + }, + "pca": { + "algorithm": { + "estimator": "IncrementalPCA", + "library": "sklearnex.preview", + "estimator_methods": {"training": "partial_fit"} + }, + "data": { + "split_kwargs": { "test_size": 0.0001 } + } + } + }, + "TEMPLATES": { + "basic_statistics": { "SETS": ["common incremental raw gpu params", "basic_statistics", "statistical batches and data"] }, + "covariance": { "SETS": ["common incremental raw gpu params", "covariance", "statistical batches and data"] }, + "linear_regression": { "SETS": ["common incremental raw gpu params", "linear_regression", "regression batches and data"] }, + "pca": { "SETS": ["common incremental raw gpu params", "pca", "statistical batches and data"] } + } +} diff --git a/configs/spmd/large_scale/incremental/basic_stats.json b/configs/spmd/large_scale/incremental/basic_stats.json deleted file mode 100644 index ca9e3eb9..00000000 --- a/configs/spmd/large_scale/incremental/basic_stats.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "INCLUDE": ["../../../common/sklearn.json", "../../../spmd/stats_covariance.json", "../large_scale.json"], - "PARAMETERS_SETS": { - "spmd basicstats parameters": { - "algorithm": { - "estimator": "IncrementalBasicStatistics", - "estimator_methods": { "training": "fit" }, - "num_batches": {"training": 10} - }, - "data": { - "split_kwargs": { "test_size": 0.0001 } - } - }, - "synthetic data": { - "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } - ] - } - }, - "TEMPLATES": { - "basicstats": { - "SETS": [ - "sklearnex spmd implementation", - "large scale 32 parameters", - "synthetic data", - "spmd basicstats parameters" - ] - } - } -} diff --git a/configs/spmd/large_scale/incremental/covariance.json b/configs/spmd/large_scale/incremental/covariance.json deleted file mode 100644 index 04fcd76b..00000000 --- a/configs/spmd/large_scale/incremental/covariance.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "INCLUDE": ["../../../common/sklearn.json", "../../../spmd/stats_covariance.json", "../large_scale.json"], - "PARAMETERS_SETS": { - "spmd covariance parameters": { - "algorithm": { - "estimator": "IncrementalEmpiricalCovariance", - "estimator_methods": { "training": "fit" }, - "num_batches": {"training": 10} - }, - "data": { - "split_kwargs": { "test_size": 0.0001 } - } - }, - "synthetic data": { - "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } - ] - } - }, - "TEMPLATES": { - "covariance": { - "SETS": [ - "sklearnex spmd implementation", - "large scale 32 parameters", - "synthetic data", - "spmd covariance parameters" - ] - } - } -} diff --git a/configs/spmd/large_scale/incremental/linear_model.json b/configs/spmd/large_scale/incremental/linear_model.json deleted file mode 100644 index a483f613..00000000 --- a/configs/spmd/large_scale/incremental/linear_model.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "INCLUDE": ["../../../common/sklearn.json", "../../../regular/linear_model.json", "../large_scale.json"], - "PARAMETERS_SETS": { - "spmd linear parameters": { - "algorithm": { - "estimator": "IncrementalLinearRegression", - "estimator_methods": { "training": "fit" }, - "num_batches": {"training": 10} - } - }, - "synthetic data": { - "data": [ - { "source": "make_regression", "generation_kwargs": { "n_samples": 305000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 300000, "test_size": 5000 } } - ] - } - }, - "TEMPLATES": { - "linreg": { - "SETS": [ - "sklearnex spmd implementation", - "large scale 32 parameters", - "synthetic data", - "spmd linear parameters" - ] - } - } -} diff --git a/configs/spmd/large_scale/incremental/pca.json b/configs/spmd/large_scale/incremental/pca.json deleted file mode 100644 index 11fa5125..00000000 --- a/configs/spmd/large_scale/incremental/pca.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "INCLUDE": ["../../../common/sklearn.json", "../../../regular/pca.json", "../large_scale.json"], - "PARAMETERS_SETS": { - "spmd pca parameters": { - "algorithm": { - "estimator": "IncrementalPCA", - "estimator_methods": { "training": "fit", "inference": "" }, - "num_batches": {"training": 10} - }, - "data": { - "split_kwargs": { "test_size": 0.0001 } - } - }, - "synthetic data": { - "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } - ] - } - }, - "TEMPLATES": { - "linreg": { - "SETS": [ - "sklearnex spmd implementation", - "large scale 32 parameters", - "synthetic data", - "spmd pca parameters" - ] - } - } -} diff --git a/configs/spmd/large_scale/knn_strong.json b/configs/spmd/large_scale/knn_strong.json index d202f6e4..36daf3f1 100644 --- a/configs/spmd/large_scale/knn_strong.json +++ b/configs/spmd/large_scale/knn_strong.json @@ -3,23 +3,24 @@ "PARAMETERS_SETS": { "spmd knn cls parameters": { "algorithm": { - "estimator": "KNeighborsClassifier", + "estimator": "KNeighborsClassifier", "estimator_params": { "algorithm": "brute", "metric": "minkowski", "p": 2, "weights": "uniform", - "n_neighbors": 5 + "n_neighbors": 100 }, - "estimator_methods": { - "training": "fit", - "inference": "predict" - } + "estimator_methods": { + "training": "fit", + "inference": "predict" + }, + "sklearnex_context": { "use_raw_input": true } } }, "synthetic classification data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000000 }, "generation_kwargs": { "n_samples": 1500000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + { "source": "make_classification", "split_kwargs": { "train_size": 3000000, "test_size": 2000000 }, "generation_kwargs": { "n_samples": 5000000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } ] } }, @@ -28,7 +29,7 @@ "SETS": [ "synthetic classification data", "sklearnex spmd implementation", - "large scale strong <=64 parameters", + "large scale strong <=64 parameters", "spmd knn cls parameters" ] } diff --git a/configs/spmd/large_scale/knn_tier1.json b/configs/spmd/large_scale/knn_tier1.json new file mode 100644 index 00000000..c230cc4e --- /dev/null +++ b/configs/spmd/large_scale/knn_tier1.json @@ -0,0 +1,35 @@ +{ + "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], + "PARAMETERS_SETS": { + "spmd knn cls parameters": { + "algorithm": { + "estimator": "KNeighborsClassifier", + "estimator_params": { + "algorithm": "brute", + "metric": "minkowski", + "p": 2, + "weights": "uniform" + }, + "estimator_methods": { + "training": "fit", + "inference": "predict" + }, + "sklearnex_context": { "use_raw_input": true } + } + }, + "synthetic classification data": [ + { "data": { "source": "make_classification", "split_kwargs": { "train_size": 100000, "test_size": 100000}, "generation_kwargs": { "n_samples": 2000000, "n_features": 10, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, "algorithm": { "estimator_params": { "n_neighbors": 5 } } }, + { "data": { "source": "make_classification", "split_kwargs": { "train_size": 100000, "test_size": 10000}, "generation_kwargs": { "n_samples": 2000000, "n_features": 1000, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }, "algorithm": { "estimator_params": { "n_neighbors": 100 } } } + ] + }, + "TEMPLATES": { + "knn classifier": { + "SETS": [ + "synthetic classification data", + "sklearnex spmd implementation", + "large scale 32 parameters", + "spmd knn cls parameters" + ] + } + } +} diff --git a/configs/spmd/large_scale/knn.json b/configs/spmd/large_scale/knn_tier2.json similarity index 55% rename from configs/spmd/large_scale/knn.json rename to configs/spmd/large_scale/knn_tier2.json index b68b94af..ff0032e2 100644 --- a/configs/spmd/large_scale/knn.json +++ b/configs/spmd/large_scale/knn_tier2.json @@ -3,23 +3,24 @@ "PARAMETERS_SETS": { "spmd knn cls parameters": { "algorithm": { - "estimator": "KNeighborsClassifier", + "estimator": "KNeighborsClassifier", "estimator_params": { "algorithm": "brute", "metric": "minkowski", "p": 2, "weights": "uniform", - "n_neighbors": 5 + "n_neighbors": 5 }, - "estimator_methods": { - "training": "fit", - "inference": "predict" - } - } + "estimator_methods": { + "training": "fit", + "inference": "predict" + }, + "sklearnex_context": { "use_raw_input": true } + } }, "synthetic classification data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 5000 }, "generation_kwargs": { "n_samples": 5005000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } + { "source": "make_classification", "split_kwargs": { "train_size": 100, "test_size": 100}, "generation_kwargs": { "n_samples": 200, "n_features": 10, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } } ] } }, @@ -28,7 +29,7 @@ "SETS": [ "synthetic classification data", "sklearnex spmd implementation", - "large scale 2k parameters", + "large scale 2k parameters", "spmd knn cls parameters" ] } diff --git a/configs/spmd/large_scale/linreg.json b/configs/spmd/large_scale/linreg.json index ea45a52c..7c7fb035 100644 --- a/configs/spmd/large_scale/linreg.json +++ b/configs/spmd/large_scale/linreg.json @@ -4,13 +4,14 @@ "spmd linear parameters": { "algorithm": { "estimator": "LinearRegression", - "estimator_methods": { "training": "fit" } + "estimator_methods": { "training": "fit" }, + "sklearnex_context": { "use_raw_input": true } } }, "synthetic data": { "data": [ - { "source": "make_regression", "generation_kwargs": { "n_samples": 30005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 30000000, "test_size": 5000 } }, - { "source": "make_regression", "generation_kwargs": { "n_samples": 305000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 300000, "test_size": 5000 } } + { "source": "make_regression", "generation_kwargs": { "n_samples": 100005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 100000000, "test_size": 5000 } }, + { "source": "make_regression", "generation_kwargs": { "n_samples": 3005000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 3000000, "test_size": 5000 } } ] } }, @@ -19,7 +20,7 @@ "SETS": [ "sklearnex spmd implementation", "large scale 2k parameters", - "synthetic data", + "synthetic data", "spmd linear parameters" ] } diff --git a/configs/spmd/large_scale/linreg_strong.json b/configs/spmd/large_scale/linreg_strong.json index 629bf544..ac5a6c7a 100644 --- a/configs/spmd/large_scale/linreg_strong.json +++ b/configs/spmd/large_scale/linreg_strong.json @@ -4,7 +4,8 @@ "spmd linear parameters": { "algorithm": { "estimator": "LinearRegression", - "estimator_methods": { "training": "fit" } + "estimator_methods": { "training": "fit" }, + "sklearnex_context": { "use_raw_input": true } } }, "synthetic data": { @@ -17,8 +18,8 @@ "linreg": { "SETS": [ "sklearnex spmd implementation", - "large scale strong 2k parameters", - "synthetic data", + "large scale strong <=64 parameters", + "synthetic data", "spmd linear parameters" ] } diff --git a/configs/spmd/large_scale/logreg.json b/configs/spmd/large_scale/logreg.json index 326f2580..b7b4b998 100644 --- a/configs/spmd/large_scale/logreg.json +++ b/configs/spmd/large_scale/logreg.json @@ -2,15 +2,16 @@ "INCLUDE": ["../../common/sklearn.json", "../logreg.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd logreg2 parameters": { - "algorithm":{ - "estimator": "LogisticRegression", + "algorithm":{ + "estimator": "LogisticRegression", "estimator_methods": { "inference": "predict" }, - "estimator_params": { "max_iter": 20 } + "estimator_params": { "max_iter": 10 }, + "sklearnex_context": { "use_raw_input": true } } - }, + }, "synthetic data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 10, "n_classes": 2 } }, + { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 10, "n_classes": 2 } }, { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 501000, "n_features": 1000, "n_classes": 2, "n_informative": 40, "n_clusters_per_class": 3, "flip_y": 0.05 } } ] } @@ -21,8 +22,8 @@ "sklearnex spmd implementation", "large scale 2k parameters", "spmd logreg parameters", - "synthetic data", - "spmd logreg2 parameters" + "synthetic data", + "spmd logreg2 parameters" ] } } diff --git a/configs/spmd/large_scale/logreg_strong.json b/configs/spmd/large_scale/logreg_strong.json index 0b79ba9d..219840ea 100644 --- a/configs/spmd/large_scale/logreg_strong.json +++ b/configs/spmd/large_scale/logreg_strong.json @@ -2,16 +2,16 @@ "INCLUDE": ["../../common/sklearn.json", "../logreg.json", "large_scale.json"], "PARAMETERS_SETS": { "spmd logreg2 parameters": { - "algorithm":{ - "estimator": "LogisticRegression", + "algorithm":{ + "estimator": "LogisticRegression", "estimator_methods": { "inference": "predict" }, - "estimator_params": { "max_iter": 16 } + "estimator_params": { "max_iter": 16 }, + "sklearnex_context": { "use_raw_input": true } } - }, + }, "synthetic data": { "data": [ - { "source": "make_classification", "split_kwargs": { "train_size": 10000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 10001000, "n_features": 100, "n_classes": 2 } }, - { "source": "make_classification", "split_kwargs": { "train_size": 12000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 12001000, "n_features": 200, "n_classes": 2, "n_informative": 40, "n_clusters_per_class": 3, "flip_y": 0.05 } } + { "source": "make_classification", "split_kwargs": { "train_size": 12000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 12001000, "n_features": 200, "n_classes": 2, "n_informative": 40, "n_clusters_per_class": 3, "flip_y": 0.05 } } ] } }, @@ -19,10 +19,10 @@ "logreg": { "SETS": [ "sklearnex spmd implementation", - "large scale strong 2k parameters", + "large scale strong 64 parameters", "spmd logreg parameters", - "synthetic data", - "spmd logreg2 parameters" + "synthetic data", + "spmd logreg2 parameters" ] } } diff --git a/configs/spmd/large_scale/pca.json b/configs/spmd/large_scale/pca.json index d0ee879a..ce56bd8a 100644 --- a/configs/spmd/large_scale/pca.json +++ b/configs/spmd/large_scale/pca.json @@ -4,7 +4,8 @@ "spmd pca parameters": { "algorithm": { "estimator": "PCA", - "estimator_methods": { "training": "fit", "inference": "" } + "estimator_methods": { "training": "fit", "inference": "" }, + "sklearnex_context": { "use_raw_input": true } }, "data": { "split_kwargs": { "test_size": 0.0001 } @@ -12,18 +13,18 @@ }, "synthetic data": { "data": [ - { "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }, - { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } } + { "source": "make_blobs", "generation_kwargs": { "n_samples": 100000000, "n_features": 10, "centers": 1 } }, + { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 1000, "centers": 1 } } ] } }, "TEMPLATES": { "linreg": { "SETS": [ - "sklearnex spmd implementation", + "sklearnex spmd implementation", "large scale 2k parameters", "synthetic data", - "spmd pca parameters" + "spmd pca parameters" ] } } diff --git a/configs/spmd/large_scale/pca_strong.json b/configs/spmd/large_scale/pca_strong.json index 3cb33e72..70461ba7 100644 --- a/configs/spmd/large_scale/pca_strong.json +++ b/configs/spmd/large_scale/pca_strong.json @@ -4,7 +4,8 @@ "spmd pca parameters": { "algorithm": { "estimator": "PCA", - "estimator_methods": { "training": "fit", "inference": "" } + "estimator_methods": { "training": "fit", "inference": "" }, + "sklearnex_context": { "use_raw_input": true } }, "data": { "split_kwargs": { "test_size": 0.0001 } @@ -19,10 +20,10 @@ "TEMPLATES": { "linreg": { "SETS": [ - "sklearnex spmd implementation", - "large scale strong 2k parameters", + "sklearnex spmd implementation", + "large scale strong <=64 parameters", "synthetic data", - "spmd pca parameters" + "spmd pca parameters" ] } } diff --git a/configs/spmd/large_scale/spmd_for_online.json b/configs/spmd/large_scale/spmd_for_online.json deleted file mode 100644 index 2ef60f5b..00000000 --- a/configs/spmd/large_scale/spmd_for_online.json +++ /dev/null @@ -1,96 +0,0 @@ -{ - "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], - "PARAMETERS_SETS": { - "covariance data": { - "data": [ - { - "source": "make_blobs", - "generation_kwargs": { - "centers": 1, - "n_samples": 1000000, - "n_features": [10, 100] - }, - "split_kwargs": {"ignore": true} - } - ] - }, - "basic_statistics data": { - "data": { - "source": "make_blobs", - "generation_kwargs": { - "centers": 1, - "n_samples": 1000000, - "n_features": [10, 100] - }, - "split_kwargs": {"ignore": true} - } - }, - "linear_regression data": { - "data": { - "source": "make_regression", - "split_kwargs": {"train_size": 0.2, "test_size": 0.8}, - "generation_kwargs": { - "n_samples": 1000000, - "n_features": [10, 100], - "n_informative": 5, - "noise": 2.0 - } - } - }, - "pca data": { - "data": { - "source": "make_blobs", - "generation_kwargs": { - "centers": 1, - "n_samples": 1000000, - "n_features": [10, 100] - }, - "split_kwargs": {"ignore": true} - } - }, - "basic_statistics": { - "algorithm": [ - { - "estimator": "BasicStatistics", - "library": "sklearnex.spmd", - "estimator_methods": {"training": "fit"} - } - ] - }, - "covariance": { - "algorithm": [ - { - "estimator": "EmpiricalCovariance", - "library": "sklearnex.spmd", - "estimator_methods": {"training": "fit"} - } - ] - }, - "linear_regression": { - "algorithm": [ - { - "estimator": "LinearRegression", - "library": "sklearnex.spmd", - "estimator_methods": {"training": "fit"} - } - ] - }, - "pca": { - "algorithm": [ - { - "estimator": "PCA", - "library": "sklearnex.spmd", - "estimator_methods": {"training": "fit", "inference": ""} - } - ] - } - }, - "TEMPLATES": { - "basic_statistics": {"SETS": ["basic_statistics", "basic_statistics data", "sklearnex spmd implementation", "large scale full one node parameters"]}, - "covariance": {"SETS": ["covariance", "covariance data", "sklearnex spmd implementation", "large scale full one node parameters"]}, - "linear_regression": { - "SETS": ["linear_regression", "linear_regression data", "sklearnex spmd implementation", "large scale full one node parameters"] - }, - "pca": {"SETS": ["pca", "pca data", "sklearnex spmd implementation", "large scale full one node parameters"]} - } -} diff --git a/configs/spmd/large_scale/spmd_for_online_strong.json b/configs/spmd/large_scale/spmd_for_online_strong.json deleted file mode 100644 index 77a25075..00000000 --- a/configs/spmd/large_scale/spmd_for_online_strong.json +++ /dev/null @@ -1,60 +0,0 @@ -{ - "INCLUDE": ["../../common/sklearn.json", "large_scale.json"], - "PARAMETERS_SETS": { - "covariance data": { - "data": [ - { - "source": "make_blobs", - "generation_kwargs": { - "centers": 1, - "n_samples": 12000000, - "n_features": [10, 100] - }, - "split_kwargs": {"ignore": true} - } - ] - }, - "basic_statistics data": { - "data": { - "source": "make_blobs", - "generation_kwargs": { - "centers": 1, - "n_samples": 12000000, - "n_features": [10, 100] - }, - "split_kwargs": {"ignore": true} - } - }, - "linear_regression data": { - "data": { - "source": "make_regression", - "split_kwargs": {"train_size": 0.2, "test_size": 0.8}, - "generation_kwargs": { - "n_samples": 12000000, - "n_features": [10, 100], - "n_informative": 5, - "noise": 2.0 - } - } - }, - "pca data": { - "data": { - "source": "make_blobs", - "generation_kwargs": { - "centers": 1, - "n_samples": 12000000, - "n_features": [10, 100] - }, - "split_kwargs": {"ignore": true} - } - } - }, - "TEMPLATES": { - "basic_statistics": {"SETS": ["basic_statistics data", "spmd default parameters", "sklearnex spmd implementation", "large scale strong full one node parameters"]}, - "covariance": {"SETS": ["covariance data", "spmd default parameters","sklearnex spmd implementation", "large scale strong full one node parameters"]}, - "linear_regression": { - "SETS": ["linear_regression data", "spmd default parameters", "sklearnex spmd implementation", "large scale strong full one node parameters"] - }, - "pca": {"SETS": ["pca data", "spmd default parameters", "sklearnex spmd implementation", "large scale strong full one node parameters"]} - } -} diff --git a/sklbench/benchmarks/custom_function.py b/sklbench/benchmarks/custom_function.py index 25abb900..34b223ed 100644 --- a/sklbench/benchmarks/custom_function.py +++ b/sklbench/benchmarks/custom_function.py @@ -64,9 +64,13 @@ def get_function_args(bench_case: BenchCase, x_train, y_train, x_test, y_test) - def measure_function_instance(bench_case, function_instance, args: Tuple, kwargs: Dict): metrics = dict() - metrics["time[ms]"], metrics["time std[ms]"], _ = measure_case( - bench_case, function_instance, *args, **kwargs - ) + ( + metrics["time[ms]"], + metrics["time std[ms]"], + metrics["first iter[ms]"], + metrics["box filter mean[ms]"], + metrics["box filter std[ms]"], + ) = measure_case(bench_case, function_instance, *args, **kwargs) return metrics diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py index e57a9038..4164a10d 100644 --- a/sklbench/benchmarks/sklearn_estimator.py +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -516,7 +516,11 @@ def main(bench_case: BenchCase, filters: List[BenchCase]): bench_case, "algorithm:estimator_params", dict() ) # logger.debug("estimator params: " + str(estimator_params)) - if "DBSCAN" in str(estimator_name): + if ( + "DBSCAN" in str(estimator_name) + and get_bench_case_value(bench_case, "data:distributed_split", None) + != "rank_based" + ): if "min_samples" in estimator_params: from mpi4py import MPI diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py index 86944ead..38b4fe3b 100644 --- a/sklbench/datasets/transformer.py +++ b/sklbench/datasets/transformer.py @@ -109,11 +109,11 @@ def split_and_transform_data(bench_case, data, data_description): y_train, y_test = None, None distributed_split = get_bench_case_value(bench_case, "data:distributed_split", None) - knn_split_train = ( - "KNeighbors" in get_bench_case_value(bench_case, "algorithm:estimator", "") - and int(get_bench_case_value(bench_case, "bench:mpi_params:n", 1)) > 1 - ) - if distributed_split == "rank_based" or knn_split_train: + # knn_split_train = ( + # "KNeighbors" in get_bench_case_value(bench_case, "algorithm:estimator", "") + # and int(get_bench_case_value(bench_case, "bench:mpi_params:n", 1)) > 1 + # ) + if distributed_split == "rank_based": from mpi4py import MPI comm = MPI.COMM_WORLD diff --git a/sklbench/utils/measurement.py b/sklbench/utils/measurement.py index bfabbdc0..3677e760 100644 --- a/sklbench/utils/measurement.py +++ b/sklbench/utils/measurement.py @@ -79,7 +79,7 @@ def measure_time( t0 = timeit.default_timer() func_return_value = func(*args, **kwargs) t1 = timeit.default_timer() - if hasattr(func.__self__, "_n_inner_iter"): + if hasattr(func, "__self__") and hasattr(func.__self__, "_n_inner_iter"): inners.append(func.__self__._n_inner_iter) iters.append(func.__self__.n_iter_) if enable_itt and itt_is_available: @@ -92,16 +92,20 @@ def measure_time( f"exceeded time limit ({time_limit} seconds)" ) break - from mpi4py import MPI - - if MPI.COMM_WORLD.Get_rank() == 0: - logger.debug( - "iters across n runs: " - + str(iters) - + ", inner iters across n runs: " - + str(inners) - ) - logger.debug(times) + + try: + from mpi4py import MPI + + if MPI.COMM_WORLD.Get_rank() == 0: + logger.debug( + "iters across n runs: " + + str(iters) + + ", inner iters across n runs: " + + str(inners) + ) + logger.debug(f"Runtime for all {n_runs} iterations: {times}") + except ModuleNotFoundError: + logger.debug(f"Runtime for all {n_runs} iterations: {times}") # mean, std = box_filter(times) # if std / mean > std_mean_ratio: # logger.warning( diff --git a/test-configuration-linux.yml b/test-configuration-linux.yml index 722d1008..a37769ce 100644 --- a/test-configuration-linux.yml +++ b/test-configuration-linux.yml @@ -45,11 +45,6 @@ steps: conda activate bench-env python -m sklbench --report -l DEBUG --report -c configs/sklearn_example.json displayName: Sklearn example run - - script: | - source /usr/share/miniconda/etc/profile.d/conda.sh - conda activate bench-env - python -m sklbench --report -l DEBUG --report -c configs/incremental.json - displayName: Incremental algorithms example run - script: | source /usr/share/miniconda/etc/profile.d/conda.sh conda activate bench-env diff --git a/test-configuration-win.yml b/test-configuration-win.yml index f3ac1595..a1eddaeb 100644 --- a/test-configuration-win.yml +++ b/test-configuration-win.yml @@ -43,10 +43,6 @@ steps: call activate bench-env python -m sklbench --report -l DEBUG --report -c configs/sklearn_example.json displayName: Sklearn example run - - script: | - call activate bench-env - python -m sklbench --report -l DEBUG --report -c configs/incremental.json - displayName: Incremental algorithms example run - script: | call activate bench-env python -m sklbench --report -l DEBUG --report -c configs/xgboost_example.json