From 9b1190b39775c1e72f3eaa7bf70f3aa642681d41 Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Fri, 16 Jul 2021 13:52:58 +0300 Subject: [PATCH 01/10] Additional gpu config + codeowner --- .github/CODEOWNERS | 7 +- .../xgboost/xgb_gpu_additional_config.json | 141 ++++++++++++++++++ ...u_config.json => xgb_gpu_main_config.json} | 0 3 files changed, 145 insertions(+), 3 deletions(-) create mode 100644 configs/xgboost/xgb_gpu_additional_config.json rename configs/xgboost/{xgb_gpu_config.json => xgb_gpu_main_config.json} (100%) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 914918261..aacf53040 100755 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,8 +1,9 @@ #owners and reviewers -sklearn_bench/* @PetrovKP @Alexsandruss -daal4py_bench/* @PetrovKP @Alexsandruss cuml_bench/* @PetrovKP @Alexsandruss -datasets/* @PetrovKP @Alexsandruss +daal4py_bench/* @PetrovKP @Alexsandruss +datasets/* @PetrovKP @Alexsandruss @RukhovichIV modelbuilders_bench/* @ShvetsKS @RukhovichIV +report_generator/* @PetrovKP @Alexsandruss @RukhovichIV +sklearn_bench/* @PetrovKP @Alexsandruss xgboost_bench/* @ShvetsKS @RukhovichIV *.md @outoftardis diff --git a/configs/xgboost/xgb_gpu_additional_config.json b/configs/xgboost/xgb_gpu_additional_config.json new file mode 100644 index 000000000..75036ad4b --- /dev/null +++ b/configs/xgboost/xgb_gpu_additional_config.json @@ -0,0 +1,141 @@ +{ + "common": { + "lib": "xgboost", + "data-format": "cudf", + "data-order": "F", + "dtype": "float32", + "algorithm": "gbt", + "tree-method": "gpu_hist", + "count-dmatrix": "", + "max-depth": 8, + "learning-rate": 0.1, + "reg-lambda": 1, + "max-leaves": 256 + }, + "cases": [ + { + "objective": "binary:logistic", + "scale-pos-weight": 2.1067817411664587, + "dataset": [ + { + "source": "npy", + "name": "airline", + "training": { + "x": "data/airline_x_train.npy", + "y": "data/airline_y_train.npy" + }, + "testing": { + "x": "data/airline_x_test.npy", + "y": "data/airline_y_test.npy" + } + } + ] + }, + { + "objective": "binary:logistic", + "scale-pos-weight": 173.63348001466812, + "dataset": [ + { + "source": "npy", + "name": "bosch", + "training": { + "x": "data/bosch_x_train.npy", + "y": "data/bosch_y_train.npy" + }, + "testing": { + "x": "data/bosch_x_test.npy", + "y": "data/bosch_y_test.npy" + } + } + ] + }, + { + "objective": "multi:softmax", + "dataset": [ + { + "source": "npy", + "name": "covtype", + "training": { + "x": "data/covtype_x_train.npy", + "y": "data/covtype_y_train.npy" + }, + "testing": { + "x": "data/covtype_x_test.npy", + "y": "data/covtype_y_test.npy" + } + } + ] + }, + { + "objective": "binary:logistic", + "scale-pos-weight": 2.0017715678375363, + "dataset": [ + { + "source": "npy", + "name": "epsilon", + "training": { + "x": "data/epsilon_x_train.npy", + "y": "data/epsilon_y_train.npy" + }, + "testing": { + "x": "data/epsilon_x_test.npy", + "y": "data/epsilon_y_test.npy" + } + } + ] + }, + { + "objective": "binary:logistic", + "scale-pos-weight": 578.2868020304569, + "dataset": [ + { + "source": "npy", + "name": "fraud", + "training": { + "x": "data/fraud_x_train.npy", + "y": "data/fraud_y_train.npy" + }, + "testing": { + "x": "data/fraud_x_test.npy", + "y": "data/fraud_y_test.npy" + } + } + ] + }, + { + "objective": "binary:logistic", + "scale-pos-weight": 1.8872389605086624, + "dataset": [ + { + "source": "npy", + "name": "higgs", + "training": { + "x": "data/higgs_x_train.npy", + "y": "data/higgs_y_train.npy" + }, + "testing": { + "x": "data/higgs_x_test.npy", + "y": "data/higgs_y_test.npy" + } + 
} + ] + }, + { + "objective": "reg:squarederror", + "dataset": [ + { + "source": "npy", + "name": "year_prediction_msd", + "training": { + "x": "data/year_prediction_msd_x_train.npy", + "y": "data/year_prediction_msd_y_train.npy" + }, + "testing": { + "x": "data/year_prediction_msd_x_test.npy", + "y": "data/year_prediction_msd_y_test.npy" + } + } + ] + } + ] +} diff --git a/configs/xgboost/xgb_gpu_config.json b/configs/xgboost/xgb_gpu_main_config.json similarity index 100% rename from configs/xgboost/xgb_gpu_config.json rename to configs/xgboost/xgb_gpu_main_config.json From 91a97e923fc053dfc62d65d4103b74c1ded787a4 Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Fri, 16 Jul 2021 13:55:26 +0300 Subject: [PATCH 02/10] tryin to use pandas on v100 --- configs/xgboost/xgb_gpu_additional_config.json | 3 +-- configs/xgboost/xgb_gpu_main_config.json | 5 ++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/configs/xgboost/xgb_gpu_additional_config.json b/configs/xgboost/xgb_gpu_additional_config.json index 75036ad4b..252900801 100644 --- a/configs/xgboost/xgb_gpu_additional_config.json +++ b/configs/xgboost/xgb_gpu_additional_config.json @@ -1,12 +1,11 @@ { "common": { "lib": "xgboost", - "data-format": "cudf", + "data-format": "pandas", "data-order": "F", "dtype": "float32", "algorithm": "gbt", "tree-method": "gpu_hist", - "count-dmatrix": "", "max-depth": 8, "learning-rate": 0.1, "reg-lambda": 1, diff --git a/configs/xgboost/xgb_gpu_main_config.json b/configs/xgboost/xgb_gpu_main_config.json index 11144ca35..9d54c255f 100644 --- a/configs/xgboost/xgb_gpu_main_config.json +++ b/configs/xgboost/xgb_gpu_main_config.json @@ -1,12 +1,11 @@ { "common": { "lib": "xgboost", - "data-format": "cudf", + "data-format": "pandas", "data-order": "F", "dtype": "float32", "algorithm": "gbt", - "tree-method": "gpu_hist", - "count-dmatrix": "" + "tree-method": "gpu_hist" }, "cases": [ { From 29e31891cd3831e9e996ce117694b3b2f95ff268 Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Fri, 16 Jul 2021 13:55:26 +0300 Subject: [PATCH 03/10] tryin to use pandas on v100 --- configs/xgboost/xgb_gpu_additional_config.json | 3 +-- configs/xgboost/xgb_gpu_main_config.json | 5 ++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/configs/xgboost/xgb_gpu_additional_config.json b/configs/xgboost/xgb_gpu_additional_config.json index 75036ad4b..252900801 100644 --- a/configs/xgboost/xgb_gpu_additional_config.json +++ b/configs/xgboost/xgb_gpu_additional_config.json @@ -1,12 +1,11 @@ { "common": { "lib": "xgboost", - "data-format": "cudf", + "data-format": "pandas", "data-order": "F", "dtype": "float32", "algorithm": "gbt", "tree-method": "gpu_hist", - "count-dmatrix": "", "max-depth": 8, "learning-rate": 0.1, "reg-lambda": 1, diff --git a/configs/xgboost/xgb_gpu_main_config.json b/configs/xgboost/xgb_gpu_main_config.json index 11144ca35..9d54c255f 100644 --- a/configs/xgboost/xgb_gpu_main_config.json +++ b/configs/xgboost/xgb_gpu_main_config.json @@ -1,12 +1,11 @@ { "common": { "lib": "xgboost", - "data-format": "cudf", + "data-format": "pandas", "data-order": "F", "dtype": "float32", "algorithm": "gbt", - "tree-method": "gpu_hist", - "count-dmatrix": "" + "tree-method": "gpu_hist" }, "cases": [ { From ac50ba322992b953e6d60bb7d6ca478bbf4802b0 Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Fri, 16 Jul 2021 16:48:58 +0300 Subject: [PATCH 04/10] XGBoost bench with dmatrix reporting --- modelbuilders_bench/xgb_mb.py | 6 +++++- xgboost_bench/gbt.py | 21 +++++++++++++-------- 2 files changed, 18 
insertions(+), 9 deletions(-) diff --git a/modelbuilders_bench/xgb_mb.py b/modelbuilders_bench/xgb_mb.py index 92764142d..d688e9bb7 100644 --- a/modelbuilders_bench/xgb_mb.py +++ b/modelbuilders_bench/xgb_mb.py @@ -85,6 +85,9 @@ def convert_xgb_predictions(y_pred, objective): help='The tree construction algorithm used in XGBoost') params = bench.parse_args(parser) +# Default seed +if params.seed == 12345: + params.seed = 0 X_train, X_test, y_train, y_test = bench.load_data(params) @@ -198,4 +201,5 @@ def predict(dmatrix): # type: ignore predict_time_daal], accuracy_type=metric_name, accuracies=[None, train_metric, None, test_metric, None, test_metric_daal], - data=[X_train, X_train, X_test, X_test, X_test, X_test]) + data=[X_train, X_train, X_test, X_test, X_test, X_test], + alg_instance=booster, alg_params=xgb_params) diff --git a/xgboost_bench/gbt.py b/xgboost_bench/gbt.py index aa54a094d..9e8418a8e 100644 --- a/xgboost_bench/gbt.py +++ b/xgboost_bench/gbt.py @@ -140,8 +140,10 @@ def convert_xgb_predictions(y_pred, objective): if params.n_classes > 2: xgb_params['num_class'] = params.n_classes -dtrain = xgb.DMatrix(X_train, y_train) -dtest = xgb.DMatrix(X_test, y_test) +t_creat_train, dtrain = bench.measure_function_time(xgb.DMatrix, X_train, + params=params, label=y_train) +t_creat_test, dtest = bench.measure_function_time( + xgb.DMatrix, X_test, params=params, label=y_test) def fit(dmatrix): @@ -173,9 +175,12 @@ def predict(dmatrix): # type: ignore predict, None if params.inplace_predict or params.count_dmatrix else dtest, params=params) test_metric = metric_func(convert_xgb_predictions(y_pred, params.objective), y_test) -bench.print_output(library='xgboost', algorithm=f'gradient_boosted_trees_{task}', - stages=['training', 'prediction'], - params=params, functions=['gbt.fit', 'gbt.predict'], - times=[fit_time, predict_time], accuracy_type=metric_name, - accuracies=[train_metric, test_metric], data=[X_train, X_test], - alg_instance=booster, alg_params=xgb_params) +bench.print_output( + library='xgboost', algorithm=f'xgboost_{task}', + stages=['training_preparation', 'training', 'prediction_preparation', 'prediction'], + params=params, + functions=['xgb.dmatrix.train', 'xgb.train', 'xgb.dmatrix.test', 'xgb.predict'], + times=[t_creat_train, fit_time, t_creat_test, predict_time], + accuracy_type=metric_name, accuracies=[None, train_metric, None, test_metric], + data=[X_train, X_train, X_test, X_test], + alg_instance=booster, alg_params=xgb_params) From 5449446486c5b3e44f08e92414a96908f5ccc0f3 Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Fri, 16 Jul 2021 16:58:41 +0300 Subject: [PATCH 05/10] Pandas for big datasets --- .../xgboost/xgb_gpu_additional_config.json | 8 ++++- configs/xgboost/xgb_gpu_main_config.json | 33 +++++++++++-------- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/configs/xgboost/xgb_gpu_additional_config.json b/configs/xgboost/xgb_gpu_additional_config.json index 252900801..b8ca4fda3 100644 --- a/configs/xgboost/xgb_gpu_additional_config.json +++ b/configs/xgboost/xgb_gpu_additional_config.json @@ -1,7 +1,6 @@ { "common": { "lib": "xgboost", - "data-format": "pandas", "data-order": "F", "dtype": "float32", "algorithm": "gbt", @@ -14,6 +13,7 @@ "cases": [ { "objective": "binary:logistic", + "data-format": "pandas", "scale-pos-weight": 2.1067817411664587, "dataset": [ { @@ -32,6 +32,7 @@ }, { "objective": "binary:logistic", + "data-format": "cudf", "scale-pos-weight": 173.63348001466812, "dataset": [ { @@ -50,6 +51,7 @@ }, { "objective": 
"multi:softmax", + "data-format": "cudf", "dataset": [ { "source": "npy", @@ -67,6 +69,7 @@ }, { "objective": "binary:logistic", + "data-format": "pandas", "scale-pos-weight": 2.0017715678375363, "dataset": [ { @@ -85,6 +88,7 @@ }, { "objective": "binary:logistic", + "data-format": "cudf", "scale-pos-weight": 578.2868020304569, "dataset": [ { @@ -103,6 +107,7 @@ }, { "objective": "binary:logistic", + "data-format": "cudf", "scale-pos-weight": 1.8872389605086624, "dataset": [ { @@ -121,6 +126,7 @@ }, { "objective": "reg:squarederror", + "data-format": "cudf", "dataset": [ { "source": "npy", diff --git a/configs/xgboost/xgb_gpu_main_config.json b/configs/xgboost/xgb_gpu_main_config.json index 9d54c255f..3bbe9f1bc 100644 --- a/configs/xgboost/xgb_gpu_main_config.json +++ b/configs/xgboost/xgb_gpu_main_config.json @@ -1,7 +1,6 @@ { "common": { "lib": "xgboost", - "data-format": "pandas", "data-order": "F", "dtype": "float32", "algorithm": "gbt", @@ -9,6 +8,8 @@ }, "cases": [ { + "objective": "reg:squarederror", + "data-format": "cudf", "dataset": [ { "source": "npy", @@ -25,10 +26,11 @@ ], "learning-rate": 0.03, "max-depth": 6, - "n-estimators": 1000, - "objective": "reg:squarederror" + "n-estimators": 1000 }, { + "objective": "binary:logistic", + "data-format": "pandas", "dataset": [ { "source": "npy", @@ -52,10 +54,11 @@ "min-child-weight": 0, "max-depth": 8, "max-leaves": 256, - "n-estimators": 1000, - "objective": "binary:logistic" + "n-estimators": 1000 }, { + "objective": "binary:logistic", + "data-format": "pandas", "dataset": [ { "source": "npy", @@ -80,10 +83,11 @@ "max-depth": 8, "max-leaves": 256, "n-estimators": 1000, - "objective": "binary:logistic", "inplace-predict": "" }, { + "objective": "multi:softprob", + "data-format": "cudf", "dataset": [ { "source": "npy", @@ -100,10 +104,11 @@ ], "learning-rate": 0.03, "max-depth": 6, - "n-estimators": 1000, - "objective": "multi:softprob" + "n-estimators": 1000 }, { + "objective": "multi:softprob", + "data-format": "cudf", "dataset": [ { "source": "npy", @@ -121,10 +126,11 @@ "min-child-weight": 1, "min-split-loss": 0.1, "max-depth": 8, - "n-estimators": 200, - "objective": "multi:softprob" + "n-estimators": 200 }, { + "objective": "reg:squarederror", + "data-format": "cudf", "dataset": [ { "source": "npy", @@ -136,7 +142,6 @@ } ], "n-estimators": 100, - "objective": "reg:squarederror", "max-depth": 8, "scale-pos-weight": 2, "learning-rate": 0.1, @@ -147,6 +152,8 @@ "max-leaves": 256 }, { + "objective": "multi:softprob", + "data-format": "cudf", "dataset": [ { "source": "npy", @@ -162,12 +169,13 @@ } ], "n-estimators": 60, - "objective": "multi:softprob", "max-depth": 7, "subsample": 0.7, "colsample-bytree": 0.7 }, { + "objective": "binary:logistic", + "data-format": "cudf", "dataset": [ { "source": "npy", @@ -183,7 +191,6 @@ } ], "n-estimators": 10000, - "objective": "binary:logistic", "max-depth": 1, "subsample": 0.5, "eta": 0.1, From 362076cb9f13535aad64c97f8d96ebcb7701b509 Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Thu, 29 Jul 2021 18:10:24 +0300 Subject: [PATCH 06/10] lgbm-mb benchmark update + r2_score --- bench.py | 7 +++ modelbuilders_bench/lgbm_mb.py | 93 ++++++++++++++++------------ modelbuilders_bench/xgb_mb.py | 19 +++--- report_generator/report_generator.py | 2 +- xgboost_bench/gbt.py | 19 +++--- 5 files changed, 83 insertions(+), 57 deletions(-) diff --git a/bench.py b/bench.py index cd26c166e..932bdb0c7 100644 --- a/bench.py +++ b/bench.py @@ -354,6 +354,13 @@ def rmse_score(y, yp): y, yp, lambda y1, y2: 
float(np.sqrt(np.mean((y1 - y2)**2)))) +def r2_score(y, yp): + from sklearn.metrics import r2_score as sklearn_r2_score + y = convert_to_numpy(y) + yp = convert_to_numpy(yp) + return sklearn_r2_score(y, yp) + + def convert_data(data, dtype, data_order, data_format): ''' Convert input data (numpy array) to needed format, type and order diff --git a/modelbuilders_bench/lgbm_mb.py b/modelbuilders_bench/lgbm_mb.py index 2b4c29616..56659cf12 100644 --- a/modelbuilders_bench/lgbm_mb.py +++ b/modelbuilders_bench/lgbm_mb.py @@ -22,7 +22,7 @@ import lightgbm as lgbm import numpy as np -import modelbuilders_bench.mb_utils as utils +# import modelbuilders_bench.mb_utils as utils parser = argparse.ArgumentParser( description='lightgbm gbt + model transform + daal predict benchmark') @@ -30,6 +30,8 @@ parser.add_argument('--colsample-bytree', type=float, default=1, help='Subsample ratio of columns ' 'when constructing each tree') +parser.add_argument('--count-dmatrix', default=False, action='store_true', + help='Count DMatrix creation in time measurements') parser.add_argument('--learning-rate', '--eta', type=float, default=0.3, help='Step size shrinkage used in update ' 'to prevents overfitting') @@ -87,65 +89,80 @@ if params.threads != -1: lgbm_params.update({'nthread': params.threads}) -if 'OMP_NUM_THREADS' in os.environ.keys(): - lgbm_params['nthread'] = int(os.environ['OMP_NUM_THREADS']) - if params.objective.startswith('reg'): task = 'regression' metric_name, metric_func = 'rmse', bench.rmse_score else: task = 'classification' - metric_name, metric_func = 'accuracy[%]', utils.get_accuracy + metric_name, metric_func = 'accuracy', bench.accuracy_score if 'cudf' in str(type(y_train)): params.n_classes = y_train[y_train.columns[0]].nunique() else: params.n_classes = len(np.unique(y_train)) + + # Covtype has one class more than there is in train + if params.dataset_name == 'covtype': + params.n_classes += 1 + if params.n_classes > 2: lgbm_params['num_class'] = params.n_classes -t_creat_train, lgbm_train = bench.measure_function_time(lgbm.Dataset, X_train, - y_train, params=params, - free_raw_data=False) - -t_creat_test, lgbm_test = bench.measure_function_time(lgbm.Dataset, X_test, y_test, - params=params, reference=lgbm_train, - free_raw_data=False) - -t_train, model_lgbm = bench.measure_function_time(lgbm.train, lgbm_params, lgbm_train, - params=params, - num_boost_round=params.n_estimators, - valid_sets=lgbm_train, - verbose_eval=False) -train_metric = None -if not X_train.equals(X_test): - y_train_pred = model_lgbm.predict(X_train) - train_metric = metric_func(y_train, y_train_pred) - -t_lgbm_pred, y_test_pred = bench.measure_function_time(model_lgbm.predict, X_test, - params=params) -test_metric_lgbm = metric_func(y_test, y_test_pred) - -t_trans, model_daal = bench.measure_function_time( +t_creat_train, dtrain = bench.measure_function_time(lgbm.Dataset, X_train, + y_train, params=params, + free_raw_data=False) + +t_creat_test, dtest = bench.measure_function_time(lgbm.Dataset, X_test, y_test, + params=params, reference=dtrain, + free_raw_data=False) + + +def fit(dataset): + if dataset is None: + dataset = lgbm.Dataset(X_train, y_train, free_raw_data=False) + return lgbm.train( + lgbm_params, dataset, num_boost_round=params.n_estimators, valid_sets=dataset, + verbose_eval=False) + + +def predict(dataset): # type: ignore + if dataset is None: + dataset = lgbm.Dataset(X_test, y_test, free_raw_data=False) + return model_lgbm.predict(dataset) + + +fit_time, model_lgbm = bench.measure_function_time( + 
fit, None if params.count_dmatrix else dtrain, params=params) +train_metric = metric_func(model_lgbm.predict(dtrain), y_train) + +predict_time, y_pred = bench.measure_function_time( + predict, None if params.count_dmatrix else dtest, params=params) +test_metric = metric_func(y_pred, y_test) + +transform_time, model_daal = bench.measure_function_time( daal4py.get_gbt_model_from_lightgbm, model_lgbm, params=params) if hasattr(params, 'n_classes'): predict_algo = daal4py.gbt_classification_prediction( nClasses=params.n_classes, resultsToEvaluate='computeClassLabels', fptype='float') - t_daal_pred, daal_pred = bench.measure_function_time( + predict_time_daal, daal_pred = bench.measure_function_time( predict_algo.compute, X_test, model_daal, params=params) test_metric_daal = metric_func(y_test, daal_pred.prediction) else: predict_algo = daal4py.gbt_regression_prediction() - t_daal_pred, daal_pred = bench.measure_function_time( + predict_time_daal, daal_pred = bench.measure_function_time( predict_algo.compute, X_test, model_daal, params=params) test_metric_daal = metric_func(y_test, daal_pred.prediction) -utils.print_output( +bench.print_output( library='modelbuilders', algorithm=f'lightgbm_{task}_and_modelbuilder', - stages=['lgbm_train', 'lgbm_predict', 'daal4py_predict'], - params=params, functions=['lgbm_dataset', 'lgbm_dataset', 'lgbm_train', - 'lgbm_predict', 'lgbm_to_daal', 'daal_compute'], - times=[t_creat_train, t_train, t_creat_test, t_lgbm_pred, t_trans, t_daal_pred], - accuracy_type=metric_name, accuracies=[train_metric, test_metric_lgbm, - test_metric_daal], - data=[X_train, X_test, X_test]) + stages=['training_preparation', 'training', 'prediction_preparation', 'prediction', + 'transformation', 'alternative_prediction'], + params=params, + functions=['lgbm.Dataset.train', 'lgbm.train', 'lgbm.Dataset.test', 'lgbm.predict', + 'daal4py.get_gbt_model_from_lightgbm', 'daal4py.compute'], + times=[t_creat_train, fit_time, t_creat_test, predict_time, transform_time, + predict_time_daal], + accuracy_type=metric_name, + accuracies=[None, train_metric, None, test_metric, None, test_metric_daal], + data=[X_train, X_train, X_test, X_test, X_test, X_test], + alg_instance=model_lgbm, alg_params=lgbm_params) \ No newline at end of file diff --git a/modelbuilders_bench/xgb_mb.py b/modelbuilders_bench/xgb_mb.py index d688e9bb7..31fda3fbd 100644 --- a/modelbuilders_bench/xgb_mb.py +++ b/modelbuilders_bench/xgb_mb.py @@ -26,11 +26,12 @@ def convert_probs_to_classes(y_prob): return np.array([np.argmax(y_prob[i]) for i in range(y_prob.shape[0])]) -def convert_xgb_predictions(y_pred, objective): - if objective == 'multi:softprob': - y_pred = convert_probs_to_classes(y_pred) - elif objective == 'binary:logistic': - y_pred = y_pred.astype(np.int32) +def convert_xgb_predictions(y_pred, objective, metric_name): + if metric_name == "accuracy": + if objective == 'multi:softprob': + y_pred = convert_probs_to_classes(y_pred) + elif objective == 'binary:logistic': + y_pred = y_pred.astype(np.int32) return y_pred @@ -126,8 +127,7 @@ def convert_xgb_predictions(y_pred, objective): metric_name, metric_func = 'rmse', bench.rmse_score else: task = 'classification' - metric_name = 'accuracy' - metric_func = bench.accuracy_score + metric_name, metric_func = 'accuracy', bench.accuracy_score if 'cudf' in str(type(y_train)): params.n_classes = y_train[y_train.columns[0]].nunique() else: @@ -168,12 +168,13 @@ def predict(dmatrix): # type: ignore train_metric = metric_func( convert_xgb_predictions( booster.predict(dtrain), - 
params.objective), + params.objective, metric_name), y_train) predict_time, y_pred = bench.measure_function_time( predict, None if params.inplace_predict or params.count_dmatrix else dtest, params=params) -test_metric = metric_func(convert_xgb_predictions(y_pred, params.objective), y_test) +test_metric = metric_func(convert_xgb_predictions( + y_pred, params.objective, metric_name), y_test) transform_time, model_daal = bench.measure_function_time( daal4py.get_gbt_model_from_xgboost, booster, params=params) diff --git a/report_generator/report_generator.py b/report_generator/report_generator.py index 8f1c0d601..dc4e38f3f 100755 --- a/report_generator/report_generator.py +++ b/report_generator/report_generator.py @@ -179,7 +179,7 @@ def create_list(res_entry, props_list): 'inference': ['prediction_preparation', 'prediction', 'alternative_prediction', 'transformation', 'search', 'predict_proba'] } -possible_metrics = {'accuracy', 'accuracy[%]', 'rmse', +possible_metrics = {'accuracy', 'accuracy[%]', 'rmse', 'r2', 'davies_bouldin_score', 'inertia', 'log_loss'} for stage_key in stages_splitter.keys(): diff --git a/xgboost_bench/gbt.py b/xgboost_bench/gbt.py index 9e8418a8e..7b1490439 100644 --- a/xgboost_bench/gbt.py +++ b/xgboost_bench/gbt.py @@ -25,11 +25,12 @@ def convert_probs_to_classes(y_prob): return np.array([np.argmax(y_prob[i]) for i in range(y_prob.shape[0])]) -def convert_xgb_predictions(y_pred, objective): - if objective == 'multi:softprob': - y_pred = convert_probs_to_classes(y_pred) - elif objective == 'binary:logistic': - y_pred = y_pred.astype(np.int32) +def convert_xgb_predictions(y_pred, objective, metric_name): + if metric_name == "accuracy": + if objective == 'multi:softprob': + y_pred = convert_probs_to_classes(y_pred) + elif objective == 'binary:logistic': + y_pred = y_pred.astype(np.int32) return y_pred @@ -126,8 +127,7 @@ def convert_xgb_predictions(y_pred, objective): metric_name, metric_func = 'rmse', bench.rmse_score else: task = 'classification' - metric_name = 'accuracy' - metric_func = bench.accuracy_score + metric_name, metric_func = 'accuracy', bench.accuracy_score if 'cudf' in str(type(y_train)): params.n_classes = y_train[y_train.columns[0]].nunique() else: @@ -168,12 +168,13 @@ def predict(dmatrix): # type: ignore train_metric = metric_func( convert_xgb_predictions( booster.predict(dtrain), - params.objective), + params.objective, metric_name), y_train) predict_time, y_pred = bench.measure_function_time( predict, None if params.inplace_predict or params.count_dmatrix else dtest, params=params) -test_metric = metric_func(convert_xgb_predictions(y_pred, params.objective), y_test) +test_metric = metric_func(convert_xgb_predictions( + y_pred, params.objective, metric_name), y_test) bench.print_output( library='xgboost', algorithm=f'xgboost_{task}', From 8ea64ece048ce43144d701f19736dfeac2d2c9cc Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Fri, 6 Aug 2021 11:32:18 +0300 Subject: [PATCH 07/10] Remove second r2 impl --- bench.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/bench.py b/bench.py index 2f12da641..e56630ac4 100644 --- a/bench.py +++ b/bench.py @@ -376,13 +376,6 @@ def davies_bouldin_score(X, labels): return res -def r2_score(y, yp): - from sklearn.metrics import r2_score as sklearn_r2_score - y = convert_to_numpy(y) - yp = convert_to_numpy(yp) - return sklearn_r2_score(y, yp) - - def convert_data(data, dtype, data_order, data_format): ''' Convert input data (numpy array) to needed format, type and order From 
c63b55d2b24aff45a618ab518347cb26be706d47 Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Fri, 6 Aug 2021 11:48:38 +0300 Subject: [PATCH 08/10] Applying codefactor ci --- bench.py | 73 +++++++++++++++------------- report_generator/report_generator.py | 13 +++-- 2 files changed, 45 insertions(+), 41 deletions(-) diff --git a/bench.py b/bench.py index e56630ac4..b398bc8fc 100644 --- a/bench.py +++ b/bench.py @@ -389,14 +389,13 @@ def convert_data(data, dtype, data_order, data_format): # Secondly, change format of data if data_format == 'numpy': return data - elif data_format == 'pandas': + if data_format == 'pandas': import pandas as pd if data.ndim == 1: return pd.Series(data) - else: - return pd.DataFrame(data) - elif data_format == 'cudf': + return pd.DataFrame(data) + if data_format == 'cudf': import cudf import pandas as pd @@ -512,36 +511,42 @@ def gen_basic_dict(library, algorithm, stage, params, data, alg_instance=None, def print_output(library, algorithm, stages, params, functions, times, metric_type, metrics, data, alg_instance=None, alg_params=None): - if params.output_format == 'json': - output = [] - for i, stage in enumerate(stages): - result = gen_basic_dict(library, algorithm, stage, params, - data[i], alg_instance, alg_params) - result.update({'time[s]': times[i]}) - if metric_type is not None: - if isinstance(metric_type, str): - result.update({f'{metric_type}': metrics[i]}) - elif isinstance(metric_type, list): - for ind, val in enumerate(metric_type): - if metrics[ind][i] is not None: - result.update({f'{val}': metrics[ind][i]}) - if hasattr(params, 'n_classes'): - result['input_data'].update({'classes': params.n_classes}) - if hasattr(params, 'n_clusters'): - if algorithm == 'kmeans': - result['input_data'].update( - {'n_clusters': params.n_clusters}) - elif algorithm == 'dbscan': - result.update({'n_clusters': params.n_clusters}) - # replace non-string init with string for kmeans benchmarks - if alg_instance is not None: - if 'init' in result['algorithm_parameters'].keys(): - if not isinstance(result['algorithm_parameters']['init'], str): - result['algorithm_parameters']['init'] = 'random' - if 'handle' in result['algorithm_parameters'].keys(): - del result['algorithm_parameters']['handle'] - output.append(result) - print(json.dumps(output, indent=4)) + if params.output_format != 'json': + return + + output = [] + for i, stage in enumerate(stages): + result = gen_basic_dict(library, algorithm, stage, params, + data[i], alg_instance, alg_params) + result.update({'time[s]': times[i]}) + + if metric_type is not None: + if isinstance(metric_type, str): + result.update({f'{metric_type}': metrics[i]}) + elif isinstance(metric_type, list): + for ind, val in enumerate(metric_type): + if metrics[ind][i] is not None: + result.update({f'{val}': metrics[ind][i]}) + + if hasattr(params, 'n_classes'): + result['input_data'].update({'classes': params.n_classes}) + if hasattr(params, 'n_clusters'): + if algorithm == 'kmeans': + result['input_data'].update( + {'n_clusters': params.n_clusters}) + elif algorithm == 'dbscan': + result.update({'n_clusters': params.n_clusters}) + + # replace non-string init with string for kmeans benchmarks + if alg_instance is not None: + if 'init' in result['algorithm_parameters'].keys(): + if not isinstance(result['algorithm_parameters']['init'], str): + result['algorithm_parameters']['init'] = 'random' + if 'handle' in result['algorithm_parameters'].keys(): + del result['algorithm_parameters']['handle'] + output.append(result) + + 
print(json.dumps(output, indent=4)) def run_with_context(params, function): diff --git a/report_generator/report_generator.py b/report_generator/report_generator.py index dc4e38f3f..deb48b606 100755 --- a/report_generator/report_generator.py +++ b/report_generator/report_generator.py @@ -65,10 +65,9 @@ def results_are_mergeable(first_res, second_res, merging): sw_hash_equality = first_res['software_hash'] == second_res['software_hash'] if merging == 'hw_only': return hw_hash_equality - elif merging == 'sw_only': + if merging == 'sw_only': return sw_hash_equality - else: - return sw_hash_equality and hw_hash_equality + return sw_hash_equality and hw_hash_equality excel_header_columns = list(ascii_uppercase) @@ -232,14 +231,14 @@ def create_list(res_entry, props_list): ws[xy_to_excel_cell(0, 0)] = \ f"Software configuration {i} (hash: {json_res['software_hash']})" sw_conf = json.dumps(json_res['software'], indent=4).split('\n') - for j in range(len(sw_conf)): - ws[xy_to_excel_cell(0, 1 + j)] = sw_conf[j] + for j, elem in enumerate(sw_conf): + ws[xy_to_excel_cell(0, j + 1)] = elem ws = wb.create_sheet(title=f"HW config n{i}_{json_res['hardware_hash']}") ws[xy_to_excel_cell(0, 0)] = \ f"Hardware configuration {i} (hash: {json_res['hardware_hash']})" hw_conf = json.dumps(json_res['hardware'], indent=4).split('\n') - for j in range(len(hw_conf)): - ws[xy_to_excel_cell(0, 1 + j)] = hw_conf[j] + for j, elem in enumerate(hw_conf): + ws[xy_to_excel_cell(0, j + 1)] = elem wb.save(args.report_file) From eaa1fdc8ed7ed4dd2fc2d0e75797bd571e3b8c43 Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Fri, 6 Aug 2021 11:50:06 +0300 Subject: [PATCH 09/10] Applying pep8 --- modelbuilders_bench/lgbm_mb.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modelbuilders_bench/lgbm_mb.py b/modelbuilders_bench/lgbm_mb.py index 1be88d6f7..0332fb60a 100644 --- a/modelbuilders_bench/lgbm_mb.py +++ b/modelbuilders_bench/lgbm_mb.py @@ -15,7 +15,6 @@ # =============================================================================== import argparse -import os import bench import daal4py From ca5b8552b52575302a8e50cb250a7312d2e76189 Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Fri, 13 Aug 2021 15:33:10 +0300 Subject: [PATCH 10/10] Removed unnecesarry file --- modelbuilders_bench/lgbm_mb.py | 2 - modelbuilders_bench/mb_utils.py | 71 --------------------------------- 2 files changed, 73 deletions(-) delete mode 100644 modelbuilders_bench/mb_utils.py diff --git a/modelbuilders_bench/lgbm_mb.py b/modelbuilders_bench/lgbm_mb.py index 0332fb60a..e6daab0b2 100644 --- a/modelbuilders_bench/lgbm_mb.py +++ b/modelbuilders_bench/lgbm_mb.py @@ -21,8 +21,6 @@ import lightgbm as lgbm import numpy as np -# import modelbuilders_bench.mb_utils as utils - parser = argparse.ArgumentParser( description='lightgbm gbt + model transform + daal predict benchmark') diff --git a/modelbuilders_bench/mb_utils.py b/modelbuilders_bench/mb_utils.py deleted file mode 100644 index 7c54efd92..000000000 --- a/modelbuilders_bench/mb_utils.py +++ /dev/null @@ -1,71 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import json - -import numpy as np - - -def get_accuracy(true_labels, prediction): - errors = 0 - for i, true_label in enumerate(true_labels): - pred_label = 0 - if isinstance(prediction[i], (float, np.single, np.float)): - pred_label = prediction[i] > 0.5 - elif prediction[i].shape[0] == 1: - pred_label = prediction[i][0] - else: - pred_label = np.argmax(prediction[i]) - if true_label != pred_label: - errors += 1 - return 100 * (1 - errors / len(true_labels)) - - -def print_output(library, algorithm, stages, params, functions, - times, metric_type, metrics, data): - if params.output_format == 'json': - output = [] - output.append({ - 'library': library, - 'algorithm': algorithm, - 'input_data': { - 'data_format': params.data_format, - 'data_order': params.data_order, - 'data_type': str(params.dtype), - 'dataset_name': params.dataset_name, - 'rows': data[0].shape[0], - 'columns': data[0].shape[1] - } - }) - if hasattr(params, 'n_classes'): - output[-1]['input_data'].update({'classes': params.n_classes}) - for i, stage in enumerate(stages): - result = { - 'stage': stage, - } - if 'daal' in stage: - result.update({'conversion_to_daal4py': times[2 * i], - 'prediction_time': times[2 * i + 1]}) - elif 'train' in stage: - result.update({'matrix_creation_time': times[2 * i], - 'training_time': times[2 * i + 1]}) - else: - result.update({'matrix_creation_time': times[2 * i], - 'prediction_time': times[2 * i + 1]}) - if metrics[i] is not None: - result.update({f'{metric_type}': metrics[i]}) - output.append(result) - print(json.dumps(output, indent=4))
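The patches above converge on a benchmarking flow in which DMatrix construction is reported as its own stage ('training_preparation' / 'prediction_preparation') rather than being folded into training and prediction time, and accuracy/rmse are taken from bench.py helpers instead of the removed mb_utils.py. The sketch below is a minimal, standalone illustration of that timing split, not the repository's bench.py harness: the timed() helper is a hypothetical stand-in for bench.measure_function_time, the data is synthetic instead of the .npy datasets named in the configs, and tree_method is set to 'hist' so the script runs without a GPU (the GPU configs use 'gpu_hist'); the booster parameters loosely mirror the "common" block of xgb_gpu_additional_config.json.

import time

import numpy as np
import xgboost as xgb


def timed(func, *args, **kwargs):
    # Simplified stand-in for bench.measure_function_time: one wall-clock run.
    start = time.perf_counter()
    result = func(*args, **kwargs)
    return time.perf_counter() - start, result


# Synthetic stand-in for the npy datasets referenced in the configs;
# with random labels the reported accuracy will hover around 0.5.
rng = np.random.default_rng(0)
X_train = rng.standard_normal((10000, 20), dtype=np.float32)
y_train = rng.integers(0, 2, 10000)
X_test = rng.standard_normal((2000, 20), dtype=np.float32)
y_test = rng.integers(0, 2, 2000)

# Loosely mirrors the "common" block of xgb_gpu_additional_config.json;
# the GPU configs use tree_method='gpu_hist'.
xgb_params = {
    'objective': 'binary:logistic',
    'tree_method': 'hist',
    'max_depth': 8,
    'learning_rate': 0.1,
    'reg_lambda': 1,
    'max_leaves': 256,
}

# DMatrix creation is timed separately from fit/predict, matching the
# four reported stages in xgboost_bench/gbt.py after PATCH 04.
t_creat_train, dtrain = timed(xgb.DMatrix, X_train, label=y_train)
t_creat_test, dtest = timed(xgb.DMatrix, X_test, label=y_test)
fit_time, booster = timed(xgb.train, xgb_params, dtrain, num_boost_round=100)
predict_time, y_prob = timed(booster.predict, dtest)

# binary:logistic returns probabilities; threshold them for an accuracy metric.
accuracy = float(np.mean((y_prob > 0.5).astype(np.int32) == y_test))

print(f'training_preparation:   {t_creat_train:.4f} s')
print(f'training:               {fit_time:.4f} s')
print(f'prediction_preparation: {t_creat_test:.4f} s')
print(f'prediction:             {predict_time:.4f} s  (accuracy {accuracy:.3f})')

In modelbuilders_bench/xgb_mb.py and lgbm_mb.py the same four timings are followed by daal4py.get_gbt_model_from_xgboost / daal4py.get_gbt_model_from_lightgbm and a daal4py gbt prediction compute() call, which the series reports as additional 'transformation' and 'alternative_prediction' stages through the shared bench.print_output.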