Skip to content

Commit 989317d

Browse files
committed
Added hyperparameter-tuning
1 parent 93fc861 commit 989317d

File tree

1 file changed

+214
-26
lines changed

1 file changed

+214
-26
lines changed

Stackoverflow_Survey_Analysis.ipynb

Lines changed: 214 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -14783,6 +14783,17 @@
1478314783
" return metrics"
1478414784
]
1478514785
},
14786+
{
14787+
"cell_type": "code",
14788+
"execution_count": null,
14789+
"metadata": {},
14790+
"outputs": [],
14791+
"source": [
14792+
"#For hyperparameter tuning we will use optuna\n",
14793+
"!pip install optuna\n",
14794+
"import optuna"
14795+
]
14796+
},
1478614797
{
1478714798
"cell_type": "code",
1478814799
"execution_count": 801,
@@ -14800,18 +14811,40 @@
1480014811
],
1480114812
"source": [
1480214813
"#DecisionTreeClassifier\n",
14814+
"# Define the objective function\n",
14815+
"def objective(trial):\n",
14816+
" max_depth = trial.suggest_int('max_depth', 1, 20)\n",
14817+
" min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 50)\n",
14818+
"\n",
14819+
" model = DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=min_samples_leaf)\n",
14820+
" model.fit(X_train, y_train)\n",
14821+
" \n",
14822+
" y_pred = model.predict(X_test)\n",
14823+
" accuracy = accuracy_score(y_test, y_pred)\n",
14824+
" \n",
14825+
" return accuracy\n",
14826+
"\n",
14827+
"# Create a study object and optimize the objective function\n",
14828+
"study = optuna.create_study(direction='maximize')\n",
14829+
"study.optimize(objective, n_trials=100)\n",
14830+
"\n",
14831+
"# Print the best hyperparameters\n",
14832+
"print('Best hyperparameters: ', study.best_params)\n",
14833+
"\n",
14834+
"# Train the model with the best hyperparameters\n",
14835+
"best_params = study.best_params\n",
1480314836
"start = time.time()\n",
14804-
"modelDC = DecisionTreeClassifier(max_depth = 12, min_samples_leaf = 10)\n",
14837+
"modelDC = DecisionTreeClassifier(**best_params)\n",
1480514838
"modelDC.fit(X_train, y_train)\n",
1480614839
"end = time.time()\n",
1480714840
"TimeDC = end - start\n",
1480814841
"print('Time: ', TimeDC)\n",
1480914842
"\n",
14810-
"#Evaluating model on test set\n",
14843+
"# Evaluating model on test set\n",
1481114844
"y_pred = modelDC.predict(X_test)\n",
1481214845
"all_metrics.update(metrics_data(\"Decision Trees\", y_test, y_pred))\n",
1481314846
"\n",
14814-
"#Evaluating model on train set\n",
14847+
"# Evaluating model on train set\n",
1481514848
"y_pred = modelDC.predict(X_train)\n",
1481614849
"accuracyDC2 = accuracy_score(y_train, y_pred)\n",
1481714850
"print('Accuracy on train set: {}'.format(accuracyDC2))"
@@ -14833,17 +14866,39 @@
1483314866
],
1483414867
"source": [
1483514868
"#MultinomialNB\n",
14869+
"# Define the objective function\n",
14870+
"def objective(trial):\n",
14871+
" alpha = trial.suggest_float('alpha', 1e-3, 1e-1, log=True)\n",
14872+
"\n",
14873+
" model = MultinomialNB(alpha=alpha)\n",
14874+
" model.fit(X_train, y_train)\n",
14875+
" \n",
14876+
" y_pred = model.predict(X_test)\n",
14877+
" accuracy = accuracy_score(y_test, y_pred)\n",
14878+
" \n",
14879+
" return accuracy\n",
14880+
"\n",
14881+
"# Create a study object and optimize the objective function\n",
14882+
"study = optuna.create_study(direction='maximize')\n",
14883+
"study.optimize(objective, n_trials=100)\n",
14884+
"\n",
14885+
"# Print the best hyperparameters\n",
14886+
"print('Best hyperparameters: ', study.best_params)\n",
14887+
"\n",
14888+
"# Train the model with the best hyperparameters\n",
14889+
"best_params = study.best_params\n",
1483614890
"start = time.time()\n",
14837-
"modelNB = MultinomialNB(alpha=0.005)\n",
14891+
"modelNB = MultinomialNB(**best_params)\n",
1483814892
"modelNB.fit(X_train, y_train)\n",
1483914893
"end = time.time()\n",
1484014894
"TimeNB = end - start\n",
14895+
"print('Time: ', TimeNB)\n",
1484114896
"\n",
14842-
"#Evaluating model on test set\n",
14897+
"# Evaluating model on test set\n",
1484314898
"y_pred = modelNB.predict(X_test)\n",
1484414899
"all_metrics.update(metrics_data(\"Multinomial Naive Bayes\", y_test, y_pred))\n",
1484514900
"\n",
14846-
"#Evaluating model on train set\n",
14901+
"# Evaluating model on train set\n",
1484714902
"y_pred = modelNB.predict(X_train)\n",
1484814903
"accuracyNB2 = accuracy_score(y_train, y_pred)\n",
1484914904
"print('Accuracy on train set: {}'.format(accuracyNB2))"
@@ -14931,18 +14986,40 @@
1493114986
],
1493214987
"source": [
1493314988
"#GaussianNB\n",
14989+
"\n",
14990+
"# Define the objective function\n",
14991+
"def objective(trial):\n",
14992+
" var_smoothing = trial.suggest_float('var_smoothing', 1e-11, 1e-7, log=True)\n",
14993+
"\n",
14994+
" model = GaussianNB(var_smoothing=var_smoothing)\n",
14995+
" model.fit(X_train, y_train)\n",
14996+
" \n",
14997+
" y_pred = model.predict(X_test)\n",
14998+
" accuracy = accuracy_score(y_test, y_pred)\n",
14999+
" \n",
15000+
" return accuracy\n",
15001+
"\n",
15002+
"# Create a study object and optimize the objective function\n",
15003+
"study = optuna.create_study(direction='maximize')\n",
15004+
"study.optimize(objective, n_trials=100)\n",
15005+
"\n",
15006+
"# Print the best hyperparameters\n",
15007+
"print('Best hyperparameters: ', study.best_params)\n",
15008+
"\n",
15009+
"# Train the model with the best hyperparameters\n",
15010+
"best_params = study.best_params\n",
1493415011
"start = time.time()\n",
14935-
"modelGNB = GaussianNB()\n",
15012+
"modelGNB = GaussianNB(**best_params)\n",
1493615013
"modelGNB.fit(X_train, y_train)\n",
1493715014
"end = time.time()\n",
1493815015
"TimeGNB = end - start\n",
1493915016
"print('Time: ', TimeGNB)\n",
1494015017
"\n",
14941-
"#Evaluating model on test set\n",
15018+
"# Evaluating model on test set\n",
1494215019
"y_pred = modelGNB.predict(X_test)\n",
1494315020
"all_metrics.update(metrics_data(\"Gaussian Naive Bayes\", y_test, y_pred))\n",
1494415021
"\n",
14945-
"#Evaluating model on train set\n",
15022+
"# Evaluating model on train set\n",
1494615023
"y_pred = modelGNB.predict(X_train)\n",
1494715024
"accuracyGNB2 = accuracy_score(y_train, y_pred)\n",
1494815025
"print('Accuracy on train set: {}'.format(accuracyGNB2))"
@@ -14965,18 +15042,41 @@
1496515042
],
1496615043
"source": [
1496715044
"#Logistic Regression\n",
15045+
"# Define the objective function\n",
15046+
"def objective(trial):\n",
15047+
" # Define the search space for hyperparameters\n",
15048+
" C = trial.suggest_float('C', 1e-4, 1e2, log=True)\n",
15049+
" solver = trial.suggest_categorical('solver', ['liblinear', 'lbfgs', 'newton-cg', 'sag', 'saga'])\n",
15050+
"\n",
15051+
" model = LogisticRegression(C=C, solver=solver, max_iter=1000)\n",
15052+
" model.fit(X_train, y_train)\n",
15053+
" \n",
15054+
" y_pred = model.predict(X_test)\n",
15055+
" accuracy = accuracy_score(y_test, y_pred)\n",
15056+
" \n",
15057+
" return accuracy\n",
15058+
"\n",
15059+
"# Create a study object and optimize the objective function\n",
15060+
"study = optuna.create_study(direction='maximize')\n",
15061+
"study.optimize(objective, n_trials=100)\n",
15062+
"\n",
15063+
"# Print the best hyperparameters\n",
15064+
"print('Best hyperparameters: ', study.best_params)\n",
15065+
"\n",
15066+
"# Train the model with the best hyperparameters\n",
15067+
"best_params = study.best_params\n",
1496815068
"start = time.time()\n",
14969-
"modelLR = LogisticRegression()\n",
15069+
"modelLR = LogisticRegression(**best_params, max_iter=1000)\n",
1497015070
"modelLR.fit(X_train, y_train)\n",
1497115071
"end = time.time()\n",
1497215072
"TimeLR = end - start\n",
1497315073
"print('Time: ', TimeLR)\n",
1497415074
"\n",
14975-
"#Evaluating model on test set\n",
15075+
"# Evaluating model on test set\n",
1497615076
"y_pred = modelLR.predict(X_test)\n",
1497715077
"all_metrics.update(metrics_data(\"Logistic Regression\", y_test, y_pred))\n",
1497815078
"\n",
14979-
"#Evaluating model on train set\n",
15079+
"# Evaluating model on train set\n",
1498015080
"y_pred = modelLR.predict(X_train)\n",
1498115081
"accuracyLR2 = accuracy_score(y_train, y_pred)\n",
1498215082
"print('Accuracy on train set: {}'.format(accuracyLR2))"
@@ -14999,18 +15099,51 @@
1499915099
],
1500015100
"source": [
1500115101
"#RandomForestClassifier\n",
15102+
"# Define the objective function\n",
15103+
"def objective(trial):\n",
15104+
" # Define the search space for hyperparameters\n",
15105+
" n_estimators = trial.suggest_int('n_estimators', 100, 1000)\n",
15106+
" max_depth = trial.suggest_int('max_depth', 10, 50)\n",
15107+
" min_samples_split = trial.suggest_int('min_samples_split', 2, 10)\n",
15108+
" min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)\n",
15109+
" max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])\n",
15110+
" \n",
15111+
" model = RandomForestClassifier(\n",
15112+
" n_estimators=n_estimators,\n",
15113+
" max_depth=max_depth,\n",
15114+
" min_samples_split=min_samples_split,\n",
15115+
" min_samples_leaf=min_samples_leaf,\n",
15116+
" max_features=max_features,\n",
15117+
" random_state=42\n",
15118+
" )\n",
15119+
" model.fit(X_train, y_train)\n",
15120+
" \n",
15121+
" y_pred = model.predict(X_test)\n",
15122+
" accuracy = accuracy_score(y_test, y_pred)\n",
15123+
" \n",
15124+
" return accuracy\n",
15125+
"\n",
15126+
"# Create a study object and optimize the objective function\n",
15127+
"study = optuna.create_study(direction='maximize')\n",
15128+
"study.optimize(objective, n_trials=100)\n",
15129+
"\n",
15130+
"# Print the best hyperparameters\n",
15131+
"print('Best hyperparameters: ', study.best_params)\n",
15132+
"\n",
15133+
"# Train the model with the best hyperparameters\n",
15134+
"best_params = study.best_params\n",
1500215135
"start = time.time()\n",
15003-
"rfc = RandomForestClassifier()\n",
15136+
"rfc = RandomForestClassifier(**best_params, random_state=42)\n",
1500415137
"rfc.fit(X_train, y_train)\n",
1500515138
"end = time.time()\n",
1500615139
"TimeRFC = end - start\n",
1500715140
"print('Time: ', TimeRFC)\n",
1500815141
"\n",
15009-
"#Evaluating model on test set\n",
15142+
"# Evaluating model on test set\n",
1501015143
"y_pred = rfc.predict(X_test)\n",
1501115144
"all_metrics.update(metrics_data(\"Random Forest\", y_test, y_pred))\n",
1501215145
"\n",
15013-
"#Evaluating model on train set\n",
15146+
"# Evaluating model on train set\n",
1501415147
"y_pred = rfc.predict(X_train)\n",
1501515148
"accuracyRFC2 = accuracy_score(y_train, y_pred)\n",
1501615149
"print('Accuracy on train set: {}'.format(accuracyRFC2))"
@@ -15033,18 +15166,41 @@
1503315166
],
1503415167
"source": [
1503515168
"#LinearSVC\n",
15169+
"def objective(trial):\n",
15170+
" # Define the search space for hyperparameters\n",
15171+
" C = trial.suggest_float('C', 1e-4, 1e2, log=True)\n",
15172+
" max_iter = trial.suggest_int('max_iter', 1000, 10000)\n",
15173+
" loss = trial.suggest_categorical('loss', ['hinge', 'squared_hinge'])\n",
15174+
" \n",
15175+
" model = LinearSVC(C=C, max_iter=max_iter, loss=loss, random_state=42)\n",
15176+
" model.fit(X_train, y_train)\n",
15177+
" \n",
15178+
" y_pred = model.predict(X_test)\n",
15179+
" accuracy = accuracy_score(y_test, y_pred)\n",
15180+
" \n",
15181+
" return accuracy\n",
15182+
"\n",
15183+
"# Create a study object and optimize the objective function\n",
15184+
"study = optuna.create_study(direction='maximize')\n",
15185+
"study.optimize(objective, n_trials=100)\n",
15186+
"\n",
15187+
"# Print the best hyperparameters\n",
15188+
"print('Best hyperparameters: ', study.best_params)\n",
15189+
"\n",
15190+
"# Train the model with the best hyperparameters\n",
15191+
"best_params = study.best_params\n",
1503615192
"start = time.time()\n",
15037-
"svc = LinearSVC()\n",
15038-
"svc.fit(X_train, y_train) \n",
15193+
"svc = LinearSVC(**best_params, random_state=42)\n",
15194+
"svc.fit(X_train, y_train)\n",
1503915195
"end = time.time()\n",
1504015196
"TimeSVC = end - start\n",
1504115197
"print('Time: ', TimeSVC)\n",
1504215198
"\n",
15043-
"#Evaluating model on test set\n",
15199+
"# Evaluating model on test set\n",
1504415200
"y_pred = svc.predict(X_test)\n",
1504515201
"all_metrics.update(metrics_data(\"LinearSVC\", y_test, y_pred))\n",
1504615202
"\n",
15047-
"#Evaluating model on train set\n",
15203+
"# Evaluating model on train set\n",
1504815204
"y_pred = svc.predict(X_train)\n",
1504915205
"accuracySVC2 = accuracy_score(y_train, y_pred)\n",
1505015206
"print('Accuracy on train set: {}'.format(accuracySVC2))"
@@ -15067,20 +15223,52 @@
1506715223
],
1506815224
"source": [
1506915225
"#Gradient Boosting Classifier\n",
15070-
"start = time.time()\n",
15226+
"def objective(trial):\n",
15227+
" # Define the search space for hyperparameters\n",
15228+
" n_estimators = trial.suggest_int('n_estimators', 100, 1000)\n",
15229+
" learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)\n",
15230+
" max_depth = trial.suggest_int('max_depth', 3, 20)\n",
15231+
" min_samples_split = trial.suggest_int('min_samples_split', 2, 10)\n",
15232+
" min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)\n",
15233+
" subsample = trial.suggest_float('subsample', 0.5, 1.0)\n",
15234+
" \n",
15235+
" model = GradientBoostingClassifier(\n",
15236+
" n_estimators=n_estimators,\n",
15237+
" learning_rate=learning_rate,\n",
15238+
" max_depth=max_depth,\n",
15239+
" min_samples_split=min_samples_split,\n",
15240+
" min_samples_leaf=min_samples_leaf,\n",
15241+
" subsample=subsample,\n",
15242+
" random_state=42\n",
15243+
" )\n",
15244+
" model.fit(X_train, y_train)\n",
15245+
" \n",
15246+
" y_pred = model.predict(X_test)\n",
15247+
" accuracy = accuracy_score(y_test, y_pred)\n",
15248+
" \n",
15249+
" return accuracy\n",
1507115250
"\n",
15072-
"grb= GradientBoostingClassifier()\n",
15073-
"grb.fit(X_train,y_train)\n",
15251+
"# Create a study object and optimize the objective function\n",
15252+
"study = optuna.create_study(direction='maximize')\n",
15253+
"study.optimize(objective, n_trials=100)\n",
15254+
"\n",
15255+
"# Print the best hyperparameters\n",
15256+
"print('Best hyperparameters: ', study.best_params)\n",
15257+
"\n",
15258+
"# Train the model with the best hyperparameters\n",
15259+
"best_params = study.best_params\n",
15260+
"start = time.time()\n",
15261+
"grb = GradientBoostingClassifier(**best_params, random_state=42)\n",
15262+
"grb.fit(X_train, y_train)\n",
1507415263
"end = time.time()\n",
1507515264
"Timegrb = end - start\n",
1507615265
"print('Time: ', Timegrb)\n",
1507715266
"\n",
15078-
"#Evaluating model on test set\n",
15267+
"# Evaluating model on test set\n",
1507915268
"y_pred = grb.predict(X_test)\n",
1508015269
"all_metrics.update(metrics_data(\"Gradient Boosting Classifier\", y_test, y_pred))\n",
1508115270
"\n",
15082-
"\n",
15083-
"#Evaluating model on train set\n",
15271+
"# Evaluating model on train set\n",
1508415272
"y_pred = grb.predict(X_train)\n",
1508515273
"accuracygrb2 = accuracy_score(y_train, y_pred)\n",
1508615274
"print('Accuracy on train set: {}'.format(accuracygrb2))\n"
@@ -21615,7 +21803,7 @@
2161521803
"name": "python",
2161621804
"nbconvert_exporter": "python",
2161721805
"pygments_lexer": "ipython3",
21618-
"version": "3.11.5"
21806+
"version": "3.11.3"
2161921807
}
2162021808
},
2162121809
"nbformat": 4,

0 commit comments

Comments
 (0)