Variable nr of permutations for feature importance #280

Merged 1 commit on Jan 25, 2024
10 changes: 5 additions & 5 deletions R/commonMachineLearningRegression.R
@@ -770,9 +770,9 @@
   table <- createJaspTable(title = gettext("Feature Importance Metrics"))
   table$position <- position
   if (purpose == "regression") {
-    table$dependOn(options = c(.mlRegressionDependencies(options), "featureImportanceTable"))
+    table$dependOn(options = c(.mlRegressionDependencies(options), "featureImportanceTable", "featureImportancePermutations"))
   } else {
-    table$dependOn(options = c(.mlClassificationDependencies(options), "featureImportanceTable"))
+    table$dependOn(options = c(.mlClassificationDependencies(options), "featureImportanceTable", "featureImportancePermutations"))
   }
   table$addColumnInfo(name = "predictor", title = "", type = "string")
   table$addColumnInfo(name = "dl", title = gettext("Mean dropout loss"), type = "number")
@@ -786,13 +786,13 @@
   )
   .setSeedJASP(options) # Set the seed to make results reproducible
   if (purpose == "regression") {
-    fi <- DALEX::model_parts(result[["explainer"]], B = 50)
+    fi <- DALEX::model_parts(result[["explainer"]], B = options[["featureImportancePermutations"]])
   } else if (purpose == "classification") {
-    fi <- DALEX::model_parts(result[["explainer_fi"]], B = 50)
+    fi <- DALEX::model_parts(result[["explainer_fi"]], B = options[["featureImportancePermutations"]])
   }
   fi <- aggregate(x = fi[["dropout_loss"]], by = list(y = fi[["variable"]]), FUN = mean)
   df <- data.frame(predictor = options[["predictors"]], dl = fi[match(options[["predictors"]], fi[["y"]]), "x"])
   df <- df[order(-df[["dl"]]), ]
   table$setData(df)
-  table$addFootnote(gettext("Mean dropout loss is based on 50 permutations."))
+  table$addFootnote(gettextf("Mean dropout loss is based on %1$s permutations.", options[["featureImportancePermutations"]]))
 }
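Two things make the new setting work here: the option id "featureImportancePermutations" is added to the table's `dependOn()` call, so JASP recomputes the table whenever the user changes the value, and the value itself replaces the hard-coded `B = 50` in `DALEX::model_parts()`, where `B` is the number of permutation rounds. A minimal standalone sketch of the same pattern (not JASP code; the `lm()` model on `mtcars` is a stand-in for the fitted ML model):

```r
# Standalone sketch of the pattern in this hunk. Assumptions: mtcars and
# lm() stand in for the user's data and the fitted ML model; `permutations`
# plays the role of options[["featureImportancePermutations"]].
library(DALEX)

model <- lm(mpg ~ wt + hp + qsec, data = mtcars)
explainer <- DALEX::explain(model,
                            data = mtcars[, c("wt", "hp", "qsec")],
                            y = mtcars[["mpg"]],
                            verbose = FALSE)

permutations <- 50  # user-controlled in JASP after this PR
set.seed(1)         # JASP calls .setSeedJASP(options) for the same purpose
fi <- DALEX::model_parts(explainer, B = permutations)  # B = permutation rounds

# Average the dropout loss over the B permutations, as the PR does
fi <- aggregate(x = fi[["dropout_loss"]], by = list(y = fi[["variable"]]), FUN = mean)
fi[order(-fi[["x"]]), ]
```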
10 changes: 5 additions & 5 deletions R/mlRegressionBoosting.R
@@ -160,9 +160,9 @@ mlRegressionBoosting <- function(jaspResults, dataset, options, ...) {
   table <- createJaspTable(title = gettext("Feature Importance Metrics"))
   table$position <- position
   if (purpose == "regression") {
-    table$dependOn(options = c("featureImportanceTable", .mlRegressionDependencies()))
+    table$dependOn(options = c("featureImportanceTable", .mlRegressionDependencies(), "featureImportancePermutations"))
   } else {
-    table$dependOn(options = c("featureImportanceTable", .mlClassificationDependencies()))
+    table$dependOn(options = c("featureImportanceTable", .mlClassificationDependencies(), "featureImportancePermutations"))
   }
   table$addColumnInfo(name = "predictor", title = "", type = "string")
   table$addColumnInfo(name = "relIn", title = gettext("Relative Influence"), type = "number")
@@ -180,13 +180,13 @@
table[["relIn"]] <- result[["relInf"]]$rel.inf
.setSeedJASP(options) # Set the seed to make results reproducible
if (purpose == "regression") {
fi <- DALEX::model_parts(result[["explainer"]], B = 50)
fi <- DALEX::model_parts(result[["explainer"]], B = options[["featureImportancePermutations"]])
} else if (purpose == "classification") {
fi <- DALEX::model_parts(result[["explainer_fi"]], B = 50)
fi <- DALEX::model_parts(result[["explainer_fi"]], B = options[["featureImportancePermutations"]])
}
fi <- aggregate(x = fi[["dropout_loss"]], by = list(y = fi[["variable"]]), FUN = mean)
table[["dl"]] <- fi[match(vars, fi[["y"]]), "x"]
table$addFootnote(gettext("Mean dropout loss is based on 50 permutations."))
table$addFootnote(gettextf("Mean dropout loss is based on %1$s permutations.", options[["featureImportancePermutations"]]))
}

.mlBoostingPlotOobImprovement <- function(options, jaspResults, ready, position, purpose) {
8 changes: 4 additions & 4 deletions R/mlRegressionDecisionTree.R
@@ -145,7 +145,7 @@ mlRegressionDecisionTree <- function(jaspResults, dataset, options, state = NULL
   table <- createJaspTable(title = gettext("Feature Importance Metrics"))
   table$position <- position
   table$dependOn(options = c(
-    "featureImportanceTable", "trainingDataManual", "scaleVariables", "target", "predictors", "seed", "setSeed",
+    "featureImportanceTable", "featureImportancePermutations", "trainingDataManual", "scaleVariables", "target", "predictors", "seed", "setSeed",
     "testSetIndicatorVariable", "testSetIndicator", "holdoutData", "testDataManual", "minObservationsForSplit", "minObservationsInNode", "interactionDepth", "complexityParameter"
   ))
   table$addColumnInfo(name = "predictor", title = " ", type = "string")
@@ -169,13 +169,13 @@
table[["imp"]] <- as.numeric(varImpOrder) / sum(as.numeric(varImpOrder)) * 100
.setSeedJASP(options) # Set the seed to make results reproducible
if (purpose == "regression") {
fi <- DALEX::model_parts(result[["explainer"]], B = 50)
fi <- DALEX::model_parts(result[["explainer"]], B = options[["featureImportancePermutations"]])
} else if (purpose == "classification") {
fi <- DALEX::model_parts(result[["explainer_fi"]], B = 50)
fi <- DALEX::model_parts(result[["explainer_fi"]], B = options[["featureImportancePermutations"]])
}
fi <- aggregate(x = fi[["dropout_loss"]], by = list(y = fi[["variable"]]), FUN = mean)
table[["dl"]] <- fi[match(vars, fi[["y"]]), "x"]
table$addFootnote(gettext("Mean dropout loss is based on 50 permutations."))
table$addFootnote(gettextf("Mean dropout loss is based on %1$s permutations.", options[["featureImportancePermutations"]]))
}

.mlDecisionTreeTableSplits <- function(options, jaspResults, ready, position, purpose) {
10 changes: 5 additions & 5 deletions R/mlRegressionRandomForest.R
@@ -162,9 +162,9 @@ mlRegressionRandomForest <- function(jaspResults, dataset, options, ...) {
   table <- createJaspTable(title = gettext("Feature Importance Metrics"))
   table$position <- position
   if (purpose == "regression") {
-    table$dependOn(options = c("featureImportanceTable", .mlRegressionDependencies()))
+    table$dependOn(options = c("featureImportanceTable", .mlRegressionDependencies(), "featureImportancePermutations"))
   } else {
-    table$dependOn(options = c("featureImportanceTable", .mlClassificationDependencies()))
+    table$dependOn(options = c("featureImportanceTable", .mlClassificationDependencies(), "featureImportancePermutations"))
   }
   table$addColumnInfo(name = "predictor", title = " ", type = "string")
   table$addColumnInfo(name = "MDiA", title = gettext("Mean decrease in accuracy"), type = "number")
@@ -184,13 +184,13 @@
table[["MDiNI"]] <- result[["varImp"]]$TotalDecrNodeImp
.setSeedJASP(options) # Set the seed to make results reproducible
if (purpose == "regression") {
fi <- DALEX::model_parts(result[["explainer"]], B = 50)
fi <- DALEX::model_parts(result[["explainer"]], B = options[["featureImportancePermutations"]])
} else if (purpose == "classification") {
fi <- DALEX::model_parts(result[["explainer_fi"]], B = 50)
fi <- DALEX::model_parts(result[["explainer_fi"]], B = options[["featureImportancePermutations"]])
}
fi <- aggregate(x = fi[["dropout_loss"]], by = list(y = fi[["variable"]]), FUN = mean)
table[["dl"]] <- fi[match(vars, fi[["y"]]), "x"]
table$addFootnote(gettext("Mean dropout loss is based on 50 permutations."))
table$addFootnote(gettextf("Mean dropout loss is based on %1$s permutations.", options[["featureImportancePermutations"]]))
}

.mlRandomForestPlotError <- function(options, jaspResults, ready, position, purpose) {
1 change: 1 addition & 0 deletions inst/help/mlClassificationBoosting.md
@@ -18,6 +18,7 @@ Boosting works by sequentially adding features to an decision tree ensemble, eac
 - Class proportions: Displays a table that shows the proportions of each class in the data set, training (and validaton), and test set.
 - Model performance: Shows commonly used classification evaluation metrics like precision, recall, the F1-score, support and AUC (area under the ROC curve).
 - Feature importance: Shows the available feature importance metrics for the fitted model.
+- Permutations: Sets the number of permutations on which the mean dropout loss is based.
 - Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.

 #### Plots
1 change: 1 addition & 0 deletions inst/help/mlClassificationKnn.md
@@ -18,6 +18,7 @@ K-nearest neighbors is a method of classification that looks at the *k* number o
 - Class proportions: Displays a table that shows the proportions of each class in the data set, training (and validaton), and test set.
 - Model performance: Shows commonly used classification evaluation metrics like precision, recall, the F1-score, support and AUC (area under the ROC curve).
 - Feature importance: Shows the available feature importance metrics for the fitted model.
+- Permutations: Sets the number of permutations on which the mean dropout loss is based.
 - Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.

 #### Plots
1 change: 1 addition & 0 deletions inst/help/mlClassificationLda.md
@@ -21,6 +21,7 @@ Linear Discriminant Analysis (LDA) is a method of classification that aims to fi
 - Class proportions: Displays a table that shows the proportions of each class in the data set, training (and validaton), and test set.
 - Model performance: Shows commonly used classification evaluation metrics like precision, recall, the F1-score, support and AUC (area under the ROC curve).
 - Feature importance: Shows the available feature importance metrics for the fitted model.
+- Permutations: Sets the number of permutations on which the mean dropout loss is based.
 - Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.
 - Coefficients: Shows the coefficients for the linear discriminants.
 - Prior and posterior probabilities: Shows the prior and posterior group probabilities. Prior probabilities are the proportions in the training set.
1 change: 1 addition & 0 deletions inst/help/mlClassificationNeuralNetwork.md
@@ -18,6 +18,7 @@ Feedforward neural networks are predictive algorithms inspired by the biological
 - Class proportions: Displays a table that shows the proportions of each class in the data set, training (and validaton), and test set.
 - Model performance: Shows commonly used classification evaluation metrics like precision, recall, the F1-score, support and AUC (area under the ROC curve).
 - Feature importance: Shows the available feature importance metrics for the fitted model.
+- Permutations: Sets the number of permutations on which the mean dropout loss is based.
 - Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.
 - Network weights: Shows the connections in the neural network together with their weights.
1 change: 1 addition & 0 deletions inst/help/mlClassificationRandomForest.md
@@ -18,6 +18,7 @@ Random Forest is a method of classification that creates a set of decision trees
 - Class proportions: Displays a table that shows the proportions of each class in the data set, training (and validaton), and test set.
 - Model performance: Shows commonly used classification evaluation metrics like precision, recall, the F1-score, support and AUC (area under the ROC curve).
 - Feature importance: Shows the available feature importance metrics for the fitted model.
+- Permutations: Sets the number of permutations on which the mean dropout loss is based.
 - Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.

 #### Plots
1 change: 1 addition & 0 deletions inst/help/mlRegressionBoosting.md
@@ -16,6 +16,7 @@ Boosting works by sequentially adding features to an decision tree ensemble, eac
 #### Tables
 - Model performance: Shows commonly used classification evaluation metrics like mean squared error (MSE), root mean squared error (RMSE) and R<sup>2</sup>.
 - Feature importance: Shows the available feature importance metrics for the fitted model.
+- Permutations: Sets the number of permutations on which the mean dropout loss is based.
 - Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.

 #### Plots
1 change: 1 addition & 0 deletions inst/help/mlRegressionKnn.md
@@ -16,6 +16,7 @@ K-nearest neighbors is a method of regression that looks at the *k* number of fe
 #### Tables
 - Model performance: Shows commonly used classification evaluation metrics like mean squared error (MSE), root mean squared error (RMSE) and R<sup>2</sup>.
 - Feature importance: Shows the available feature importance metrics for the fitted model.
+- Permutations: Sets the number of permutations on which the mean dropout loss is based.
 - Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.

 #### Plots
1 change: 1 addition & 0 deletions inst/help/mlRegressionNeuralNetwork.md
@@ -16,6 +16,7 @@ Feedforward neural networks are predictive algorithms inspired by the biological
 #### Tables
 - Model performance: Shows commonly used classification evaluation metrics like mean squared error (MSE), root mean squared error (RMSE) and R<sup>2</sup>.
 - Feature importance: Shows the available feature importance metrics for the fitted model.
+- Permutations: Sets the number of permutations on which the mean dropout loss is based.
 - Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.
 - Network weights: Shows the connections in the neural network together with their weights.
1 change: 1 addition & 0 deletions inst/help/mlRegressionRandomForest.md
@@ -16,6 +16,7 @@ Random Forest is a method of regression that creates a set of decision trees tha
 #### Tables
 - Model performance: Shows commonly used classification evaluation metrics like mean squared error (MSE), root mean squared error (RMSE) and R<sup>2</sup>.
 - Feature importance: Shows the available feature importance metrics for the fitted model.
+- Permutations: Sets the number of permutations on which the mean dropout loss is based.
 - Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.

 #### Plots
1 change: 1 addition & 0 deletions inst/help/mlRegressionRegularized.md
@@ -18,6 +18,7 @@ Regularized linear regression is an adaptation of linear regression in which the
 #### Tables
 - Model performance: Shows commonly used classification evaluation metrics like mean squared error (MSE), root mean squared error (RMSE) and R<sup>2</sup>.
 - Feature importance: Shows the available feature importance metrics for the fitted model.
+- Permutations: Sets the number of permutations on which the mean dropout loss is based.
 - Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.
 - Regression coefficients: Gives the regression coefficient for each feature.
1 change: 1 addition & 0 deletions inst/help/mlclassificationdecisiontree.md
@@ -18,6 +18,7 @@ Decision Trees is a supervised learning algorithm that uses a decision tree as a
 - Class proportions: Displays a table that shows the proportions of each class in the data set, training (and validaton), and test set.
 - Model performance: Shows commonly used classification evaluation metrics like precision, recall, the F1-score, support and AUC (area under the ROC curve).
 - Feature importance: Shows the available feature importance metrics for the fitted model.
+- Permutations: Sets the number of permutations on which the mean dropout loss is based.
 - Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.
 - Splits in tree: Shows the split variables, their split point, and the number of observations (which are not missing and are of positive weight) sent left or right by the split. It also shows the improvement in deviance given by this split.
1 change: 1 addition & 0 deletions inst/help/mlclassificationsvm.md
@@ -18,6 +18,7 @@ Support Vector Machines is a supervised learning algorithm that maps training ex
 - Class proportions: Displays a table that shows the proportions of each class in the data set, training (and validaton), and test set.
 - Model performance: Shows commonly used classification evaluation metrics like precision, recall, the F1-score, support and AUC (area under the ROC curve).
 - Feature importance: Shows the available feature importance metrics for the fitted model.
+- Permutations: Sets the number of permutations on which the mean dropout loss is based.
 - Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.
 - Support vectors: Shows a table containing the data (points) indicated as support vectors by the algorithm.
1 change: 1 addition & 0 deletions inst/help/mlregressiondecisiontree.md
@@ -16,6 +16,7 @@ Decision Trees is a supervised learning algorithm that uses a decision tree as a
 #### Tables
 - Model performance: Shows commonly used classification evaluation metrics like mean squared error (MSE), root mean squared error (RMSE) and R<sup>2</sup>.
 - Feature importance: Shows the available feature importance metrics for the fitted model.
+- Permutations: Sets the number of permutations on which the mean dropout loss is based.
 - Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.
 - Splits in tree: Shows the split variables, their split point, and the number of observations (which are not missing and are of positive weight) sent left or right by the split. It also shows the improvement in deviance given by this split.
1 change: 1 addition & 0 deletions inst/help/mlregressionsvm.md
@@ -16,6 +16,7 @@ Support Vector Machines is a supervised learning algorithm that maps training ex
 #### Tables
 - Model performance: Shows commonly used classification evaluation metrics like mean squared error (MSE), root mean squared error (RMSE) and R<sup>2</sup>.
 - Feature importance: Shows the available feature importance metrics for the fitted model.
+- Permutations: Sets the number of permutations on which the mean dropout loss is based.
 - Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.
 - Support vectors: Shows a table containing the data (points) indicated as support vectors by the algorithm.
15 changes: 12 additions & 3 deletions inst/qml/common/tables/FeatureImportance.qml
@@ -23,7 +23,16 @@ import JASP.Widgets 1.0

 CheckBox
 {
-	name: "featureImportanceTable"
-	text: qsTr("Feature importance")
-	info: qsTr("Shows the available feature importance metrics for the fitted model.")
+	name: "featureImportanceTable"
+	text: qsTr("Feature importance")
+	info: qsTr("Shows the available feature importance metrics for the fitted model.")
+
+	IntegerField
+	{
+		name: "featureImportancePermutations"
+		text: qsTr("Permutations")
+		defaultValue: 50
+		min: 10
+		info: qsTr("Sets the number of permutations on which the mean dropout loss is based.")
+	}
 }
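The `defaultValue: 50` matches the previously hard-coded 50 permutations, so untouched analyses keep producing the same results, and `min: 10` presumably guards against estimates based on too few permutation rounds. The trade-off is Monte Carlo noise versus runtime: each extra permutation adds one more round of shuffling and re-scoring per feature. A rough sketch of how one could check this, reusing `explainer` from the earlier sketch ("wt" is an arbitrary feature, not anything from the PR):

```r
# Sketch of the stability/runtime trade-off behind the `min` bound.
# Assumes `explainer` from the earlier DALEX sketch is in scope.
mean_loss_spread <- function(B, reps = 5) {
  means <- replicate(reps, {
    fi <- DALEX::model_parts(explainer, B = B)
    mean(fi[fi[["variable"]] == "wt", "dropout_loss"])
  })
  sd(means)  # run-to-run variability of the reported mean dropout loss
}

mean_loss_spread(10)   # fewer permutations: faster but noisier
mean_loss_spread(100)  # more permutations: slower but more stable
```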