
Variable nr of permutations for feature importance (#280)
koenderks authored Jan 25, 2024
1 parent b0459ff commit f39b0ee
Showing 19 changed files with 45 additions and 22 deletions.
10 changes: 5 additions & 5 deletions R/commonMachineLearningRegression.R
@@ -770,9 +770,9 @@
table <- createJaspTable(title = gettext("Feature Importance Metrics"))
table$position <- position
if (purpose == "regression") {
- table$dependOn(options = c(.mlRegressionDependencies(options), "featureImportanceTable"))
+ table$dependOn(options = c(.mlRegressionDependencies(options), "featureImportanceTable", "featureImportancePermutations"))
} else {
- table$dependOn(options = c(.mlClassificationDependencies(options), "featureImportanceTable"))
+ table$dependOn(options = c(.mlClassificationDependencies(options), "featureImportanceTable", "featureImportancePermutations"))
}
table$addColumnInfo(name = "predictor", title = "", type = "string")
table$addColumnInfo(name = "dl", title = gettext("Mean dropout loss"), type = "number")
@@ -786,13 +786,13 @@
)
.setSeedJASP(options) # Set the seed to make results reproducible
if (purpose == "regression") {
- fi <- DALEX::model_parts(result[["explainer"]], B = 50)
+ fi <- DALEX::model_parts(result[["explainer"]], B = options[["featureImportancePermutations"]])
} else if (purpose == "classification") {
- fi <- DALEX::model_parts(result[["explainer_fi"]], B = 50)
+ fi <- DALEX::model_parts(result[["explainer_fi"]], B = options[["featureImportancePermutations"]])
}
fi <- aggregate(x = fi[["dropout_loss"]], by = list(y = fi[["variable"]]), FUN = mean)
df <- data.frame(predictor = options[["predictors"]], dl = fi[match(options[["predictors"]], fi[["y"]]), "x"])
df <- df[order(-df[["dl"]]), ]
table$setData(df)
- table$addFootnote(gettext("Mean dropout loss is based on 50 permutations."))
+ table$addFootnote(gettextf("Mean dropout loss is based on %1$s permutations.", options[["featureImportancePermutations"]]))
}
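
For reference, `DALEX::model_parts()` computes permutation feature importance: each feature is shuffled `B` times and the resulting increase in the model's loss (the "dropout loss") is recorded, so a larger `B` yields a more stable estimate at proportionally higher cost. The commit replaces the hard-coded `B = 50` with the new `featureImportancePermutations` option. A minimal standalone sketch of the pattern — the `lm` model, `mtcars` data, and plain-list `options` are illustrative stand-ins, not part of JASP:

```r
library(DALEX)

# Stand-in for the JASP options object; in JASP this value comes from the new
# "Permutations" field (default 50, minimum 10).
options <- list(featureImportancePermutations = 50)

fit <- lm(mpg ~ ., data = mtcars)
explainer <- DALEX::explain(fit, data = mtcars[, -1], y = mtcars$mpg, verbose = FALSE)

set.seed(1)  # JASP does this via .setSeedJASP(options) so results are reproducible
fi <- DALEX::model_parts(explainer, B = options[["featureImportancePermutations"]])

# Average the dropout loss over the B permutation rounds, as the code above does
fi <- aggregate(x = fi[["dropout_loss"]], by = list(y = fi[["variable"]]), FUN = mean)
fi[order(-fi[["x"]]), ]
```

Appending `"featureImportancePermutations"` to each table's `dependOn()` is what makes the table recompute when the user changes the value; without it, JASP would keep serving the cached result.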
10 changes: 5 additions & 5 deletions R/mlRegressionBoosting.R
@@ -160,9 +160,9 @@ mlRegressionBoosting <- function(jaspResults, dataset, options, ...) {
table <- createJaspTable(title = gettext("Feature Importance Metrics"))
table$position <- position
if (purpose == "regression") {
table$dependOn(options = c("featureImportanceTable", .mlRegressionDependencies()))
table$dependOn(options = c("featureImportanceTable", .mlRegressionDependencies(), "featureImportancePermutations"))
} else {
table$dependOn(options = c("featureImportanceTable", .mlClassificationDependencies()))
table$dependOn(options = c("featureImportanceTable", .mlClassificationDependencies(), "featureImportancePermutations"))
}
table$addColumnInfo(name = "predictor", title = "", type = "string")
table$addColumnInfo(name = "relIn", title = gettext("Relative Influence"), type = "number")
@@ -180,13 +180,13 @@
table[["relIn"]] <- result[["relInf"]]$rel.inf
.setSeedJASP(options) # Set the seed to make results reproducible
if (purpose == "regression") {
- fi <- DALEX::model_parts(result[["explainer"]], B = 50)
+ fi <- DALEX::model_parts(result[["explainer"]], B = options[["featureImportancePermutations"]])
} else if (purpose == "classification") {
- fi <- DALEX::model_parts(result[["explainer_fi"]], B = 50)
+ fi <- DALEX::model_parts(result[["explainer_fi"]], B = options[["featureImportancePermutations"]])
}
fi <- aggregate(x = fi[["dropout_loss"]], by = list(y = fi[["variable"]]), FUN = mean)
table[["dl"]] <- fi[match(vars, fi[["y"]]), "x"]
- table$addFootnote(gettext("Mean dropout loss is based on 50 permutations."))
+ table$addFootnote(gettextf("Mean dropout loss is based on %1$s permutations.", options[["featureImportancePermutations"]]))
}

.mlBoostingPlotOobImprovement <- function(options, jaspResults, ready, position, purpose) {
8 changes: 4 additions & 4 deletions R/mlRegressionDecisionTree.R
@@ -145,7 +145,7 @@ mlRegressionDecisionTree <- function(jaspResults, dataset, options, state = NULL
table <- createJaspTable(title = gettext("Feature Importance Metrics"))
table$position <- position
table$dependOn(options = c(
"featureImportanceTable", "trainingDataManual", "scaleVariables", "target", "predictors", "seed", "setSeed",
"featureImportanceTable", "featureImportancePermutations", "trainingDataManual", "scaleVariables", "target", "predictors", "seed", "setSeed",
"testSetIndicatorVariable", "testSetIndicator", "holdoutData", "testDataManual", "minObservationsForSplit", "minObservationsInNode", "interactionDepth", "complexityParameter"
))
table$addColumnInfo(name = "predictor", title = " ", type = "string")
@@ -169,13 +169,13 @@
table[["imp"]] <- as.numeric(varImpOrder) / sum(as.numeric(varImpOrder)) * 100
.setSeedJASP(options) # Set the seed to make results reproducible
if (purpose == "regression") {
- fi <- DALEX::model_parts(result[["explainer"]], B = 50)
+ fi <- DALEX::model_parts(result[["explainer"]], B = options[["featureImportancePermutations"]])
} else if (purpose == "classification") {
- fi <- DALEX::model_parts(result[["explainer_fi"]], B = 50)
+ fi <- DALEX::model_parts(result[["explainer_fi"]], B = options[["featureImportancePermutations"]])
}
fi <- aggregate(x = fi[["dropout_loss"]], by = list(y = fi[["variable"]]), FUN = mean)
table[["dl"]] <- fi[match(vars, fi[["y"]]), "x"]
- table$addFootnote(gettext("Mean dropout loss is based on 50 permutations."))
+ table$addFootnote(gettextf("Mean dropout loss is based on %1$s permutations.", options[["featureImportancePermutations"]]))
}

.mlDecisionTreeTableSplits <- function(options, jaspResults, ready, position, purpose) {
10 changes: 5 additions & 5 deletions R/mlRegressionRandomForest.R
@@ -162,9 +162,9 @@ mlRegressionRandomForest <- function(jaspResults, dataset, options, ...) {
table <- createJaspTable(title = gettext("Feature Importance Metrics"))
table$position <- position
if (purpose == "regression") {
table$dependOn(options = c("featureImportanceTable", .mlRegressionDependencies()))
table$dependOn(options = c("featureImportanceTable", .mlRegressionDependencies(), "featureImportancePermutations"))
} else {
table$dependOn(options = c("featureImportanceTable", .mlClassificationDependencies()))
table$dependOn(options = c("featureImportanceTable", .mlClassificationDependencies(), "featureImportancePermutations"))
}
table$addColumnInfo(name = "predictor", title = " ", type = "string")
table$addColumnInfo(name = "MDiA", title = gettext("Mean decrease in accuracy"), type = "number")
@@ -184,13 +184,13 @@
table[["MDiNI"]] <- result[["varImp"]]$TotalDecrNodeImp
.setSeedJASP(options) # Set the seed to make results reproducible
if (purpose == "regression") {
- fi <- DALEX::model_parts(result[["explainer"]], B = 50)
+ fi <- DALEX::model_parts(result[["explainer"]], B = options[["featureImportancePermutations"]])
} else if (purpose == "classification") {
- fi <- DALEX::model_parts(result[["explainer_fi"]], B = 50)
+ fi <- DALEX::model_parts(result[["explainer_fi"]], B = options[["featureImportancePermutations"]])
}
fi <- aggregate(x = fi[["dropout_loss"]], by = list(y = fi[["variable"]]), FUN = mean)
table[["dl"]] <- fi[match(vars, fi[["y"]]), "x"]
- table$addFootnote(gettext("Mean dropout loss is based on 50 permutations."))
+ table$addFootnote(gettextf("Mean dropout loss is based on %1$s permutations.", options[["featureImportancePermutations"]]))
}

.mlRandomForestPlotError <- function(options, jaspResults, ready, position, purpose) {
1 change: 1 addition & 0 deletions inst/help/mlClassificationBoosting.md
@@ -18,6 +18,7 @@ Boosting works by sequentially adding features to an decision tree ensemble, eac
- Class proportions: Displays a table that shows the proportions of each class in the data set, training (and validation), and test set.
- Model performance: Shows commonly used classification evaluation metrics like precision, recall, the F1-score, support and AUC (area under the ROC curve).
- Feature importance: Shows the available feature importance metrics for the fitted model.
+ - Permutations: Sets the number of permutations on which the mean dropout loss is based.
- Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.

#### Plots
1 change: 1 addition & 0 deletions inst/help/mlClassificationKnn.md
@@ -18,6 +18,7 @@ K-nearest neighbors is a method of classification that looks at the *k* number o
- Class proportions: Displays a table that shows the proportions of each class in the data set, training (and validation), and test set.
- Model performance: Shows commonly used classification evaluation metrics like precision, recall, the F1-score, support and AUC (area under the ROC curve).
- Feature importance: Shows the available feature importance metrics for the fitted model.
+ - Permutations: Sets the number of permutations on which the mean dropout loss is based.
- Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.

#### Plots
1 change: 1 addition & 0 deletions inst/help/mlClassificationLda.md
@@ -21,6 +21,7 @@ Linear Discriminant Analysis (LDA) is a method of classification that aims to fi
- Class proportions: Displays a table that shows the proportions of each class in the data set, training (and validation), and test set.
- Model performance: Shows commonly used classification evaluation metrics like precision, recall, the F1-score, support and AUC (area under the ROC curve).
- Feature importance: Shows the available feature importance metrics for the fitted model.
+ - Permutations: Sets the number of permutations on which the mean dropout loss is based.
- Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.
- Coefficients: Shows the coefficients for the linear discriminants.
- Prior and posterior probabilities: Shows the prior and posterior group probabilities. Prior probabilities are the proportions in the training set.
1 change: 1 addition & 0 deletions inst/help/mlClassificationNeuralNetwork.md
@@ -18,6 +18,7 @@ Feedforward neural networks are predictive algorithms inspired by the biological
- Class proportions: Displays a table that shows the proportions of each class in the data set, training (and validation), and test set.
- Model performance: Shows commonly used classification evaluation metrics like precision, recall, the F1-score, support and AUC (area under the ROC curve).
- Feature importance: Shows the available feature importance metrics for the fitted model.
+ - Permutations: Sets the number of permutations on which the mean dropout loss is based.
- Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.
- Network weights: Shows the connections in the neural network together with their weights.

1 change: 1 addition & 0 deletions inst/help/mlClassificationRandomForest.md
@@ -18,6 +18,7 @@ Random Forest is a method of classification that creates a set of decision trees
- Class proportions: Displays a table that shows the proportions of each class in the data set, training (and validation), and test set.
- Model performance: Shows commonly used classification evaluation metrics like precision, recall, the F1-score, support and AUC (area under the ROC curve).
- Feature importance: Shows the available feature importance metrics for the fitted model.
+ - Permutations: Sets the number of permutations on which the mean dropout loss is based.
- Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.

#### Plots
1 change: 1 addition & 0 deletions inst/help/mlRegressionBoosting.md
@@ -16,6 +16,7 @@ Boosting works by sequentially adding features to an decision tree ensemble, eac
#### Tables
- Model performance: Shows commonly used regression evaluation metrics like mean squared error (MSE), root mean squared error (RMSE) and R<sup>2</sup>.
- Feature importance: Shows the available feature importance metrics for the fitted model.
+ - Permutations: Sets the number of permutations on which the mean dropout loss is based.
- Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.

#### Plots
1 change: 1 addition & 0 deletions inst/help/mlRegressionKnn.md
@@ -16,6 +16,7 @@ K-nearest neighbors is a method of regression that looks at the *k* number of fe
#### Tables
- Model performance: Shows commonly used regression evaluation metrics like mean squared error (MSE), root mean squared error (RMSE) and R<sup>2</sup>.
- Feature importance: Shows the available feature importance metrics for the fitted model.
+ - Permutations: Sets the number of permutations on which the mean dropout loss is based.
- Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.

#### Plots
1 change: 1 addition & 0 deletions inst/help/mlRegressionNeuralNetwork.md
@@ -16,6 +16,7 @@ Feedforward neural networks are predictive algorithms inspired by the biological
#### Tables
- Model performance: Shows commonly used regression evaluation metrics like mean squared error (MSE), root mean squared error (RMSE) and R<sup>2</sup>.
- Feature importance: Shows the available feature importance metrics for the fitted model.
+ - Permutations: Sets the number of permutations on which the mean dropout loss is based.
- Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.
- Network weights: Shows the connections in the neural network together with their weights.

1 change: 1 addition & 0 deletions inst/help/mlRegressionRandomForest.md
@@ -16,6 +16,7 @@ Random Forest is a method of regression that creates a set of decision trees tha
#### Tables
- Model performance: Shows commonly used regression evaluation metrics like mean squared error (MSE), root mean squared error (RMSE) and R<sup>2</sup>.
- Feature importance: Shows the available feature importance metrics for the fitted model.
+ - Permutations: Sets the number of permutations on which the mean dropout loss is based.
- Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.

#### Plots
1 change: 1 addition & 0 deletions inst/help/mlRegressionRegularized.md
@@ -18,6 +18,7 @@ Regularized linear regression is an adaptation of linear regression in which the
#### Tables
- Model performance: Shows commonly used regression evaluation metrics like mean squared error (MSE), root mean squared error (RMSE) and R<sup>2</sup>.
- Feature importance: Shows the available feature importance metrics for the fitted model.
+ - Permutations: Sets the number of permutations on which the mean dropout loss is based.
- Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.
- Regression coefficients: Gives the regression coefficient for each feature.

1 change: 1 addition & 0 deletions inst/help/mlclassificationdecisiontree.md
@@ -18,6 +18,7 @@ Decision Trees is a supervised learning algorithm that uses a decision tree as a
- Class proportions: Displays a table that shows the proportions of each class in the data set, training (and validation), and test set.
- Model performance: Shows commonly used classification evaluation metrics like precision, recall, the F1-score, support and AUC (area under the ROC curve).
- Feature importance: Shows the available feature importance metrics for the fitted model.
+ - Permutations: Sets the number of permutations on which the mean dropout loss is based.
- Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.
- Splits in tree: Shows the split variables, their split point, and the number of observations (which are not missing and are of positive weight) sent left or right by the split. It also shows the improvement in deviance given by this split.

1 change: 1 addition & 0 deletions inst/help/mlclassificationsvm.md
@@ -18,6 +18,7 @@ Support Vector Machines is a supervised learning algorithm that maps training ex
- Class proportions: Displays a table that shows the proportions of each class in the data set, training (and validation), and test set.
- Model performance: Shows commonly used classification evaluation metrics like precision, recall, the F1-score, support and AUC (area under the ROC curve).
- Feature importance: Shows the available feature importance metrics for the fitted model.
+ - Permutations: Sets the number of permutations on which the mean dropout loss is based.
- Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.
- Support vectors: Shows a table containing the data (points) indicated as support vectors by the algorithm.

1 change: 1 addition & 0 deletions inst/help/mlregressiondecisiontree.md
@@ -16,6 +16,7 @@ Decision Trees is a supervised learning algorithm that uses a decision tree as a
#### Tables
- Model performance: Shows commonly used regression evaluation metrics like mean squared error (MSE), root mean squared error (RMSE) and R<sup>2</sup>.
- Feature importance: Shows the available feature importance metrics for the fitted model.
+ - Permutations: Sets the number of permutations on which the mean dropout loss is based.
- Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.
- Splits in tree: Shows the split variables, their split point, and the number of observations (which are not missing and are of positive weight) sent left or right by the split. It also shows the improvement in deviance given by this split.

1 change: 1 addition & 0 deletions inst/help/mlregressionsvm.md
@@ -16,6 +16,7 @@ Support Vector Machines is a supervised learning algorithm that maps training ex
#### Tables
- Model performance: Shows commonly used regression evaluation metrics like mean squared error (MSE), root mean squared error (RMSE) and R<sup>2</sup>.
- Feature importance: Shows the available feature importance metrics for the fitted model.
+ - Permutations: Sets the number of permutations on which the mean dropout loss is based.
- Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.
- Support vectors: Shows a table containing the data (points) indicated as support vectors by the algorithm.

15 changes: 12 additions & 3 deletions inst/qml/common/tables/FeatureImportance.qml
@@ -23,7 +23,16 @@ import JASP.Widgets 1.0

CheckBox
{
name: "featureImportanceTable"
text: qsTr("Feature importance")
info: qsTr("Shows the available feature importance metrics for the fitted model.")
name: "featureImportanceTable"
text: qsTr("Feature importance")
info: qsTr("Shows the available feature importance metrics for the fitted model.")

IntegerField
{
name: "featureImportancePermutations"
text: qsTr("Permutations")
defaultValue: 50
min: 10
info: qsTr("Sets the number of permutations on which the mean dropout loss is based.")
}
}
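
The `name` property is the contract between the form and the analysis: JASP serializes the UI state into the R `options` list, so the field above arrives as `options[["featureImportancePermutations"]]` in the R code changed earlier in this commit, and nesting the `IntegerField` inside the `CheckBox` keeps it active only when the feature-importance table is requested. A hypothetical sketch of that handoff — the list literal below is illustrative, not JASP's actual serialization:

```r
# Illustrative shape of the options list this form produces; element names
# match the QML `name` properties, values reflect the UI state.
options <- list(
  featureImportanceTable = TRUE,       # the CheckBox
  featureImportancePermutations = 50   # the IntegerField (defaultValue 50, min 10)
)

B <- options[["featureImportancePermutations"]]
# The QML `min: 10` already blocks smaller values in the UI; a defensive check
# on the R side (hypothetical, not part of this commit) could mirror it:
stopifnot(is.numeric(B), B >= 10)
```

Because the footnote is built with `gettextf("Mean dropout loss is based on %1$s permutations.", ...)`, the reported count always matches whatever value is set here.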
