
Variable nr of permutations for feature importance (#280)
koenderks authored Jan 25, 2024
1 parent b0459ff commit f39b0ee
Showing 19 changed files with 45 additions and 22 deletions.
10 changes: 5 additions & 5 deletions R/commonMachineLearningRegression.R
@@ -770,9 +770,9 @@
table <- createJaspTable(title = gettext("Feature Importance Metrics"))
table$position <- position
if (purpose == "regression") {
- table$dependOn(options = c(.mlRegressionDependencies(options), "featureImportanceTable"))
+ table$dependOn(options = c(.mlRegressionDependencies(options), "featureImportanceTable", "featureImportancePermutations"))
} else {
- table$dependOn(options = c(.mlClassificationDependencies(options), "featureImportanceTable"))
+ table$dependOn(options = c(.mlClassificationDependencies(options), "featureImportanceTable", "featureImportancePermutations"))
}
table$addColumnInfo(name = "predictor", title = "", type = "string")
table$addColumnInfo(name = "dl", title = gettext("Mean dropout loss"), type = "number")
@@ -786,13 +786,13 @@
)
.setSeedJASP(options) # Set the seed to make results reproducible
if (purpose == "regression") {
- fi <- DALEX::model_parts(result[["explainer"]], B = 50)
+ fi <- DALEX::model_parts(result[["explainer"]], B = options[["featureImportancePermutations"]])
} else if (purpose == "classification") {
- fi <- DALEX::model_parts(result[["explainer_fi"]], B = 50)
+ fi <- DALEX::model_parts(result[["explainer_fi"]], B = options[["featureImportancePermutations"]])
}
fi <- aggregate(x = fi[["dropout_loss"]], by = list(y = fi[["variable"]]), FUN = mean)
df <- data.frame(predictor = options[["predictors"]], dl = fi[match(options[["predictors"]], fi[["y"]]), "x"])
df <- df[order(-df[["dl"]]), ]
table$setData(df)
- table$addFootnote(gettext("Mean dropout loss is based on 50 permutations."))
+ table$addFootnote(gettextf("Mean dropout loss is based on %1$s permutations.", options[["featureImportancePermutations"]]))
}
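
For reference, `DALEX::model_parts()` computes permutation feature importance: each feature is shuffled `B` times and the resulting increase in the model's loss (the "dropout loss") is recorded, so a larger `B` yields a more stable estimate at proportionally higher cost. The commit replaces the hard-coded `B = 50` with the new `featureImportancePermutations` option. A minimal standalone sketch of the pattern — the `lm` model, `mtcars` data, and plain-list `options` are illustrative stand-ins, not part of JASP:

```r
library(DALEX)

# Stand-in for the JASP options object; in JASP this value comes from the new
# "Permutations" field (default 50, minimum 10).
options <- list(featureImportancePermutations = 50)

fit <- lm(mpg ~ ., data = mtcars)
explainer <- DALEX::explain(fit, data = mtcars[, -1], y = mtcars$mpg, verbose = FALSE)

set.seed(1)  # JASP does this via .setSeedJASP(options) so results are reproducible
fi <- DALEX::model_parts(explainer, B = options[["featureImportancePermutations"]])

# Average the dropout loss over the B permutation rounds, as the code above does
fi <- aggregate(x = fi[["dropout_loss"]], by = list(y = fi[["variable"]]), FUN = mean)
fi[order(-fi[["x"]]), ]
```

Appending `"featureImportancePermutations"` to each table's `dependOn()` is what makes the table recompute when the user changes the value; without it, JASP would keep serving the cached result.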
10 changes: 5 additions & 5 deletions R/mlRegressionBoosting.R
@@ -160,9 +160,9 @@ mlRegressionBoosting <- function(jaspResults, dataset, options, ...) {
table <- createJaspTable(title = gettext("Feature Importance Metrics"))
table$position <- position
if (purpose == "regression") {
table$dependOn(options = c("featureImportanceTable", .mlRegressionDependencies()))
table$dependOn(options = c("featureImportanceTable", .mlRegressionDependencies(), "featureImportancePermutations"))
} else {
table$dependOn(options = c("featureImportanceTable", .mlClassificationDependencies()))
table$dependOn(options = c("featureImportanceTable", .mlClassificationDependencies(), "featureImportancePermutations"))
}
table$addColumnInfo(name = "predictor", title = "", type = "string")
table$addColumnInfo(name = "relIn", title = gettext("Relative Influence"), type = "number")
@@ -180,13 +180,13 @@
table[["relIn"]] <- result[["relInf"]]$rel.inf
.setSeedJASP(options) # Set the seed to make results reproducible
if (purpose == "regression") {
- fi <- DALEX::model_parts(result[["explainer"]], B = 50)
+ fi <- DALEX::model_parts(result[["explainer"]], B = options[["featureImportancePermutations"]])
} else if (purpose == "classification") {
- fi <- DALEX::model_parts(result[["explainer_fi"]], B = 50)
+ fi <- DALEX::model_parts(result[["explainer_fi"]], B = options[["featureImportancePermutations"]])
}
fi <- aggregate(x = fi[["dropout_loss"]], by = list(y = fi[["variable"]]), FUN = mean)
table[["dl"]] <- fi[match(vars, fi[["y"]]), "x"]
- table$addFootnote(gettext("Mean dropout loss is based on 50 permutations."))
+ table$addFootnote(gettextf("Mean dropout loss is based on %1$s permutations.", options[["featureImportancePermutations"]]))
}

.mlBoostingPlotOobImprovement <- function(options, jaspResults, ready, position, purpose) {
8 changes: 4 additions & 4 deletions R/mlRegressionDecisionTree.R
@@ -145,7 +145,7 @@ mlRegressionDecisionTree <- function(jaspResults, dataset, options, state = NULL
table <- createJaspTable(title = gettext("Feature Importance Metrics"))
table$position <- position
table$dependOn(options = c(
"featureImportanceTable", "trainingDataManual", "scaleVariables", "target", "predictors", "seed", "setSeed",
"featureImportanceTable", "featureImportancePermutations", "trainingDataManual", "scaleVariables", "target", "predictors", "seed", "setSeed",
"testSetIndicatorVariable", "testSetIndicator", "holdoutData", "testDataManual", "minObservationsForSplit", "minObservationsInNode", "interactionDepth", "complexityParameter"
))
table$addColumnInfo(name = "predictor", title = " ", type = "string")
@@ -169,13 +169,13 @@
table[["imp"]] <- as.numeric(varImpOrder) / sum(as.numeric(varImpOrder)) * 100
.setSeedJASP(options) # Set the seed to make results reproducible
if (purpose == "regression") {
- fi <- DALEX::model_parts(result[["explainer"]], B = 50)
+ fi <- DALEX::model_parts(result[["explainer"]], B = options[["featureImportancePermutations"]])
} else if (purpose == "classification") {
- fi <- DALEX::model_parts(result[["explainer_fi"]], B = 50)
+ fi <- DALEX::model_parts(result[["explainer_fi"]], B = options[["featureImportancePermutations"]])
}
fi <- aggregate(x = fi[["dropout_loss"]], by = list(y = fi[["variable"]]), FUN = mean)
table[["dl"]] <- fi[match(vars, fi[["y"]]), "x"]
- table$addFootnote(gettext("Mean dropout loss is based on 50 permutations."))
+ table$addFootnote(gettextf("Mean dropout loss is based on %1$s permutations.", options[["featureImportancePermutations"]]))
}

.mlDecisionTreeTableSplits <- function(options, jaspResults, ready, position, purpose) {
10 changes: 5 additions & 5 deletions R/mlRegressionRandomForest.R
@@ -162,9 +162,9 @@ mlRegressionRandomForest <- function(jaspResults, dataset, options, ...) {
table <- createJaspTable(title = gettext("Feature Importance Metrics"))
table$position <- position
if (purpose == "regression") {
table$dependOn(options = c("featureImportanceTable", .mlRegressionDependencies()))
table$dependOn(options = c("featureImportanceTable", .mlRegressionDependencies(), "featureImportancePermutations"))
} else {
table$dependOn(options = c("featureImportanceTable", .mlClassificationDependencies()))
table$dependOn(options = c("featureImportanceTable", .mlClassificationDependencies(), "featureImportancePermutations"))
}
table$addColumnInfo(name = "predictor", title = " ", type = "string")
table$addColumnInfo(name = "MDiA", title = gettext("Mean decrease in accuracy"), type = "number")
@@ -184,13 +184,13 @@
table[["MDiNI"]] <- result[["varImp"]]$TotalDecrNodeImp
.setSeedJASP(options) # Set the seed to make results reproducible
if (purpose == "regression") {
- fi <- DALEX::model_parts(result[["explainer"]], B = 50)
+ fi <- DALEX::model_parts(result[["explainer"]], B = options[["featureImportancePermutations"]])
} else if (purpose == "classification") {
- fi <- DALEX::model_parts(result[["explainer_fi"]], B = 50)
+ fi <- DALEX::model_parts(result[["explainer_fi"]], B = options[["featureImportancePermutations"]])
}
fi <- aggregate(x = fi[["dropout_loss"]], by = list(y = fi[["variable"]]), FUN = mean)
table[["dl"]] <- fi[match(vars, fi[["y"]]), "x"]
- table$addFootnote(gettext("Mean dropout loss is based on 50 permutations."))
+ table$addFootnote(gettextf("Mean dropout loss is based on %1$s permutations.", options[["featureImportancePermutations"]]))
}

.mlRandomForestPlotError <- function(options, jaspResults, ready, position, purpose) {
1 change: 1 addition & 0 deletions inst/help/mlClassificationBoosting.md
@@ -18,6 +18,7 @@ Boosting works by sequentially adding features to an decision tree ensemble, eac
- Class proportions: Displays a table that shows the proportions of each class in the data set, training (and validation), and test set.
- Model performance: Shows commonly used classification evaluation metrics like precision, recall, the F1-score, support and AUC (area under the ROC curve).
- Feature importance: Shows the available feature importance metrics for the fitted model.
+ - Permutations: Sets the number of permutations on which the mean dropout loss is based.
- Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.

#### Plots
1 change: 1 addition & 0 deletions inst/help/mlClassificationKnn.md
@@ -18,6 +18,7 @@ K-nearest neighbors is a method of classification that looks at the *k* number o
- Class proportions: Displays a table that shows the proportions of each class in the data set, training (and validation), and test set.
- Model performance: Shows commonly used classification evaluation metrics like precision, recall, the F1-score, support and AUC (area under the ROC curve).
- Feature importance: Shows the available feature importance metrics for the fitted model.
+ - Permutations: Sets the number of permutations on which the mean dropout loss is based.
- Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.

#### Plots
1 change: 1 addition & 0 deletions inst/help/mlClassificationLda.md
@@ -21,6 +21,7 @@ Linear Discriminant Analysis (LDA) is a method of classification that aims to fi
- Class proportions: Displays a table that shows the proportions of each class in the data set, training (and validation), and test set.
- Model performance: Shows commonly used classification evaluation metrics like precision, recall, the F1-score, support and AUC (area under the ROC curve).
- Feature importance: Shows the available feature importance metrics for the fitted model.
+ - Permutations: Sets the number of permutations on which the mean dropout loss is based.
- Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.
- Coefficients: Shows the coefficients for the linear discriminants.
- Prior and posterior probabilities: Shows the prior and posterior group probabilities. Prior probabilities are the proportions in the training set.
1 change: 1 addition & 0 deletions inst/help/mlClassificationNeuralNetwork.md
@@ -18,6 +18,7 @@ Feedforward neural networks are predictive algorithms inspired by the biological
- Class proportions: Displays a table that shows the proportions of each class in the data set, training (and validation), and test set.
- Model performance: Shows commonly used classification evaluation metrics like precision, recall, the F1-score, support and AUC (area under the ROC curve).
- Feature importance: Shows the available feature importance metrics for the fitted model.
+ - Permutations: Sets the number of permutations on which the mean dropout loss is based.
- Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.
- Network weights: Shows the connections in the neural network together with their weights.

1 change: 1 addition & 0 deletions inst/help/mlClassificationRandomForest.md
@@ -18,6 +18,7 @@ Random Forest is a method of classification that creates a set of decision trees
- Class proportions: Displays a table that shows the proportions of each class in the data set, training (and validation), and test set.
- Model performance: Shows commonly used classification evaluation metrics like precision, recall, the F1-score, support and AUC (area under the ROC curve).
- Feature importance: Shows the available feature importance metrics for the fitted model.
+ - Permutations: Sets the number of permutations on which the mean dropout loss is based.
- Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.

#### Plots
1 change: 1 addition & 0 deletions inst/help/mlRegressionBoosting.md
@@ -16,6 +16,7 @@ Boosting works by sequentially adding features to an decision tree ensemble, eac
#### Tables
- Model performance: Shows commonly used regression evaluation metrics like mean squared error (MSE), root mean squared error (RMSE) and R<sup>2</sup>.
- Feature importance: Shows the available feature importance metrics for the fitted model.
+ - Permutations: Sets the number of permutations on which the mean dropout loss is based.
- Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.

#### Plots
1 change: 1 addition & 0 deletions inst/help/mlRegressionKnn.md
@@ -16,6 +16,7 @@ K-nearest neighbors is a method of regression that looks at the *k* number of fe
#### Tables
- Model performance: Shows commonly used regression evaluation metrics like mean squared error (MSE), root mean squared error (RMSE) and R<sup>2</sup>.
- Feature importance: Shows the available feature importance metrics for the fitted model.
+ - Permutations: Sets the number of permutations on which the mean dropout loss is based.
- Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.

#### Plots
1 change: 1 addition & 0 deletions inst/help/mlRegressionNeuralNetwork.md
@@ -16,6 +16,7 @@ Feedforward neural networks are predictive algorithms inspired by the biological
#### Tables
- Model performance: Shows commonly used regression evaluation metrics like mean squared error (MSE), root mean squared error (RMSE) and R<sup>2</sup>.
- Feature importance: Shows the available feature importance metrics for the fitted model.
+ - Permutations: Sets the number of permutations on which the mean dropout loss is based.
- Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.
- Network weights: Shows the connections in the neural network together with their weights.

1 change: 1 addition & 0 deletions inst/help/mlRegressionRandomForest.md
@@ -16,6 +16,7 @@ Random Forest is a method of regression that creates a set of decision trees tha
#### Tables
- Model performance: Shows commonly used regression evaluation metrics like mean squared error (MSE), root mean squared error (RMSE) and R<sup>2</sup>.
- Feature importance: Shows the available feature importance metrics for the fitted model.
+ - Permutations: Sets the number of permutations on which the mean dropout loss is based.
- Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.

#### Plots
1 change: 1 addition & 0 deletions inst/help/mlRegressionRegularized.md
@@ -18,6 +18,7 @@ Regularized linear regression is an adaptation of linear regression in which the
#### Tables
- Model performance: Shows commonly used regression evaluation metrics like mean squared error (MSE), root mean squared error (RMSE) and R<sup>2</sup>.
- Feature importance: Shows the available feature importance metrics for the fitted model.
+ - Permutations: Sets the number of permutations on which the mean dropout loss is based.
- Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.
- Regression coefficients: Gives the regression coefficient for each feature.

1 change: 1 addition & 0 deletions inst/help/mlclassificationdecisiontree.md
@@ -18,6 +18,7 @@ Decision Trees is a supervised learning algorithm that uses a decision tree as a
- Class proportions: Displays a table that shows the proportions of each class in the data set, training (and validation), and test set.
- Model performance: Shows commonly used classification evaluation metrics like precision, recall, the F1-score, support and AUC (area under the ROC curve).
- Feature importance: Shows the available feature importance metrics for the fitted model.
+ - Permutations: Sets the number of permutations on which the mean dropout loss is based.
- Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.
- Splits in tree: Shows the split variables, their split point, and the number of observations (which are not missing and are of positive weight) sent left or right by the split. It also shows the improvement in deviance given by this split.

1 change: 1 addition & 0 deletions inst/help/mlclassificationsvm.md
@@ -18,6 +18,7 @@ Support Vector Machines is a supervised learning algorithm that maps training ex
- Class proportions: Displays a table that shows the proportions of each class in the data set, training (and validation), and test set.
- Model performance: Shows commonly used classification evaluation metrics like precision, recall, the F1-score, support and AUC (area under the ROC curve).
- Feature importance: Shows the available feature importance metrics for the fitted model.
+ - Permutations: Sets the number of permutations on which the mean dropout loss is based.
- Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.
- Support vectors: Shows a table containing the data (points) indicated as support vectors by the algorithm.

1 change: 1 addition & 0 deletions inst/help/mlregressiondecisiontree.md
@@ -16,6 +16,7 @@ Decision Trees is a supervised learning algorithm that uses a decision tree as a
#### Tables
- Model performance: Shows commonly used regression evaluation metrics like mean squared error (MSE), root mean squared error (RMSE) and R<sup>2</sup>.
- Feature importance: Shows the available feature importance metrics for the fitted model.
+ - Permutations: Sets the number of permutations on which the mean dropout loss is based.
- Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.
- Splits in tree: Shows the split variables, their split point, and the number of observations (which are not missing and are of positive weight) sent left or right by the split. It also shows the improvement in deviance given by this split.

1 change: 1 addition & 0 deletions inst/help/mlregressionsvm.md
@@ -16,6 +16,7 @@ Support Vector Machines is a supervised learning algorithm that maps training ex
#### Tables
- Model performance: Shows commonly used regression evaluation metrics like mean squared error (MSE), root mean squared error (RMSE) and R<sup>2</sup>.
- Feature importance: Shows the available feature importance metrics for the fitted model.
+ - Permutations: Sets the number of permutations on which the mean dropout loss is based.
- Explain predictions: Shows the decomposition of the model’s prediction into contributions that can be attributed to different explanatory variables.
- Support vectors: Shows a table containing the data (points) indicated as support vectors by the algorithm.

15 changes: 12 additions & 3 deletions inst/qml/common/tables/FeatureImportance.qml
@@ -23,7 +23,16 @@ import JASP.Widgets 1.0

CheckBox
{
name: "featureImportanceTable"
text: qsTr("Feature importance")
info: qsTr("Shows the available feature importance metrics for the fitted model.")
name: "featureImportanceTable"
text: qsTr("Feature importance")
info: qsTr("Shows the available feature importance metrics for the fitted model.")

IntegerField
{
name: "featureImportancePermutations"
text: qsTr("Permutations")
defaultValue: 50
min: 10
info: qsTr("Sets the number of permutations on which the mean dropout loss is based.")
}
}
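
The `name` property is the contract between the form and the analysis: JASP serializes the UI state into the R `options` list, so the field above arrives as `options[["featureImportancePermutations"]]` in the R code changed earlier in this commit, and nesting the `IntegerField` inside the `CheckBox` keeps it active only when the feature-importance table is requested. A hypothetical sketch of that handoff — the list literal below is illustrative, not JASP's actual serialization:

```r
# Illustrative shape of the options list this form produces; element names
# match the QML `name` properties, values reflect the UI state.
options <- list(
  featureImportanceTable = TRUE,       # the CheckBox
  featureImportancePermutations = 50   # the IntegerField (defaultValue 50, min 10)
)

B <- options[["featureImportancePermutations"]]
# The QML `min: 10` already blocks smaller values in the UI; a defensive check
# on the R side (hypothetical, not part of this commit) could mirror it:
stopifnot(is.numeric(B), B >= 10)
```

Because the footnote is built with `gettextf("Mean dropout loss is based on %1$s permutations.", ...)`, the reported count always matches whatever value is set here.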
