From 7588420403947895e836f36684e8258064e4f786 Mon Sep 17 00:00:00 2001 From: Koen Derks Date: Fri, 25 Oct 2024 14:01:53 +0200 Subject: [PATCH 1/5] Remember to overwrite preloaded data +otherwise we forget to filter NA's --- R/commonMachineLearningClassification.R | 4 +--- R/commonMachineLearningClustering.R | 4 +--- R/commonMachineLearningRegression.R | 8 ++------ R/mlRegressionRegularized.R | 4 +--- 4 files changed, 5 insertions(+), 15 deletions(-) diff --git a/R/commonMachineLearningClassification.R b/R/commonMachineLearningClassification.R index 47a2d391..5699adb8 100644 --- a/R/commonMachineLearningClassification.R +++ b/R/commonMachineLearningClassification.R @@ -42,9 +42,7 @@ } .mlClassificationReadData <- function(dataset, options) { - if (is.null(dataset)) { - dataset <- .readDataClassificationRegressionAnalyses(dataset, options) - } + dataset <- .readDataClassificationRegressionAnalyses(dataset, options) if (length(unlist(options[["predictors"]])) > 0 && options[["scaleVariables"]]) { dataset[, options[["predictors"]]] <- .scaleNumericData(dataset[, options[["predictors"]], drop = FALSE]) } diff --git a/R/commonMachineLearningClustering.R b/R/commonMachineLearningClustering.R index 16b6519f..5cff9035 100644 --- a/R/commonMachineLearningClustering.R +++ b/R/commonMachineLearningClustering.R @@ -33,9 +33,7 @@ .mlClusteringReadData <- function(dataset, options) { predictors <- unlist(options[["predictors"]]) predictors <- predictors[predictors != ""] - if (is.null(dataset)) { - dataset <- .readAndAddCompleteRowIndices(options, "predictors") - } + dataset <- .readAndAddCompleteRowIndices(options, "predictors") if (options[["scaleVariables"]] && length(unlist(options[["predictors"]])) > 0) { dataset <- .scaleNumericData(dataset) } diff --git a/R/commonMachineLearningRegression.R b/R/commonMachineLearningRegression.R index 34f9f6dc..3b1e6213 100644 --- a/R/commonMachineLearningRegression.R +++ b/R/commonMachineLearningRegression.R @@ -45,9 +45,7 @@ } .readDataRegressionAnalyses <- function(dataset, options, jaspResults) { - if (is.null(dataset)) { - dataset <- .readDataClassificationRegressionAnalyses(dataset, options) - } + dataset <- .readDataClassificationRegressionAnalyses(dataset, options) if (length(unlist(options[["predictors"]])) > 0 && options[["scaleVariables"]]) { dataset[, options[["predictors"]]] <- .scaleNumericData(dataset[, options[["predictors"]], drop = FALSE]) } @@ -72,9 +70,7 @@ options[[name2]] <- rep("scale", length(options[[name]])) } dataset <- jaspBase::readDataSetByVariableTypes(options, c(optionNames, optionNamesAsNumeric)) - complete.index <- which(complete.cases(dataset)) - dataset <- na.omit(dataset) - rownames(dataset) <- as.character(complete.index) + dataset <- jaspBase::excludeNaListwise(dataset, c(options[["target"]], options[["predictors"]])) return(dataset) } diff --git a/R/mlRegressionRegularized.R b/R/mlRegressionRegularized.R index 2e5af926..b09b5dd0 100644 --- a/R/mlRegressionRegularized.R +++ b/R/mlRegressionRegularized.R @@ -74,9 +74,7 @@ mlRegressionRegularized <- function(jaspResults, dataset, options, ...) { predictors <- unlist(options["predictors"]) predictors <- predictors[predictors != ""] - if (is.null(dataset)) { - dataset <- .readAndAddCompleteRowIndices(options, c("target", "predictors", "weights"), testSetIndicator) - } + dataset <- .readAndAddCompleteRowIndices(options, c("target", "predictors", "weights"), testSetIndicator) if (length(unlist(options[["predictors"]])) > 0 && options[["scaleVariables"]]) { dataset[, options[["predictors"]]] <- .scaleNumericData(dataset[, options[["predictors"]], drop = FALSE]) } From a6227761f4ddfc8f5bf8fb6b4194c52e6976a72c Mon Sep 17 00:00:00 2001 From: Koen Derks Date: Fri, 25 Oct 2024 16:34:30 +0200 Subject: [PATCH 2/5] Better use of preload data --- R/commonMachineLearningClassification.R | 5 +--- R/commonMachineLearningClustering.R | 2 +- R/commonMachineLearningRegression.R | 36 ++++++++++++++++++------- R/mlRegressionRegularized.R | 25 +---------------- 4 files changed, 30 insertions(+), 38 deletions(-) diff --git a/R/commonMachineLearningClassification.R b/R/commonMachineLearningClassification.R index 5699adb8..bdb12043 100644 --- a/R/commonMachineLearningClassification.R +++ b/R/commonMachineLearningClassification.R @@ -42,10 +42,7 @@ } .mlClassificationReadData <- function(dataset, options) { - dataset <- .readDataClassificationRegressionAnalyses(dataset, options) - if (length(unlist(options[["predictors"]])) > 0 && options[["scaleVariables"]]) { - dataset[, options[["predictors"]]] <- .scaleNumericData(dataset[, options[["predictors"]], drop = FALSE]) - } + dataset <- .readDataClassificationRegressionAnalyses(dataset, options, include_weights = FALSE) if (options[["target"]] != "") { dataset[, options[["target"]]] <- factor(dataset[, options[["target"]]], ordered = FALSE) } diff --git a/R/commonMachineLearningClustering.R b/R/commonMachineLearningClustering.R index 5cff9035..c13c428d 100644 --- a/R/commonMachineLearningClustering.R +++ b/R/commonMachineLearningClustering.R @@ -33,7 +33,7 @@ .mlClusteringReadData <- function(dataset, options) { predictors <- unlist(options[["predictors"]]) predictors <- predictors[predictors != ""] - dataset <- .readAndAddCompleteRowIndices(options, "predictors") + dataset <- jaspBase::excludeNaListwise(dataset, predictors) if (options[["scaleVariables"]] && length(unlist(options[["predictors"]])) > 0) { dataset <- .scaleNumericData(dataset) } diff --git a/R/commonMachineLearningRegression.R b/R/commonMachineLearningRegression.R index 3b1e6213..ede6fc57 100644 --- a/R/commonMachineLearningRegression.R +++ b/R/commonMachineLearningRegression.R @@ -44,21 +44,39 @@ return(opt) } -.readDataRegressionAnalyses <- function(dataset, options, jaspResults) { - dataset <- .readDataClassificationRegressionAnalyses(dataset, options) - if (length(unlist(options[["predictors"]])) > 0 && options[["scaleVariables"]]) { - dataset[, options[["predictors"]]] <- .scaleNumericData(dataset[, options[["predictors"]], drop = FALSE]) - } +.readDataRegressionAnalyses <- function(dataset, options, jaspResults, include_weights = FALSE) { + dataset <- .readDataClassificationRegressionAnalyses(dataset, options, include_weights) return(dataset) } -.readDataClassificationRegressionAnalyses <- function(dataset, options) { +.readDataClassificationRegressionAnalyses <- function(dataset, options, include_weights) { + target <- NULL + weights <- NULL testSetIndicator <- NULL - if (options[["testSetIndicatorVariable"]] != "" && options[["holdoutData"]] == "testSetIndicator") - testSetIndicator <- "testSetIndicatorVariable" + if (options[["target"]] != "") { + target <- options[["target"]] + } + if (include_weights && options[["weights"]] != "") { + weights <- options[["weights"]] + } + if (options[["testSetIndicatorVariable"]] != "" && options[["holdoutData"]] == "testSetIndicator") { + testSetIndicator <- options[["testSetIndicatorVariable"]] + } - return(.readAndAddCompleteRowIndices(options, c("target", "predictors"), testSetIndicator)) + predictors <- unlist(options["predictors"]) + predictors <- predictors[predictors != ""] + dataset <- jaspBase::excludeNaListwise(dataset, c(target, predictors, weights, testSetIndicator)) + + # Scale numeric predictors + if (length(unlist(options[["predictors"]])) > 0 && options[["scaleVariables"]]) { + dataset[, options[["predictors"]]] <- .scaleNumericData(dataset[, options[["predictors"]], drop = FALSE]) + } + # Make sure the test set indicator is numeric + if (options[["testSetIndicatorVariable"]] != "" && options[["holdoutData"]] == "testSetIndicator") + dataset[[options[["testSetIndicatorVariable"]]]] <- as.numeric(dataset[[options[["testSetIndicatorVariable"]]]]) + + return(dataset) } .readAndAddCompleteRowIndices <- function(options, optionNames = NULL, optionNamesAsNumeric = NULL) { diff --git a/R/mlRegressionRegularized.R b/R/mlRegressionRegularized.R index b09b5dd0..b02575ec 100644 --- a/R/mlRegressionRegularized.R +++ b/R/mlRegressionRegularized.R @@ -18,7 +18,7 @@ mlRegressionRegularized <- function(jaspResults, dataset, options, ...) { # Preparatory work - dataset <- .mlRegressionRegularizedReadData(dataset, options) + dataset <- .readDataRegressionAnalyses(dataset, options, include_weights = TRUE) .mlRegressionErrorHandling(dataset, options, type = "regularized") # Check if analysis is ready to run @@ -58,29 +58,6 @@ mlRegressionRegularized <- function(jaspResults, dataset, options, ...) { .mlRegressionRegularizedPlotLambda(options, jaspResults, ready, position = 10) } -# Read dataset -.mlRegressionRegularizedReadData <- function(dataset, options) { - target <- NULL - weights <- NULL - testSetIndicator <- NULL - if (options[["target"]] != "") { - target <- options[["target"]] - } - if (options[["weights"]] != "") { - weights <- options[["weights"]] - } - if (options[["testSetIndicatorVariable"]] != "" && options[["holdoutData"]] == "testSetIndicator") - testSetIndicator <- "testSetIndicatorVariable" - - predictors <- unlist(options["predictors"]) - predictors <- predictors[predictors != ""] - dataset <- .readAndAddCompleteRowIndices(options, c("target", "predictors", "weights"), testSetIndicator) - if (length(unlist(options[["predictors"]])) > 0 && options[["scaleVariables"]]) { - dataset[, options[["predictors"]]] <- .scaleNumericData(dataset[, options[["predictors"]], drop = FALSE]) - } - return(dataset) -} - .regularizedRegression <- function(dataset, options, jaspResults) { # Set model-specific parameters alpha <- switch(options[["penalty"]], From b8e95d24bd71745296dee4923cbb7b9983db87d4 Mon Sep 17 00:00:00 2001 From: Koen Derks Date: Fri, 25 Oct 2024 16:41:07 +0200 Subject: [PATCH 3/5] Oopsie --- R/mlRegressionLinear.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/mlRegressionLinear.R b/R/mlRegressionLinear.R index e3b9b54f..71b844c0 100644 --- a/R/mlRegressionLinear.R +++ b/R/mlRegressionLinear.R @@ -18,7 +18,7 @@ mlRegressionLinear <- function(jaspResults, dataset, options, ...) { # Preparatory work - dataset <- .mlRegressionRegularizedReadData(dataset, options) + dataset <- .readDataRegressionAnalyses(dataset, options, include_weights = TRUE) .mlRegressionErrorHandling(dataset, options, type = "lm") # Check if analysis is ready to run From 021f2e187b5467a448628b2a45ef16a34dd550b4 Mon Sep 17 00:00:00 2001 From: Koen Derks Date: Sun, 3 Nov 2024 09:38:16 +0100 Subject: [PATCH 4/5] Update mlPrediction.qml --- inst/qml/mlPrediction.qml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/qml/mlPrediction.qml b/inst/qml/mlPrediction.qml index 133850b1..04c47927 100644 --- a/inst/qml/mlPrediction.qml +++ b/inst/qml/mlPrediction.qml @@ -53,7 +53,7 @@ Form id: predictors name: "predictors" title: qsTr("Features") - allowedColumns: ["scale", "ordinal", "nominal"] + allowedColumns: ["scale", "nominal"] allowAnalysisOwnComputedColumns: false } } From c8f13be7c994432193382ab46caf7f4f034156fe Mon Sep 17 00:00:00 2001 From: Koen Derks Date: Sun, 3 Nov 2024 10:41:00 +0100 Subject: [PATCH 5/5] Make predictions add features work again --- R/mlPrediction.R | 2 +- inst/help/mlPrediction.md | 2 ++ inst/help/mlPrediction_nl.md | 2 ++ inst/qml/mlPrediction.qml | 2 +- 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/R/mlPrediction.R b/R/mlPrediction.R index 13dcee07..328e6a42 100644 --- a/R/mlPrediction.R +++ b/R/mlPrediction.R @@ -422,7 +422,7 @@ is.jaspMachineLearning <- function(x) { selection <- predictions[indexes] cols <- list(row = indexes, pred = selection) if (options[["predictionsTableFeatures"]]) { - for (i in model[["jaspVars"]][["encoded"]]$predictors) { + for (i in colnames(dataset)) { if (.columnIsNominal(i)) { table$addColumnInfo(name = i, title = i, type = "string") var <- levels(dataset[[i]])[dataset[[i]]] diff --git a/inst/help/mlPrediction.md b/inst/help/mlPrediction.md index cf63d315..30611f6f 100644 --- a/inst/help/mlPrediction.md +++ b/inst/help/mlPrediction.md @@ -1,6 +1,8 @@ Prediction === +The prediction analysis enables you to load a trained machine learning model and apply it to new data. It is important that the features in the new dataset have the same names as in the original dataset used for training. + ### Input #### Trained Model diff --git a/inst/help/mlPrediction_nl.md b/inst/help/mlPrediction_nl.md index f457a963..2cc993de 100644 --- a/inst/help/mlPrediction_nl.md +++ b/inst/help/mlPrediction_nl.md @@ -1,6 +1,8 @@ Voorspellen === +Met de voorspellingsanalyse kun je een getraind machine-learningmodel laden en toepassen op nieuwe gegevens. Het is belangrijk dat de kenmerken in de nieuwe dataset dezelfde namen hebben als in de oorspronkelijke dataset die voor de training is gebruikt. + ### Invoer #### Getraind model diff --git a/inst/qml/mlPrediction.qml b/inst/qml/mlPrediction.qml index 04c47927..5a182646 100644 --- a/inst/qml/mlPrediction.qml +++ b/inst/qml/mlPrediction.qml @@ -26,7 +26,7 @@ import "./common/tables" as TAB Form { - info: qsTr("The prediction analysis enables you to load a trained machine learning model and apply it to new data.") + info: qsTr("The prediction analysis enables you to load a trained machine learning model and apply it to new data. It is important that the features in the new dataset have the same names as in the original dataset used for training.") FileSelector {