Skip to content

Commit ad28e95

Browse files
surv.xgboost model type split + add distr predictions to surv.xgboost.cox (#333)
* add type init parameter * update tests, add one more * update docs * revert back to xgboost without type argument + small refactoring * add doc template for early stopping * refactor: convert function from task to xgboost data matrix * add xgboost Cox and AFT separate learners * fix roxygen warning * remove deprecated parameter * export new xgboost learners * small doc fix * doc: change early stopping position * update aorsf doc * revert back to old doc for surv.xgboost (objective-non-specific) * add doc for prediction types doc and refactor output prediction for xgboost AFT * more parmaeter tests * revert tests back to use original xgboost implementation * add new tests * add docs for the two types of xgboost learners * correct parameter name in aorsf * fix style warnings * more styling issues fixed * fix test (using rvest 1.0.4) * add comments * refactor xgboost importance function * add distr predictions to surv.xgboost.cox via Breslow * update xgboost tests * small fix * fix importance return value * doc update * add distr breslow test for surv.xgboost.cox * add note to old xgboost survival learner * clean up return type + add online doc for it as a comment * update NEWS.md * update docs * doc improvements * supress warnings for to-be-deprecated surv.xgboost learner * update: run document() across all learners * hardcode 'objective' and 'eval_metric' learner parameters and update tests --------- Co-authored-by: Sebastian Fischer <[email protected]>
1 parent 5e291e0 commit ad28e95

File tree

102 files changed

+1311
-318
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

102 files changed

+1311
-318
lines changed

NAMESPACE

+2
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,8 @@ export(LearnerSurvRandomForestSRC)
125125
export(LearnerSurvRanger)
126126
export(LearnerSurvSVM)
127127
export(LearnerSurvXgboost)
128+
export(LearnerSurvXgboostAFT)
129+
export(LearnerSurvXgboostCox)
128130
export(create_learner)
129131
export(install_learners)
130132
export(list_mlr3learners)

NEWS.md

+1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# mlr3extralearners 0.7.1-9000
22

3+
* Added `surv.xgboost.cox` and `surv.xgboost.aft` separate survival learners. The `distr` prediction of the cox xgboost learner is now estimated via Breslow by default, and the aft xgboost learner now additionally provides a `response` prediction (survival time)
34
* Ported `surv.parametric` code to `survivalmodels`, changed `type` parameter to `form` to avoid conflict with survivalmodels's default parameter list
45
* Fix: Replace hardcoded `VectorDistribution`s from partykit and flexsurv survival learners with survival matrices (`Matdist`) (thanks to @bblodfon)
56
* Feat: Add `discrete` parameter in `surv.parametric` learner to return `Matdist` survival predictions

R/bibentries.R

+10-1
Original file line numberDiff line numberDiff line change
@@ -586,8 +586,17 @@ bibentries = c( # nolint start
586586
month = "01",
587587
journal = "University of California, Berkeley"
588588
),
589+
barnwal2022 = bibentry("article",
590+
title = "Survival Regression with Accelerated Failure Time Model in XGBoost",
591+
author = "Barnwal Avinash, Cho Hyunsu and Hocking Toby",
592+
doi = "10.1080/10618600.2022.2067548",
593+
issn = "15372715",
594+
journal = "Journal of Computational and Graphical Statistics",
595+
publisher = "American Statistical Association",
596+
year = "2022"
597+
),
589598
Kohavi1995 = bibentry("inproceedings",
590-
author = "Ron Kohavi",
599+
author = "Ron Kohavi",
591600
booktitle = "8th European Conference on Machine Learning",
592601
pages = "174--189",
593602
publisher = "Springer",

R/helpers_xgboost.R

+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
# Build an `xgb.DMatrix` from a survival `task`, encoding the target
# (time, status) in the form the requested xgboost `objective` expects.
#
# @param task A survival task with a two-column truth (time, status).
# @param objective Either "survival:cox" or "survival:aft".
# @param row_ids Optional subset of task rows; defaults to all rows.
# @return An `xgb.DMatrix` holding the feature data plus label info.
get_xgb_mat = function(task, objective, row_ids = NULL) {
  # fall back to every row of the task when no subset is requested
  if (is.null(row_ids)) row_ids = task$row_ids

  feats = task$data(rows = row_ids, cols = task$feature_names)
  surv = task$truth(rows = row_ids)
  event_times = surv[, 1]
  event_status = surv[, 2]

  if (objective == "survival:cox") { # Cox
    # xgboost's Cox objective encodes censoring via the label's sign:
    # censored rows get negative times, events keep positive times
    censored = event_status != 1
    event_times[censored] = -1L * event_times[censored]
    xgboost::xgb.DMatrix(
      data = as_numeric_matrix(feats),
      label = event_times
    )
  } else { # AFT
    # interval-censored encoding: for censored rows the event is only
    # known to happen after the observed time, so the upper bound is Inf
    lower = upper = event_times
    upper[event_status == 0] = Inf

    dmat = xgboost::xgb.DMatrix(as_numeric_matrix(feats))
    xgboost::setinfo(dmat, "label_lower_bound", lower)
    xgboost::setinfo(dmat, "label_upper_bound", upper)
    dmat
  }
}
# Extract per-feature gain-based importance scores from a fitted
# `xgb.Booster`.
#
# @param model A trained `xgb.Booster`, or NULL when nothing was stored.
# @return Named `numeric()` of Gain scores, keyed by feature name.
xgb_imp = function(model) {
  # guard: the learner has not been trained yet
  if (is.null(model)) {
    stopf("No model stored")
  }

  importance = xgboost::xgb.importance(model = model)
  set_names(importance$Gain, importance$Feature)
}

R/learner_aorsf_surv_aorsf.R

+9-11
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,8 @@ LearnerSurvAorsf = R6Class("LearnerSurvAorsf",
4545
control_type = p_fct(levels = c("fast", "cph", "net"), default = "fast", tags = "train"),
4646
split_rule = p_fct(levels = c("logrank", "cstat"), default = "logrank", tags = "train"),
4747
control_fast_do_scale = p_lgl(default = FALSE, tags = "train"),
48-
control_fast_ties = p_fct(levels = c("efron", "breslow"),
49-
default = "efron", tags = "train"),
50-
control_cph_ties = p_fct(levels = c("efron", "breslow"),
51-
default = "efron", tags = "train"),
48+
control_fast_ties = p_fct(levels = c("efron", "breslow"), default = "efron", tags = "train"),
49+
control_cph_ties = p_fct(levels = c("efron", "breslow"), default = "efron", tags = "train"),
5250
control_cph_eps = p_dbl(default = 1e-9, lower = 0, tags = "train"),
5351
control_cph_iter_max = p_int(default = 20L, lower = 1, tags = "train"),
5452
control_net_alpha = p_dbl(default = 0.5, tags = "train"),
@@ -146,13 +144,13 @@ LearnerSurvAorsf = R6Class("LearnerSurvAorsf",
146144
# these parameters are used to organize the control arguments
147145
# above but are not used directly by aorsf::orsf(), so:
148146
pv = remove_named(pv, c("control_type",
149-
"control_fast_do_scale",
150-
"control_fast_ties",
151-
"control_cph_ties",
152-
"control_cph_eps",
153-
"control_cph_iter_max",
154-
"control_net_alpha",
155-
"control_net_df_target"))
147+
"control_fast_do_scale",
148+
"control_fast_ties",
149+
"control_cph_ties",
150+
"control_cph_eps",
151+
"control_cph_iter_max",
152+
"control_net_alpha",
153+
"control_net_df_target"))
156154
invoke(
157155
aorsf::orsf,
158156
data = task$data(),

R/learner_xgboost_surv_xgboost.R

+13-47
Original file line numberDiff line numberDiff line change
@@ -6,24 +6,23 @@
66
#' eXtreme Gradient Boosting regression.
77
#' Calls [xgboost::xgb.train()] from package \CRANpkg{xgboost}.
88
#'
9+
#' **Note:** We strongly advise to use the separate [Cox][LearnerSurvXgboostCox]
10+
#' and [AFT][LearnerSurvXgboostAFT] xgboost survival learners since they represent
11+
#' two very distinct survival modeling methods and we offer more prediction
12+
#' types in the respective learners compared to the ones available here.
13+
#' This learner will be deprecated in the future.
14+
#'
915
#' @template note_xgboost
1016
#'
1117
#' @section Initial parameter values:
1218
#' - `nrounds` is initialized to 1.
1319
#' - `nthread` is initialized to 1 to avoid conflicts with parallelization via \CRANpkg{future}.
1420
#' - `verbose` is initialized to 0.
1521
#' - `objective` is initialized to `survival:cox` for survival analysis.
16-
#' @section Early stopping:
17-
#' Early stopping can be used to find the optimal number of boosting rounds.
18-
#' The `early_stopping_set` parameter controls which set is used to monitor the performance.
19-
#' Set `early_stopping_set = "test"` to monitor the performance of the model on the test set while training.
20-
#' The test set for early stopping can be set with the `"test"` row role in the [mlr3::Task].
21-
#' Additionally, the range must be set in which the performance must increase with `early_stopping_rounds` and the maximum number of boosting rounds with `nrounds`.
22-
#' While resampling, the test set is automatically applied from the [mlr3::Resampling].
23-
#' Not that using the test set for early stopping can potentially bias the performance scores.
2422
#'
2523
#' @templateVar id surv.xgboost
2624
#' @template learner
25+
#' @template section_early_stopping
2726
#'
2827
#' @references
2928
#' `r format_bib("chen_2016")`
@@ -37,6 +36,9 @@ LearnerSurvXgboost = R6Class("LearnerSurvXgboost",
3736
#' @description
3837
#' Creates a new instance of this [R6][R6::R6Class] class.
3938
initialize = function() {
39+
.Deprecated(
40+
msg = "'surv.xgboost' will be deprecated in the future. Use 'surv.xgboost.cox' or 'surv.xgboost.aft' learners instead." #nolint
41+
)
4042

4143
ps = ps(
4244
aft_loss_distribution = p_fct(c("normal", "logistic", "extreme"), default = "normal", tags = "train"),
@@ -71,7 +73,6 @@ LearnerSurvXgboost = R6Class("LearnerSurvXgboost",
7173
normalize_type = p_fct(c("tree", "forest"), default = "tree", tags = "train"),
7274
nrounds = p_int(1L, tags = "train"),
7375
nthread = p_int(1L, default = 1L, tags = c("train", "threads")),
74-
ntreelimit = p_int(1L, tags = "predict"),
7576
num_parallel_tree = p_int(1L, default = 1L, tags = "train"),
7677
objective = p_fct(c("survival:cox", "survival:aft"), default = "survival:cox", tags = c("train", "predict")),
7778
one_drop = p_lgl(default = FALSE, tags = "train"),
@@ -134,46 +135,11 @@ LearnerSurvXgboost = R6Class("LearnerSurvXgboost",
134135
#'
135136
#' @return Named `numeric()`.
136137
importance = function() {
137-
if (is.null(self$model)) {
138-
stopf("No model stored")
139-
}
140-
141-
imp = xgboost::xgb.importance(
142-
model = self$model
143-
)
144-
set_names(imp$Gain, imp$Feature)
138+
xgb_imp(self$model)
145139
}
146140
),
147141

148142
private = list(
149-
# helper function to construct an `xgb.DMatrix` object
150-
.get_data = function(task, pv, row_ids = NULL) {
151-
# use all task rows if `rows_ids` is not specified
152-
if (is.null(row_ids))
153-
row_ids = task$row_ids
154-
155-
data = task$data(rows = row_ids, cols = task$feature_names)
156-
target = task$data(rows = row_ids, cols = task$target_names)
157-
targets = task$target_names
158-
label = target[[targets[1]]] # time
159-
status = target[[targets[2]]]
160-
161-
if (pv$objective == "survival:cox") {
162-
label[status != 1] = -1L * label[status != 1]
163-
data = xgboost::xgb.DMatrix(
164-
data = as_numeric_matrix(data),
165-
label = label)
166-
} else {
167-
y_lower_bound = y_upper_bound = label
168-
y_upper_bound[status == 0] = Inf
169-
170-
data = xgboost::xgb.DMatrix(as_numeric_matrix(data))
171-
xgboost::setinfo(data, "label_lower_bound", y_lower_bound)
172-
xgboost::setinfo(data, "label_upper_bound", y_upper_bound)
173-
}
174-
data
175-
},
176-
177143
.train = function(task) {
178144

179145
pv = self$param_set$get_values(tags = "train")
@@ -188,7 +154,7 @@ LearnerSurvXgboost = R6Class("LearnerSurvXgboost",
188154
pv$eval_metric = "aft-nloglik"
189155
}
190156

191-
data = private$.get_data(task, pv)
157+
data = get_xgb_mat(task, pv$objective)
192158

193159
if ("weights" %in% task$properties) {
194160
xgboost::setinfo(data, "weight", task$weights$weight)
@@ -201,7 +167,7 @@ LearnerSurvXgboost = R6Class("LearnerSurvXgboost",
201167
}
202168

203169
if (pv$early_stopping_set == "test" && !is.null(task$row_roles$test)) {
204-
test_data = private$.get_data(task, pv, task$row_roles$test)
170+
test_data = get_xgb_mat(task, pv$objective, task$row_roles$test)
205171
pv$watchlist = c(pv$watchlist, list(test = test_data))
206172
}
207173
pv$early_stopping_set = NULL

0 commit comments

Comments
 (0)