Skip to content

Add the 7dav we talked about along with the std #76

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Dec 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,11 @@ Imports:
purrr,
recipes (>= 1.0.4),
rlang,
slider,
targets,
tibble,
tidyr
tidyr,
zeallot
Suggests:
ggplot2,
knitr,
Expand Down
11 changes: 10 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,19 @@ export(make_shared_grids)
export(make_target_ensemble_grid)
export(make_target_param_grid)
export(overprediction)
export(perform_sanity_checks)
export(read_external_predictions_data)
export(rolling_mean)
export(rolling_sd)
export(run_evaluation_measure)
export(run_workflow_and_format)
export(sanitize_args_predictors_trainer)
export(scaled_pop)
export(sharpness)
export(single_id)
export(slide_forecaster)
export(smoothed_scaled)
export(underprediction)
export(update_predictors)
export(weighted_interval_score)
importFrom(assertthat,assert_that)
importFrom(cli,cli_abort)
Expand Down Expand Up @@ -81,13 +85,16 @@ importFrom(epipredict,step_epi_naomit)
importFrom(epipredict,step_population_scaling)
importFrom(epipredict,step_training_window)
importFrom(epiprocess,as_epi_df)
importFrom(epiprocess,epi_slide)
importFrom(epiprocess,epix_slide)
importFrom(magrittr,"%<>%")
importFrom(magrittr,"%>%")
importFrom(purrr,imap)
importFrom(purrr,map)
importFrom(purrr,map2_vec)
importFrom(purrr,map_chr)
importFrom(purrr,map_vec)
importFrom(purrr,reduce)
importFrom(purrr,transpose)
importFrom(recipes,all_numeric)
importFrom(rlang,"!!")
Expand All @@ -96,6 +103,7 @@ importFrom(rlang,.data)
importFrom(rlang,quo)
importFrom(rlang,sym)
importFrom(rlang,syms)
importFrom(slider,slide_dbl)
importFrom(targets,tar_config_get)
importFrom(targets,tar_group)
importFrom(targets,tar_read)
Expand All @@ -105,3 +113,4 @@ importFrom(tidyr,drop_na)
importFrom(tidyr,expand_grid)
importFrom(tidyr,pivot_wider)
importFrom(tidyr,unnest)
importFrom(zeallot,"%<-%")
114 changes: 114 additions & 0 deletions R/data_transforms.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
# various reusable transforms to apply before handing to epipredict

#' extract the non-key, non-smoothed columns from epi_data
#' @keywords internal
#' @param epi_data the `epi_df`
#' @param cols vector of column names to use. If `NULL`, fill with all non-key columns
get_trainable_names <- function(epi_data, cols) {
  # explicit column choice wins; nothing to infer
  if (!is.null(cols)) {
    return(cols)
  }
  candidates <- get_nonkey_names(epi_data)
  # drop anything named like the derived rolling mean/sd columns created
  # below (suffixes such as "_m7" or "_sd28"), so we never re-smooth them
  candidates[!grepl("_\\w{1,2}\\d+", candidates)]
}

#' just the names which aren't keys for an epi_df
#' @description
#' names, but it excludes keys
#' @param epi_data the epi_df
get_nonkey_names <- function(epi_data) {
  # keys are the epi_df primary keys plus any declared other_keys;
  # NULL metadata (plain data.frame) degrades gracefully to the two defaults
  key_cols <- c("geo_value", "time_value", attr(epi_data, "metadata")$other_keys)
  all_cols <- names(epi_data)
  all_cols[!(all_cols %in% key_cols)]
}


#' update the predictors to only contain the smoothed/sd versions of cols
#' @description
#' modifies the list of predictors so that any which have been modified have the
#' modified versions included, and not the original. Should only be applied
#' after both rolling_mean and rolling_sd.
#' @param epi_data the epi_df, only included to get the non-key column names
#' @param cols_modified the list of columns which have been modified. If this is `NULL`, that means we were modifying every column.
#' @param predictors the initial set of predictors; any unmodified are kept, any modified are replaced with the modified versions (e.g. "a" becoming "a_m17").
#' @importFrom purrr map map_chr reduce
#' @return returns an updated list of predictors, with modified columns replaced and non-modified columns left intact.
#' @export
update_predictors <- function(epi_data, cols_modified, predictors) {
  if (!is.null(cols_modified)) {
    # keep any predictor whose name contains none of the modified column
    # names; `fixed = TRUE` so names are matched literally, not as regexes
    unchanged_predictors <- map(cols_modified, ~ !grepl(.x, predictors, fixed = TRUE)) %>% reduce(`&`)
    unchanged_predictors <- predictors[unchanged_predictors]
  } else {
    # if it's null, we've modified every predictor
    unchanged_predictors <- character(0L)
  }
  # all the non-key names
  col_names <- get_nonkey_names(epi_data)
  # a column is a derived version of `original_predictor` if its name contains
  # the original name but is not itself one of the original predictors
  is_present <- function(original_predictor) {
    # fixed = TRUE for consistency with the matching above; without it a
    # predictor name containing regex metacharacters would misfire here
    grepl(original_predictor, col_names, fixed = TRUE) & !(col_names %in% predictors)
  }
  is_modified <- map(predictors, is_present) %>% reduce(`|`)
  new_predictors <- col_names[is_modified]
  return(c(unchanged_predictors, new_predictors))
}

#' get a rolling average for the named columns
#' @description
#' add column(s) that are the rolling means of the specified columns, as
#' implemented by slider. Defaults to the previous 7 days.
#' Currently only group_by's on the geo_value. Should probably extend to more
#' keys if you have them
#' @param epi_data the dataset
#' @param width the number of days (or examples, the sliding isn't time-aware) to use
#' @param cols_to_mean the non-key columns to take the mean over. `NULL` means all
#' @importFrom slider slide_dbl
#' @importFrom epiprocess epi_slide
#' @export
rolling_mean <- function(epi_data, width = 7L, cols_to_mean = NULL) {
  cols_to_mean <- get_trainable_names(epi_data, cols_to_mean)
  epi_data %<>% group_by(geo_value)
  for (col in cols_to_mean) {
    # derived column naming schema, e.g. "a" -> "a_m7"; get_trainable_names
    # relies on this pattern to exclude already-derived columns
    mean_name <- paste0(col, "_m", width)
    # `before = width - 1L` plus the current row yields a `width`-row window.
    # BUGFIX: this previously passed `rm.na = TRUE`, a misspelling of `na.rm`
    # that was silently swallowed by `...`, so NAs were never removed (unlike
    # rolling_sd, which spells it correctly).
    epi_data %<>% epi_slide(~ mean(.x[[col]], na.rm = TRUE), before = width - 1L, new_col_name = mean_name)
  }
  epi_data %<>% ungroup()
  return(epi_data)
}

#' get a rolling standard deviation for the named columns
#' @description
#' A rolling standard deviation, based off of a rolling mean. First it
#' calculates a rolling mean with width `mean_width`, and then squares the
#' difference between that and the actual value, averaged over `sd_width`.
#' @param epi_data the dataset
#' @param sd_width the number of days (or examples, the sliding isn't
#'   time-aware) to use for the standard deviation calculation
#' @param mean_width like `sd_width`, but it governs the mean. Should be less
#'   than the `sd_width`, and if `NULL` (the default) it is half of `sd_width`
#'   (so 14 in the complete default case)
#' @param cols_to_sd the non-key columns to take the sd over. `NULL` means all
#' @param keep_mean bool, if `TRUE`, it retains the intermediate mean column
#' @importFrom epiprocess epi_slide
#' @export
rolling_sd <- function(epi_data, sd_width = 28L, mean_width = NULL, cols_to_sd = NULL, keep_mean = FALSE) {
  if (is.null(mean_width)) {
    mean_width <- as.integer(ceiling(sd_width / 2))
  }
  cols_to_sd <- get_trainable_names(epi_data, cols_to_sd)
  result <- epi_data
  for (col in cols_to_sd) {
    # re-group each iteration: dplyr_reconstruct below restores the
    # (ungrouped) attributes of `epi_data`, which appears to drop the
    # grouping — TODO confirm and hoist if not needed
    result %<>% group_by(geo_value)
    # naming schema, e.g. "a" -> "a_m14" and "a_sd28"
    mean_name <- paste0(col, "_m", mean_width)
    sd_name <- paste0(col, "_sd", sd_width)
    result %<>% epi_slide(~ mean(.x[[col]], na.rm = TRUE), before = mean_width - 1L, new_col_name = mean_name)
    # sd = sqrt of the mean squared deviation from the rolling mean
    # (1L literal for consistency with the integer arithmetic above)
    result %<>% epi_slide(~ sqrt(mean((.x[[mean_name]] - .x[[col]])^2, na.rm = TRUE)), before = sd_width - 1L, new_col_name = sd_name)
    if (!keep_mean) {
      # TODO make sure the extra info sticks around
      result %<>% select(-{{ mean_name }})
    }
    result %<>% dplyr_reconstruct(epi_data)
  }
  result %<>% ungroup()
  # explicit return: previously the function ended on `%<>%`, whose
  # assignment value is returned invisibly, which is fragile and unclear
  return(result)
}
4 changes: 2 additions & 2 deletions R/data_validation.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#' include empty strings
#' @param args_list the args list created by [`epipredict::arx_args_list`]
#' @export
perform_sanity_checks <- function(epi_data,
sanitize_args_predictors_trainer <- function(epi_data,
outcome,
predictors,
trainer,
Expand Down Expand Up @@ -56,7 +56,7 @@ perform_sanity_checks <- function(epi_data,
#' @export
confirm_sufficient_data <- function(epi_data, ahead, args_input, buffer = 9) {
if (!is.null(args_input$lags)) {
lag_max <- max(args_input$lags)
lag_max <- max(unlist(args_input$lags))
} else {
lag_max <- 14 # default value of 2 weeks
}
Expand Down
10 changes: 5 additions & 5 deletions R/epipredict_utilities.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,25 @@
#' add the default steps for arx_forecaster
#' @description
#' add the default steps for arx_forecaster
#' @param rec an [`epipredict::epi_recipe`]
#' @param preproc an [`epipredict::epi_recipe`]
#' @param outcome a character of the column to be predicted
#' @param predictors a character vector of the columns used as predictors
#' @param args_list an [`epipredict::arx_args_list`]
#' @seealso [arx_postprocess] for the layer equivalent
#' @importFrom epipredict step_epi_lag step_epi_ahead step_epi_naomit step_training_window
#' @export
arx_preprocess <- function(rec, outcome, predictors, args_list) {
arx_preprocess <- function(preproc, outcome, predictors, args_list) {
# input already validated
lags <- args_list$lags
for (l in seq_along(lags)) {
p <- predictors[l]
rec %<>% step_epi_lag(!!p, lag = lags[[l]])
preproc %<>% step_epi_lag(!!p, lag = lags[[l]])
}
rec %<>%
preproc %<>%
step_epi_ahead(!!outcome, ahead = args_list$ahead) %>%
step_epi_naomit() %>%
step_training_window(n_recent = args_list$n_training)
return(rec)
return(preproc)
}

# TODO replace with `layer_arx_forecaster`
Expand Down
4 changes: 1 addition & 3 deletions R/forecaster_flatline.R
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,7 @@ flatline_fc <- function(epi_data,
args_list <- do.call(flatline_args_list, args_input)
# if you want to ignore extra_sources, setting predictors is the way to do it
predictors <- c(outcome, extra_sources)
argsPredictorsTrainer <- perform_sanity_checks(epi_data, outcome, predictors, NULL, args_list)
args_list <- argsPredictorsTrainer[[1]]
predictors <- argsPredictorsTrainer[[2]]
c(args_list, predictors, trainer) %<-% sanitize_args_predictors_trainer(epi_data, outcome, predictors, NULL, args_list)
# end of the copypasta
# finally, any other pre-processing (e.g. smoothing) that isn't performed by
# epipredict
Expand Down
10 changes: 4 additions & 6 deletions R/forecaster_scaled_pop.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,10 @@
#' @param quantile_levels The quantile levels to predict. Defaults to those required by
#' covidhub.
#' @seealso some utilities for making forecasters: [format_storage],
#' [perform_sanity_checks]
#' [sanitize_args_predictors_trainer]
#' @importFrom epipredict epi_recipe step_population_scaling frosting arx_args_list layer_population_scaling
#' @importFrom tibble tibble
#' @importFrom zeallot %<-%
#' @importFrom recipes all_numeric
#' @export
scaled_pop <- function(epi_data,
Expand Down Expand Up @@ -73,13 +74,10 @@ scaled_pop <- function(epi_data,
args_input[["ahead"]] <- effective_ahead
args_input[["quantile_levels"]] <- quantile_levels
args_list <- do.call(arx_args_list, args_input)
# if you want to ignore extra_sources, setting predictors is the way to do it
# if you want to hardcode particular predictors in a particular forecaster
predictors <- c(outcome, extra_sources)
# TODO: Partial match quantile_level coming from here (on Dmitry's machine)
argsPredictorsTrainer <- perform_sanity_checks(epi_data, outcome, predictors, trainer, args_list)
args_list <- argsPredictorsTrainer[[1]]
predictors <- argsPredictorsTrainer[[2]]
trainer <- argsPredictorsTrainer[[3]]
c(args_list, predictors, trainer) %<-% sanitize_args_predictors_trainer(epi_data, outcome, predictors, trainer, args_list)
# end of the copypasta
# finally, any other pre-processing (e.g. smoothing) that isn't performed by
# epipredict
Expand Down
Loading