Skip to content

Add the 7dav we talked about along with the std #76

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Dec 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,11 @@ Imports:
purrr,
recipes (>= 1.0.4),
rlang,
slider,
targets,
tibble,
tidyr
tidyr,
zeallot
Suggests:
ggplot2,
knitr,
Expand Down
11 changes: 10 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,19 @@ export(make_shared_grids)
export(make_target_ensemble_grid)
export(make_target_param_grid)
export(overprediction)
export(perform_sanity_checks)
export(read_external_predictions_data)
export(rolling_mean)
export(rolling_sd)
export(run_evaluation_measure)
export(run_workflow_and_format)
export(sanitize_args_predictors_trainer)
export(scaled_pop)
export(sharpness)
export(single_id)
export(slide_forecaster)
export(smoothed_scaled)
export(underprediction)
export(update_predictors)
export(weighted_interval_score)
importFrom(assertthat,assert_that)
importFrom(cli,cli_abort)
Expand Down Expand Up @@ -81,13 +85,16 @@ importFrom(epipredict,step_epi_naomit)
importFrom(epipredict,step_population_scaling)
importFrom(epipredict,step_training_window)
importFrom(epiprocess,as_epi_df)
importFrom(epiprocess,epi_slide)
importFrom(epiprocess,epix_slide)
importFrom(magrittr,"%<>%")
importFrom(magrittr,"%>%")
importFrom(purrr,imap)
importFrom(purrr,map)
importFrom(purrr,map2_vec)
importFrom(purrr,map_chr)
importFrom(purrr,map_vec)
importFrom(purrr,reduce)
importFrom(purrr,transpose)
importFrom(recipes,all_numeric)
importFrom(rlang,"!!")
Expand All @@ -96,6 +103,7 @@ importFrom(rlang,.data)
importFrom(rlang,quo)
importFrom(rlang,sym)
importFrom(rlang,syms)
importFrom(slider,slide_dbl)
importFrom(targets,tar_config_get)
importFrom(targets,tar_group)
importFrom(targets,tar_read)
Expand All @@ -105,3 +113,4 @@ importFrom(tidyr,drop_na)
importFrom(tidyr,expand_grid)
importFrom(tidyr,pivot_wider)
importFrom(tidyr,unnest)
importFrom(zeallot,"%<-%")
114 changes: 114 additions & 0 deletions R/data_transforms.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
# various reusable transforms to apply before handing to epipredict

#' extract the non-key, non-smoothed columns from epi_data
#' @keywords internal
#' @param epi_data the `epi_df`
#' @param cols vector of column names to use. If `NULL`, fill with all non-key columns
get_trainable_names <- function(epi_data, cols) {
  # explicit column choice wins; nothing to infer
  if (!is.null(cols)) {
    return(cols)
  }
  candidates <- get_nonkey_names(epi_data)
  # drop anything named like the derived rolling mean/sd columns created
  # below (suffixes such as "_m7" or "_sd28"), so we never re-smooth them
  candidates[!grepl("_\\w{1,2}\\d+", candidates)]
}

#' just the names which aren't keys for an epi_df
#' @description
#' names, but it excludes keys
#' @param epi_data the epi_df
get_nonkey_names <- function(epi_data) {
  # keys are the epi_df primary keys plus any declared other_keys;
  # NULL metadata (plain data.frame) degrades gracefully to the two defaults
  key_cols <- c("geo_value", "time_value", attr(epi_data, "metadata")$other_keys)
  all_cols <- names(epi_data)
  all_cols[!(all_cols %in% key_cols)]
}


#' update the predictors to only contain the smoothed/sd versions of cols
#' @description
#' modifies the list of predictors so that any which have been modified have the
#' modified versions included, and not the original. Should only be applied
#' after both rolling_mean and rolling_sd.
#' @param epi_data the epi_df, only included to get the non-key column names
#' @param cols_modified the list of columns which have been modified. If this is `NULL`, that means we were modifying every column.
#' @param predictors the initial set of predictors; any unmodified are kept, any modified are replaced with the modified versions (e.g. "a" becoming "a_m17").
#' @importFrom purrr map map_chr reduce
#' @return returns an updated list of predictors, with modified columns replaced and non-modified columns left intact.
#' @export
update_predictors <- function(epi_data, cols_modified, predictors) {
  if (!is.null(cols_modified)) {
    # keep any predictor whose name contains none of the modified column
    # names; `fixed = TRUE` so names are matched literally, not as regexes
    unchanged_predictors <- map(cols_modified, ~ !grepl(.x, predictors, fixed = TRUE)) %>% reduce(`&`)
    unchanged_predictors <- predictors[unchanged_predictors]
  } else {
    # if it's null, we've modified every predictor
    unchanged_predictors <- character(0L)
  }
  # all the non-key names
  col_names <- get_nonkey_names(epi_data)
  # a column is a derived version of `original_predictor` if its name contains
  # the original name but is not itself one of the original predictors
  is_present <- function(original_predictor) {
    # fixed = TRUE for consistency with the matching above; without it a
    # predictor name containing regex metacharacters would misfire here
    grepl(original_predictor, col_names, fixed = TRUE) & !(col_names %in% predictors)
  }
  is_modified <- map(predictors, is_present) %>% reduce(`|`)
  new_predictors <- col_names[is_modified]
  return(c(unchanged_predictors, new_predictors))
}

#' get a rolling average for the named columns
#' @description
#' add column(s) that are the rolling means of the specified columns, as
#' implemented by slider. Defaults to the previous 7 days.
#' Currently only group_by's on the geo_value. Should probably extend to more
#' keys if you have them
#' @param epi_data the dataset
#' @param width the number of days (or examples, the sliding isn't time-aware) to use
#' @param cols_to_mean the non-key columns to take the mean over. `NULL` means all
#' @importFrom slider slide_dbl
#' @importFrom epiprocess epi_slide
#' @export
rolling_mean <- function(epi_data, width = 7L, cols_to_mean = NULL) {
  cols_to_mean <- get_trainable_names(epi_data, cols_to_mean)
  epi_data %<>% group_by(geo_value)
  for (col in cols_to_mean) {
    # derived column naming schema, e.g. "a" -> "a_m7"; get_trainable_names
    # relies on this pattern to exclude already-derived columns
    mean_name <- paste0(col, "_m", width)
    # `before = width - 1L` plus the current row yields a `width`-row window.
    # BUGFIX: this previously passed `rm.na = TRUE`, a misspelling of `na.rm`
    # that was silently swallowed by `...`, so NAs were never removed (unlike
    # rolling_sd, which spells it correctly).
    epi_data %<>% epi_slide(~ mean(.x[[col]], na.rm = TRUE), before = width - 1L, new_col_name = mean_name)
  }
  epi_data %<>% ungroup()
  return(epi_data)
}

#' get a rolling standard deviation for the named columns
#' @description
#' A rolling standard deviation, based off of a rolling mean. First it
#' calculates a rolling mean with width `mean_width`, and then squares the
#' difference between that and the actual value, averaged over `sd_width`.
#' @param epi_data the dataset
#' @param sd_width the number of days (or examples, the sliding isn't
#'   time-aware) to use for the standard deviation calculation
#' @param mean_width like `sd_width`, but it governs the mean. Should be less
#'   than the `sd_width`, and if `NULL` (the default) it is half of `sd_width`
#'   (so 14 in the complete default case)
#' @param cols_to_sd the non-key columns to take the sd over. `NULL` means all
#' @param keep_mean bool, if `TRUE`, it retains the intermediate mean column
#' @importFrom epiprocess epi_slide
#' @export
rolling_sd <- function(epi_data, sd_width = 28L, mean_width = NULL, cols_to_sd = NULL, keep_mean = FALSE) {
  if (is.null(mean_width)) {
    mean_width <- as.integer(ceiling(sd_width / 2))
  }
  cols_to_sd <- get_trainable_names(epi_data, cols_to_sd)
  result <- epi_data
  for (col in cols_to_sd) {
    # re-group each iteration: dplyr_reconstruct below restores the
    # (ungrouped) attributes of `epi_data`, which appears to drop the
    # grouping — TODO confirm and hoist if not needed
    result %<>% group_by(geo_value)
    # naming schema, e.g. "a" -> "a_m14" and "a_sd28"
    mean_name <- paste0(col, "_m", mean_width)
    sd_name <- paste0(col, "_sd", sd_width)
    result %<>% epi_slide(~ mean(.x[[col]], na.rm = TRUE), before = mean_width - 1L, new_col_name = mean_name)
    # sd = sqrt of the mean squared deviation from the rolling mean
    # (1L literal for consistency with the integer arithmetic above)
    result %<>% epi_slide(~ sqrt(mean((.x[[mean_name]] - .x[[col]])^2, na.rm = TRUE)), before = sd_width - 1L, new_col_name = sd_name)
    if (!keep_mean) {
      # TODO make sure the extra info sticks around
      result %<>% select(-{{ mean_name }})
    }
    result %<>% dplyr_reconstruct(epi_data)
  }
  result %<>% ungroup()
  # explicit return: previously the function ended on `%<>%`, whose
  # assignment value is returned invisibly, which is fragile and unclear
  return(result)
}
4 changes: 2 additions & 2 deletions R/data_validation.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#' include empty strings
#' @param args_list the args list created by [`epipredict::arx_args_list`]
#' @export
perform_sanity_checks <- function(epi_data,
sanitize_args_predictors_trainer <- function(epi_data,
outcome,
predictors,
trainer,
Expand Down Expand Up @@ -56,7 +56,7 @@ perform_sanity_checks <- function(epi_data,
#' @export
confirm_sufficient_data <- function(epi_data, ahead, args_input, buffer = 9) {
if (!is.null(args_input$lags)) {
lag_max <- max(args_input$lags)
lag_max <- max(unlist(args_input$lags))
} else {
lag_max <- 14 # default value of 2 weeks
}
Expand Down
10 changes: 5 additions & 5 deletions R/epipredict_utilities.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,25 @@
#' add the default steps for arx_forecaster
#' @description
#' add the default steps for arx_forecaster
#' @param rec an [`epipredict::epi_recipe`]
#' @param preproc an [`epipredict::epi_recipe`]
#' @param outcome a character of the column to be predicted
#' @param predictors a character vector of the columns used as predictors
#' @param args_list an [`epipredict::arx_args_list`]
#' @seealso [arx_postprocess] for the layer equivalent
#' @importFrom epipredict step_epi_lag step_epi_ahead step_epi_naomit step_training_window
#' @export
arx_preprocess <- function(rec, outcome, predictors, args_list) {
arx_preprocess <- function(preproc, outcome, predictors, args_list) {
# input already validated
lags <- args_list$lags
for (l in seq_along(lags)) {
p <- predictors[l]
rec %<>% step_epi_lag(!!p, lag = lags[[l]])
preproc %<>% step_epi_lag(!!p, lag = lags[[l]])
}
rec %<>%
preproc %<>%
step_epi_ahead(!!outcome, ahead = args_list$ahead) %>%
step_epi_naomit() %>%
step_training_window(n_recent = args_list$n_training)
return(rec)
return(preproc)
}

# TODO replace with `layer_arx_forecaster`
Expand Down
4 changes: 1 addition & 3 deletions R/forecaster_flatline.R
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,7 @@ flatline_fc <- function(epi_data,
args_list <- do.call(flatline_args_list, args_input)
# if you want to ignore extra_sources, setting predictors is the way to do it
predictors <- c(outcome, extra_sources)
argsPredictorsTrainer <- perform_sanity_checks(epi_data, outcome, predictors, NULL, args_list)
args_list <- argsPredictorsTrainer[[1]]
predictors <- argsPredictorsTrainer[[2]]
c(args_list, predictors, trainer) %<-% sanitize_args_predictors_trainer(epi_data, outcome, predictors, NULL, args_list)
# end of the copypasta
# finally, any other pre-processing (e.g. smoothing) that isn't performed by
# epipredict
Expand Down
10 changes: 4 additions & 6 deletions R/forecaster_scaled_pop.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,10 @@
#' @param quantile_levels The quantile levels to predict. Defaults to those required by
#' covidhub.
#' @seealso some utilities for making forecasters: [format_storage],
#' [perform_sanity_checks]
#' [sanitize_args_predictors_trainer]
#' @importFrom epipredict epi_recipe step_population_scaling frosting arx_args_list layer_population_scaling
#' @importFrom tibble tibble
#' @importFrom zeallot %<-%
#' @importFrom recipes all_numeric
#' @export
scaled_pop <- function(epi_data,
Expand Down Expand Up @@ -73,13 +74,10 @@ scaled_pop <- function(epi_data,
args_input[["ahead"]] <- effective_ahead
args_input[["quantile_levels"]] <- quantile_levels
args_list <- do.call(arx_args_list, args_input)
# if you want to ignore extra_sources, setting predictors is the way to do it
# if you want to hardcode particular predictors in a particular forecaster
predictors <- c(outcome, extra_sources)
# TODO: Partial match quantile_level coming from here (on Dmitry's machine)
argsPredictorsTrainer <- perform_sanity_checks(epi_data, outcome, predictors, trainer, args_list)
args_list <- argsPredictorsTrainer[[1]]
predictors <- argsPredictorsTrainer[[2]]
trainer <- argsPredictorsTrainer[[3]]
c(args_list, predictors, trainer) %<-% sanitize_args_predictors_trainer(epi_data, outcome, predictors, trainer, args_list)
# end of the copypasta
# finally, any other pre-processing (e.g. smoothing) that isn't performed by
# epipredict
Expand Down
Loading