Skip to content

Add the 7dav we talked about along with the std #76

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Dec 23, 2023
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ Imports:
purrr,
recipes (>= 1.0.4),
rlang,
slider,
targets,
tibble,
tidyr
Expand Down
8 changes: 8 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,17 @@ export(make_target_param_grid)
export(overprediction)
export(perform_sanity_checks)
export(read_external_predictions_data)
export(rolling_mean)
export(rolling_sd)
export(run_evaluation_measure)
export(run_workflow_and_format)
export(scaled_pop)
export(sharpness)
export(single_id)
export(slide_forecaster)
export(smoothed_scaled)
export(underprediction)
export(update_predictors)
export(weighted_interval_score)
importFrom(assertthat,assert_that)
importFrom(cli,cli_abort)
Expand Down Expand Up @@ -81,13 +85,16 @@ importFrom(epipredict,step_epi_naomit)
importFrom(epipredict,step_population_scaling)
importFrom(epipredict,step_training_window)
importFrom(epiprocess,as_epi_df)
importFrom(epiprocess,epi_slide)
importFrom(epiprocess,epix_slide)
importFrom(magrittr,"%<>%")
importFrom(magrittr,"%>%")
importFrom(purrr,imap)
importFrom(purrr,map)
importFrom(purrr,map2_vec)
importFrom(purrr,map_chr)
importFrom(purrr,map_vec)
importFrom(purrr,reduce)
importFrom(purrr,transpose)
importFrom(recipes,all_numeric)
importFrom(rlang,"!!")
Expand All @@ -96,6 +103,7 @@ importFrom(rlang,.data)
importFrom(rlang,quo)
importFrom(rlang,sym)
importFrom(rlang,syms)
importFrom(slider,slide_dbl)
importFrom(targets,tar_config_get)
importFrom(targets,tar_group)
importFrom(targets,tar_read)
Expand Down
114 changes: 114 additions & 0 deletions R/data_transforms.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
# various reusable transforms to apply before handing to epipredict

#' extract the non-key, non-smoothed columns from epi_data
#' @keywords internal
#' @param epi_data the `epi_df`
#' @param cols vector of column names to use. If `NULL`, fill with all non-key columns
get_trainable_names <- function(epi_data, cols) {
  if (is.null(cols)) {
    candidate_cols <- get_nonkey_names(epi_data)
    # drop anything already following the rolling mean/sd naming schema
    # (e.g. "a_m17" or "a_sd28") so derived columns aren't re-derived
    derived_name_schema <- "_\\w{1,2}\\d+"
    cols <- candidate_cols[!grepl(derived_name_schema, candidate_cols)]
  }
  return(cols)
}

#' just the names which aren't keys for an epi_df
#' @description
#' names, but it excludes keys
#' @param epi_data the epi_df
get_nonkey_names <- function(epi_data) {
  # keys are the standard epi_df keys plus any other_keys recorded in metadata
  key_cols <- c("geo_value", "time_value", attr(epi_data, "metadata")$other_keys)
  all_names <- names(epi_data)
  all_names[!(all_names %in% key_cols)]
}


#' update the predictors to only contain the smoothed/sd versions of cols
#' @description
#' modifies the list of predictors so that any which have been modified have the
#' modified versions included, and not the original. Should only be applied
#' after both rolling_mean and rolling_sd.
#' @param epi_data the epi_df, only included to get the non-key column names
#' @param cols_modified the list of columns which have been modified. If this is `NULL`, that means we were modifying every column.
#' @param predictors the initial set of predictors; any unmodified are kept, any modified are replaced with the modified versions (e.g. "a" becoming "a_m17").
#' @importFrom purrr map map_chr reduce
#' @return returns an updated list of predictors, with modified columns replaced and non-modified columns left intact.
#' @export
update_predictors <- function(epi_data, cols_modified, predictors) {
  if (!is.null(cols_modified)) {
    # if cols_modified isn't null, make sure we include predictors that weren't modified
    unchanged_predictors <- map(cols_modified, ~ !grepl(.x, predictors, fixed = TRUE)) %>% reduce(`&`)
    unchanged_predictors <- predictors[unchanged_predictors]
  } else {
    # if it's null, we've modified every predictor
    unchanged_predictors <- character(0L)
  }
  # all the non-key names
  col_names <- get_nonkey_names(epi_data)
  # a column counts as a modified version of `original_predictor` when its name
  # contains the original name as a literal substring (e.g. "a_m17" for "a")
  # but isn't itself one of the requested predictors. `fixed = TRUE` matches
  # the name literally, consistent with the grepl call above; without it a
  # predictor name containing regex metacharacters (e.g. ".") would mis-match.
  is_present <- function(original_predictor) {
    grepl(original_predictor, col_names, fixed = TRUE) & !(col_names %in% predictors)
  }
  is_modified <- map(predictors, is_present) %>% reduce(`|`)
  new_predictors <- col_names[is_modified]
  return(c(unchanged_predictors, new_predictors))
}

#' get a rolling average for the named columns
#' @description
#' add column(s) that are the rolling means of the specified columns, as
#' implemented by slider. Defaults to the previous 7 days.
#' Currently only group_by's on the geo_value. Should probably extend to more
#' keys if you have them
#' @param epi_data the dataset
#' @param width the number of days (or examples, the sliding isn't time-aware) to use
#' @param cols_to_mean the non-key columns to take the mean over. `NULL` means all
#' @return the input `epi_df` with one new column per smoothed column, named
#'   `<col>_m<width>` (e.g. "a_m7"); this naming schema is what
#'   `get_trainable_names` excludes and `update_predictors` picks up.
#' @importFrom slider slide_dbl
#' @importFrom epiprocess epi_slide
#' @export
rolling_mean <- function(epi_data, width = 7L, cols_to_mean = NULL) {
  # NULL means "every non-key column not already derived by rolling_mean/rolling_sd"
  cols_to_mean <- get_trainable_names(epi_data, cols_to_mean)
  # slide per geo_value so windows never cross locations
  epi_data %<>% group_by(geo_value)
  for (col in cols_to_mean) {
    mean_name <- paste0(col, "_m", width)
    # NOTE(review): `before = width` presumably makes the window the current
    # day plus `width` days before it (width + 1 points), not exactly `width`
    # days — confirm against epi_slide's window semantics
    epi_data %<>% epi_slide(~ mean(.x[[col]]), before = width, new_col_name = mean_name)
  }
  epi_data %<>% ungroup()
  return(epi_data)
}

#' get a rolling standard deviation for the named columns
#' @description
#' A rolling standard deviation, based off of a rolling mean. First it
#' calculates a rolling mean with width `mean_width`, and then squares the
#' difference between that and the actual value, averaged over `sd_width`.
#' @param epi_data the dataset
#' @param sd_width the number of days (or examples, the sliding isn't
#'   time-aware) to use for the standard deviation calculation
#' @param mean_width like `sd_width`, but it governs the mean. Should be less
#'   than the `sd_width`, and if `NULL` (the default) it is half of `sd_width`
#'   (so 14 in the complete default case)
#' @param cols_to_sd the non-key columns to take the sd over. `NULL` means all
#' @param keep_mean bool, if `TRUE`, it retains the intermediate mean column
#' @return the input `epi_df` with one new `<col>_sd<sd_width>` column per
#'   selected column (plus `<col>_m<mean_width>` when `keep_mean` is `TRUE`)
#' @importFrom epiprocess epi_slide
#' @export
rolling_sd <- function(epi_data, sd_width = 28L, mean_width = NULL, cols_to_sd = NULL, keep_mean = FALSE) {
  if (is.null(mean_width)) {
    mean_width <- as.integer(ceiling(sd_width / 2))
  }
  # NULL means "every non-key column not already derived by rolling_mean/rolling_sd"
  cols_to_sd <- get_trainable_names(epi_data, cols_to_sd)
  result <- epi_data
  for (col in cols_to_sd) {
    # group inside the loop on purpose: the dplyr_reconstruct call at the
    # bottom of the loop presumably strips the grouping, so each iteration
    # must re-group — TODO confirm before hoisting this out
    result %<>% group_by(geo_value)
    mean_name <- paste0(col, "_m", mean_width)
    sd_name <- paste0(col, "_sd", sd_width)
    # rolling mean over the `mean_width` window...
    result %<>% epi_slide(~ mean(.x[[col]]), before = mean_width, new_col_name = mean_name)
    # ...then the root-mean-square deviation from that mean over the `sd_width` window
    result %<>% epi_slide(~ sqrt(mean((.x[[mean_name]] - .x[[col]])^2)), before = sd_width, new_col_name = sd_name)
    if (!keep_mean) {
      # TODO make sure the extra info sticks around
      result %<>% select(-{{ mean_name }})
    }
    # rebuild the epi_df class/attributes from the original after dplyr verbs
    result %<>% dplyr_reconstruct(epi_data)
  }
  result %<>% ungroup()
  # explicit return: the original ended on `result %<>% ungroup()`, which only
  # yields the assigned value invisibly via the compound-assignment pipe
  return(result)
}
10 changes: 5 additions & 5 deletions R/epipredict_utilities.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,25 @@
#' add the default steps for arx_forecaster
#' @description
#' add the default steps for arx_forecaster
#' @param rec an [`epipredict::epi_recipe`]
#' @param preproc an [`epipredict::epi_recipe`]
#' @param outcome a character of the column to be predicted
#' @param predictors a character vector of the columns used as predictors
#' @param args_list an [`epipredict::arx_args_list`]
#' @seealso [arx_postprocess] for the layer equivalent
#' @importFrom epipredict step_epi_lag step_epi_ahead step_epi_naomit step_training_window
#' @export
arx_preprocess <- function(rec, outcome, predictors, args_list) {
arx_preprocess <- function(preproc, outcome, predictors, args_list) {
# input already validated
lags <- args_list$lags
for (l in seq_along(lags)) {
p <- predictors[l]
rec %<>% step_epi_lag(!!p, lag = lags[[l]])
preproc %<>% step_epi_lag(!!p, lag = lags[[l]])
}
rec %<>%
preproc %<>%
step_epi_ahead(!!outcome, ahead = args_list$ahead) %>%
step_epi_naomit() %>%
step_training_window(n_recent = args_list$n_training)
return(rec)
return(preproc)
}

# TODO replace with `layer_arx_forecaster`
Expand Down
139 changes: 139 additions & 0 deletions R/forecaster_smoothed_scaled.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
#' predict on smoothed data and the standard deviation
#' @description
#' This is a variant of `scaled_pop`, which predicts on a smoothed version of
#' the data. Even if the target is smoothed when used as a /predictor/, as a
#' /target/ it still uses the raw value (this captures some of the noise). It
#' also uses a rolling standard deviation as an auxiliary signal, window of
#' width `sd_width`, which by default is 28 days.
#' @param epi_data the actual data used
#' @param outcome the name of the target variable
#' @param extra_sources the name of any extra columns to use. This list could be
#' empty
#' @param ahead (this is relative to the `as_of` field of the `epi_df`, which is
#' likely *not* the same as the `ahead` used by epipredict, which is relative
#' to the max time value of the `epi_df`. how to handle this is a modelling
#' question left up to each forecaster; see latency_adjusting.R for the
#' existing examples)
#' @param pop_scaling an example extra parameter unique to this forecaster
#' @param trainer an example extra parameter that is fairly common
#' @param smooth_width the number of days over which to do smoothing. If `NULL`,
#' then no smoothing is applied.
#' @param smooth_cols the names of the columns to smooth. If `NULL` it smooths
#' everything
#' @param sd_width the number of days over which to take a moving average of the
#' standard deviation. If `NULL`, the sd_width isn't included.
#' @param sd_mean_width to calculate the sd, we need a window size for the mean
#' used.
#' @param sd_cols the names of the columns to smooth. If `NULL` it includes
#' the sd of everything
#' @param quantile_levels The quantile levels to predict. Defaults to those
#' @param ... any additional arguments as used by [arx_args_list]
#' required by covidhub.
#' @seealso some utilities for making forecasters: [format_storage],
#' [perform_sanity_checks]
#' @importFrom epipredict epi_recipe step_population_scaling frosting arx_args_list layer_population_scaling
#' @importFrom tibble tibble
#' @importFrom recipes all_numeric
#' @export
smoothed_scaled <- function(epi_data,
outcome,
extra_sources = "",
ahead = 1,
pop_scaling = TRUE,
trainer = parsnip::linear_reg(),
quantile_levels = covidhub_probs(),
smooth_width = 7,
smooth_cols = NULL,
sd_width = 28,
sd_mean_width = 14,
sd_cols = NULL,
...) {
# perform any preprocessing not supported by epipredict
# this is a temp fix until a real fix gets put into epipredict
epi_data <- clear_lastminute_nas(epi_data)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question: why do we want to clear NAs? (Probably should add an explanatory comment. Note this is sort of an "anti-complete()" so even if we want complete()d stuff anywhere below then those things need to be careful. E.g., slide computation passed to epi_slide() should probably test nrow(.x) to decide whether it should output NA.)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lastminute here means NA in the final days. Epipredict just chokes on that, when it should be treating the NA's as a latency adjustment.

Copy link
Contributor

@brookslogan brookslogan Jan 3, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, that makes sense, though the current clear_lastminute_nas clears all NAs not just lastminute ones. Relates to lastminute-imputation & gap treatment stuff. I think we discussed this elsewhere but current approach probably works okay on hhs hosp data (and hopefully any chng data used?? idk there), but when adding to epipredict should probably be a bit more careful since it could be used on more problematic data sets.

# one that every forecaster will need to handle: how to manage max(time_value)
# that's older than the `as_of` date
epidataAhead <- extend_ahead(epi_data, ahead)
# see latency_adjusting for other examples
# this next part is basically unavoidable boilerplate you'll want to copy
epi_data <- epidataAhead[[1]]
effective_ahead <- epidataAhead[[2]]
args_input <- list(...)
# edge case where there is no data or less data than the lags; eventually epipredict will handle this
if (!confirm_sufficient_data(epi_data, effective_ahead, args_input)) {
null_result <- tibble(
geo_value = character(),
forecast_date = lubridate::Date(),
target_end_date = lubridate::Date(),
quantile = numeric(),
value = numeric()
)
return(null_result)
}
args_input[["ahead"]] <- effective_ahead
args_input[["quantile_levels"]] <- quantile_levels
args_list <- do.call(arx_args_list, args_input)
# if you want to ignore extra_sources, setting predictors is the way to do it
predictors <- c(outcome, extra_sources)
# TODO: Partial match quantile_level coming from here (on Dmitry's machine)
argsPredictorsTrainer <- perform_sanity_checks(epi_data, outcome, predictors, trainer, args_list)
args_list <- argsPredictorsTrainer[[1]]
predictors <- argsPredictorsTrainer[[2]]
trainer <- argsPredictorsTrainer[[3]]
# end of the copypasta
# finally, any other pre-processing (e.g. smoothing) that isn't performed by
# epipredict
# smoothing
keep_mean <- (smooth_width == sd_mean_width) # do we need to do the mean separately?
if (!is.null(smooth_width) && !keep_mean) {
epi_data %<>% rolling_mean(
width = smooth_width,
cols_to_mean = smooth_cols
)
}

# measuring standard deviation
if (!is.null(sd_width)) {
epi_data %<>% rolling_sd(
sd_width = sd_width,
mean_width = sd_mean_width,
cols_to_sd = sd_cols,
keep_mean = keep_mean
)
}
# and need to make sure we exclude the original variables as predictors
predictors <- update_predictors(epi_data, c(smooth_cols, sd_cols), predictors)
Copy link
Contributor

@brookslogan brookslogan Dec 19, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue: Do we always want to do this? E.g., dev10 pairs lags of separate rolling means of the original with the sd estimate. [nvm, this is what seems to be happening here as well, though I need to check where the lags are specified] And how are the requested lags derived for these predictors? May make more sense to just keep everything and have the model specify what it wants. Or, in epipredict framework, have a smooth mean step that adds (lags of) smoothed means & assigns predictor roles, and sd step that works in a similar way, so dev10 could be a step_epi_lag + step_rolling_sd, while the implied behavior above could be a step_rolling_mean + step_rolling_sd. (thought: I'm wishing for a step_epi_slide and a step_mutate that allow .by, but these concrete steps would be good too I think. And same thing in epiprocess, probably want specific rolling mean and rolling sd helpers..)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lags are applied to all predictors, and are specified per-model in the ..., which can be any of the arx args.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here we may want to apply different lagsets to the 7dav (7*(0:4) ish) and the sd covariate (maybe just 0) & ignore the original non-7dav'd predictors. Is that possible via the list-of-vectors form of lags?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ignore the original non-7dav'd predictors

this is what the update_predictors function is for. It drops the non-7dav'd versions from the list of predictors.

Is that possible via the list-of-vectors form of lags?

Vector or List. Positive integers enumerating lags to use in autoregressive-type models (in days). By default, an unnamed list of lags will be set to correspond to the order of the predictors.

Apparently, though I haven't actually tried it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like the current validation will probably complain if we try it.

issue: c(smooth_cols, sd_cols) is probably wrong when exactly one of smooth_cols and sd_cols is NULL. These variables have not been overwritten with get_trainable_names in this function.

# preprocessing supported by epipredict
preproc <- epi_recipe(epi_data)
if (pop_scaling) {
preproc %<>% step_population_scaling(
all_numeric(),
df = epipredict::state_census,
df_pop_col = "pop",
create_new = FALSE,
rate_rescaling = 1e5,
by = c("geo_value" = "abbr")
)
}
preproc %<>% arx_preprocess(outcome, predictors, args_list)

# postprocessing supported by epipredict
postproc <- frosting()
postproc %<>% arx_postprocess(trainer, args_list)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue (with arx_postprocess...): ignores forecast_date passed through ...; doesn't respect target_date passed through ....

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue (with arx_postprocess...): ignores forecast_date passed through ...; doesn't respect target_date passed through ....

That's intended behavior. Both of those are set through adjusting ahead for target_date and the as_of of epi_data for the forecast_date.

issue: c(smooth_cols, sd_cols) is probably wrong when exactly one of smooth_cols and sd_cols is NULL. These variables have not been overwritten with get_trainable_names in this function.

Yeah, I should probably just completely refactor the way I wrote update_predictors. I think it should work for now though.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Think a quick fix here would be to just make sure forecast_date and target_date are NULL in args_list. Or to move away from ... forwarding to manually handling/forwarding/forbidding each parameter. I assume you're not using passing in either of these parameters downstream, so maybe this could also be deferred to a separate Issue.

if (pop_scaling) {
postproc %<>% layer_population_scaling(
.pred, .pred_distn,
df = epipredict::state_census,
df_pop_col = "pop",
create_new = FALSE,
rate_rescaling = 1e5,
by = c("geo_value" = "abbr")
)
}
# with all the setup done, we execute and format
pred <- run_workflow_and_format(preproc, postproc, trainer, epi_data)
# now pred has the columns
# (geo_value, forecast_date, target_end_date, quantile, value)
# finally, any postprocessing not supported by epipredict e.g. calibration
return(pred)
}
9 changes: 8 additions & 1 deletion R/targets_utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,14 @@ make_shared_grids <- function() {
),
tidyr::expand_grid(
forecaster = "flatline_fc",
ahead = 1:7
ahead = c(1:7, 14, 21, 28)
),
tidyr::expand_grid(
forecaster = "smoothed_scaled",
trainer = c("quantreg"),
ahead = c(1:7, 14, 21, 28),
lags = list(list(c(0, 3, 5, 7, 14), c(0)), c(0, 7, 14)),
pop_scaling = c(FALSE)
)
)
}
Expand Down
4 changes: 2 additions & 2 deletions app.R
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ shinyApp(
verticalLayout(
plotlyOutput("main_plot", height = "90em"),
h2("forecaster name -> parameters"),
#textOutput("forecaster_param_title"),
# textOutput("forecaster_param_title"),
dataTableOutput("forecaster_table"),
h2("ensemble name -> parameters"),
dataTableOutput("ensemble_table")
Expand Down Expand Up @@ -261,7 +261,7 @@ shinyApp(
# Use scatterplot or lines depending on the x var.
{
if (input$x_var %in% c(input$facet_vars, "geo_value", "forecaster", "ahead")) {
. + geom_point(aes(size = n)) + expand_limits(size = 0)
. + geom_point(aes(size = n / 100)) + expand_limits(size = 0)
} else {
. + geom_line()
}
Expand Down
4 changes: 2 additions & 2 deletions man/arx_preprocess.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 14 additions & 0 deletions man/get_nonkey_names.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading