-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_validation.R
77 lines (72 loc) · 3.23 KB
/
data_validation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#' helper function for those writing forecasters
#' @description
#' a smorgasbord of checks that any epipredict-based forecaster should do:
#' 1. check that the args list is created correctly,
#' 2. rewrite an empty extra sources list from an empty string
#' 3. validate the outcome and predictors as present,
#' 4. make sure the trainer is a `regression` model from `parsnip`
#' 5. adjust the trainer's quantiles based on those in args_list if it's a
#' quantile trainer
#' 6. remake the lags to match the numebr of predictors
#' @inheritParams scaled_pop
#' @param predictors the full list of predictors including the outcome. can
#' include empty strings
#' @param args_list the args list created by [`epipredict::arx_args_list`]
#' @export
sanitize_args_predictors_trainer <- function(epi_data,
outcome,
predictors,
trainer,
args_list) {
if (!inherits(args_list, c("arx_fcast", "alist"))) {
cli::cli_abort("args_list was not created using `arx_args_list().")
}
predictors <- predictors[predictors != ""]
epipredict:::validate_forecaster_inputs(epi_data, outcome, predictors)
if (!is.null(trainer) && !epipredict:::is_regression(trainer)) {
cli::cli_abort("{trainer} must be a `{parsnip}` model of mode 'regression'.")
} else if (inherits(trainer, "quantile_reg")) {
# add all quantile_levels to the trainer and update args list
quantile_levels <- sort(epipredict:::compare_quantile_args(
args_list$quantile_levels,
rlang::eval_tidy(trainer$args$quantile_levels)
))
args_list$quantile_levels <- quantile_levels
trainer$args$quantile_levels <- rlang::enquo(quantile_levels)
}
args_list$lags <- epipredict:::arx_lags_validator(predictors, args_list$lags)
return(list(args_list, predictors, trainer))
}
#' confirm that there's enough data to run this model
#' @description
#' epipredict is a little bit fragile about having enough data to train; we want
#' to be able to return a null result rather than error out.
#' @param epi_data the input data
#' @param ahead the effective ahead; may be infinite if there isn't enough data.
#' @param args_input the input as supplied to `slide_forecaster`; lags is the
#' important argument, which may or may not be defined, with the default
#' coming from `arx_args_list`
#' @param buffer how many training data to insist on having (e.g. if `buffer=1`,
#' this trains on one sample; the default is set so that `linear_reg` isn't
#' rank deficient)
#' @importFrom tidyr drop_na
#' @export
confirm_sufficient_data <- function(epi_data, ahead, args_input, buffer = 9) {
if (!is.null(args_input$lags)) {
lag_max <- max(unlist(args_input$lags))
} else {
lag_max <- 14 # default value of 2 weeks
}
# TODO: Buffer should probably be 2 * n(lags) * n(predictors). But honestly,
# this needs to be fixed in epipredict itself, see
# https://github.com/cmu-delphi/epipredict/issues/106.
return(
!is.infinite(ahead) &&
epi_data %>%
drop_na() %>%
group_by(geo_value) %>%
summarise(has_enough_data = n_distinct(time_value) >= lag_max + ahead + buffer) %>%
pull(has_enough_data) %>%
any()
)
}