-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_transforms.R
114 lines (108 loc) · 5.07 KB
/
data_transforms.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# various reusable transforms to apply before handing to epipredict
#' extract the non-key, non-smoothed columns from epi_data
#' @keywords internal
#' @param epi_data the `epi_df`
#' @param cols vector of column names to use. If `NULL`, fill with all non-key columns
get_trainable_names <- function(epi_data, cols) {
if (is.null(cols)) {
cols <- get_nonkey_names(epi_data)
# exclude anything with the same naming schema as the rolling average/sd created below
cols <- cols[!grepl("_\\w{1,2}\\d+", cols)]
}
return(cols)
}
#' just the names which aren't keys for an epi_df
#' @description
#' names, but it excludes keys
#' @param epi_data the epi_df
get_nonkey_names <- function(epi_data) {
cols <- names(epi_data)
cols <- cols[!(cols %in% c("geo_value", "time_value", attr(epi_data, "metadata")$other_keys))]
return(cols)
}
#' update the predictors to only contain the smoothed/sd versions of cols
#' @description
#' modifies the list of preditors so that any which have been modified have the
#' modified versions included, and not the original. Should only be applied
#' after both rolling_mean and rolling_sd.
#' @param epi_data the epi_df, only included to get the non-key column names
#' @param cols_modified the list of columns which have been modified. If this is `NULL`, that means we were modifying every column.
#' @param predictors the initial set of predictors; any unmodified are kept, any modified are replaced with the modified versions (e.g. "a" becoming "a_m17").
#' @importFrom purrr map map_chr reduce
#' @return returns an updated list of predictors, with modified columns replaced and non-modified columns left intact.
#' @export
update_predictors <- function(epi_data, cols_modified, predictors) {
if (!is.null(cols_modified)) {
# if cols_modified isn't null, make sure we include predictors that weren't modified
unchanged_predictors <- map(cols_modified, ~ !grepl(.x, predictors, fixed = TRUE)) %>% reduce(`&`)
unchanged_predictors <- predictors[unchanged_predictors]
} else {
# if it's null, we've modified every predictor
unchanged_predictors <- character(0L)
}
# all the non-key names
col_names <- get_nonkey_names(epi_data)
is_present <- function(original_predictor) {
grepl(original_predictor, col_names) & !(col_names %in% predictors)
}
is_modified <- map(predictors, is_present) %>% reduce(`|`)
new_predictors <- col_names[is_modified]
return(c(unchanged_predictors, new_predictors))
}
#' get a rolling average for the named columns
#' @description
#' add column(s) that are the rolling means of the specified columns, as
#' implemented by slider. Defaults to the previous 7 days.
#' Currently only group_by's on the geo_value. Should probably extend to more
#' keys if you have them
#' @param epi_data the dataset
#' @param width the number of days (or examples, the sliding isn't time-aware) to use
#' @param cols_to_mean the non-key columns to take the mean over. `NULL` means all
#' @importFrom slider slide_dbl
#' @importFrom epiprocess epi_slide
#' @export
rolling_mean <- function(epi_data, width = 7L, cols_to_mean = NULL) {
cols_to_mean <- get_trainable_names(epi_data, cols_to_mean)
epi_data %<>% group_by(geo_value)
for (col in cols_to_mean) {
mean_name <- paste0(col, "_m", width)
epi_data %<>% epi_slide(~ mean(.x[[col]], rm.na = TRUE), before = width-1L, new_col_name = mean_name)
}
epi_data %<>% ungroup()
return(epi_data)
}
#' get a rolling standard deviation for the named columns
#' @description
#' A rolling standard deviation, based off of a rolling mean. First it
#' calculates a rolling mean with width `mean_width`, and then squares the
#' difference between that and the actual value, averaged over `sd_width`.
#' @param epi_data the dataset
#' @param sd_width the number of days (or examples, the sliding isn't
#' time-aware) to use for the standard deviation calculation
#' @param mean_width like `sd_width`, but it governs the mean. Should be less
#' than the `sd_width`, and if `NULL` (the default) it is half of `sd_width`
#' (so 14 in the complete default case)
#' @param cols_to_sd the non-key columns to take the sd over. `NULL` means all
#' @param keep_mean bool, if `TRUE`, it retains keeps the mean column
#' @importFrom epiprocess epi_slide
#' @export
rolling_sd <- function(epi_data, sd_width = 28L, mean_width = NULL, cols_to_sd = NULL, keep_mean = FALSE) {
if (is.null(mean_width)) {
mean_width <- as.integer(ceiling(sd_width / 2))
}
cols_to_sd <- get_trainable_names(epi_data, cols_to_sd)
result <- epi_data
for (col in cols_to_sd) {
result %<>% group_by(geo_value)
mean_name <- paste0(col, "_m", mean_width)
sd_name <- paste0(col, "_sd", sd_width)
result %<>% epi_slide(~ mean(.x[[col]], na.rm = TRUE), before = mean_width-1L, new_col_name = mean_name)
result %<>% epi_slide(~ sqrt(mean((.x[[mean_name]] - .x[[col]])^2, na.rm = TRUE)), before = sd_width-1, new_col_name = sd_name)
if (!keep_mean) {
# TODO make sure the extra info sticks around
result %<>% select(-{{ mean_name }})
}
result %<>% dplyr_reconstruct(epi_data)
}
result %<>% ungroup()
}