Skip to content

Commit 251ec5e

Browse files
committed
feat(epi_slide_opt)!: add .prefix =, .suffix =, .new_col_names =
- BREAKING CHANGE: adjust default output column naming scheme, disallow overwriting columns.
1 parent c75de38 commit 251ec5e

11 files changed

+429
-72
lines changed

NAMESPACE

+5
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ importFrom(checkmate,assert_list)
114114
importFrom(checkmate,assert_logical)
115115
importFrom(checkmate,assert_numeric)
116116
importFrom(checkmate,assert_scalar)
117+
importFrom(checkmate,assert_string)
117118
importFrom(checkmate,checkInt)
118119
importFrom(checkmate,check_atomic)
119120
importFrom(checkmate,check_data_frame)
@@ -176,6 +177,7 @@ importFrom(dplyr,summarize)
176177
importFrom(dplyr,tibble)
177178
importFrom(dplyr,ungroup)
178179
importFrom(ggplot2,autoplot)
180+
importFrom(glue,glue)
179181
importFrom(lifecycle,deprecated)
180182
importFrom(lubridate,as.period)
181183
importFrom(lubridate,days)
@@ -198,6 +200,7 @@ importFrom(rlang,env)
198200
importFrom(rlang,expr_label)
199201
importFrom(rlang,f_env)
200202
importFrom(rlang,f_rhs)
203+
importFrom(rlang,is_bare_integerish)
201204
importFrom(rlang,is_environment)
202205
importFrom(rlang,is_formula)
203206
importFrom(rlang,is_function)
@@ -206,6 +209,7 @@ importFrom(rlang,is_quosure)
206209
importFrom(rlang,list2)
207210
importFrom(rlang,missing_arg)
208211
importFrom(rlang,new_function)
212+
importFrom(rlang,quo_get_env)
209213
importFrom(rlang,quo_is_missing)
210214
importFrom(rlang,sym)
211215
importFrom(rlang,syms)
@@ -230,3 +234,4 @@ importFrom(tidyselect,starts_with)
230234
importFrom(tsibble,as_tsibble)
231235
importFrom(utils,capture.output)
232236
importFrom(utils,tail)
237+
importFrom(vctrs,vec_data)

R/epi_df.R

+5-6
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,6 @@ as_epi_df.tbl_df <- function(
232232
as_of,
233233
other_keys = character(),
234234
...) {
235-
# possible standard substitutions for time_value
236235
x <- rename(x, ...)
237236
x <- guess_column_name(x, "time_value", time_column_names())
238237
x <- guess_column_name(x, "geo_value", geo_column_names())
@@ -282,11 +281,11 @@ as_epi_df.tbl_df <- function(
282281
cli_abort("as_epi_df: `other_keys` can't include \".time_value_counts\"")
283282
}
284283

285-
duplicated_time_values <- x %>%
286-
group_by(across(all_of(c("geo_value", "time_value", other_keys)))) %>%
287-
filter(dplyr::n() > 1) %>%
288-
ungroup()
289-
if (nrow(duplicated_time_values) > 0) {
284+
if (anyDuplicated(x[c("geo_value", "time_value", other_keys)])) {
285+
duplicated_time_values <- x %>%
286+
group_by(across(all_of(c("geo_value", "time_value", other_keys)))) %>%
287+
filter(dplyr::n() > 1) %>%
288+
ungroup()
290289
bad_data <- capture.output(duplicated_time_values)
291290
cli_abort(
292291
"as_epi_df: some groups in the data have duplicated time values. epi_df requires a unique time_value per group.",

R/epiprocess-package.R

+3
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#' @importFrom checkmate anyInfinite anyMissing assert assert_character
77
#' @importFrom checkmate assert_class assert_data_frame assert_int assert_list
88
#' @importFrom checkmate assert_logical assert_numeric assert_scalar checkInt
9+
#' @importFrom checkmate assert_string
910
#' @importFrom checkmate check_atomic check_data_frame expect_class test_int
1011
#' @importFrom checkmate check_names
1112
#' @importFrom checkmate test_subset test_set_equal vname
@@ -16,6 +17,8 @@
1617
#' @importFrom dplyr select
1718
#' @importFrom lifecycle deprecated
1819
#' @importFrom rlang %||%
20+
#' @importFrom rlang is_bare_integerish
21+
#' @importFrom vctrs vec_data
1922
## usethis namespace: end
2023
NULL
2124

R/slide.R

+135-37
Original file line numberDiff line numberDiff line change
@@ -537,7 +537,7 @@ get_before_after_from_window <- function(window_size, align, time_type) {
537537
#'
538538
#' @template basic-slide-params
539539
#' @param .col_names <[`tidy-select`][dplyr_tidy_select]> An unquoted column
540-
#' name(e.g., `cases`), multiple column names (e.g., `c(cases, deaths)`),
540+
#' name (e.g., `cases`), multiple column names (e.g., `c(cases, deaths)`),
541541
#' [other tidy-select expression][tidyselect::language], or a vector of
542542
#' characters (e.g. `c("cases", "deaths")`). Variable names can be used as if
543543
#' they were positions in the data frame, so expressions like `x:y` can be
@@ -559,13 +559,40 @@ get_before_after_from_window <- function(window_size, align, time_type) {
559559
#' `epi_slide_mean` and `epi_slide_sum`) take care of window completion
560560
#' automatically to prevent associated errors.
561561
#' @param ... Additional arguments to pass to the slide computation `.f`, for
562-
#' example, `algo` or `na.rm` in data.table functions. You don't need to
563-
#' specify `.x`, `.window_size`, or `.align` (or `before`/`after` for slider
564-
#' functions).
562+
#' example, `algo` or `na.rm` in data.table functions. You don't need to
563+
#' specify `.x`, `.window_size`, or `.align` (or `before`/`after` for slider
564+
#' functions).
565+
#' @param .prefix Optional [`glue::glue`] format string; name the slide result
566+
#' column(s) by attaching this prefix to the corresponding input column(s).
567+
#' Some shorthand is supported for basing the output names on `.window_size`
568+
#' or other arguments; see "Prefix and suffix shorthand" below.
569+
#' @param .suffix Optional [`glue::glue`] format string; like `.prefix`. The
570+
#' default naming behavior is equivalent to `.suffix =
571+
#' "_{.n}{.time_unit_abbr}{.align_abbr}{.f_abbr}"`. Can be used in combination
572+
#' with `.prefix`.
573+
#' @param .new_col_names Optional character vector with length matching the
574+
#' number of input columns from `.col_names`; name the slide result column(s)
575+
#' with these names. Cannot be used in combination with `.prefix` and/or
576+
#' `.suffix`.
577+
#'
578+
#' @section Prefix and suffix shorthand:
579+
#'
580+
#' [`glue::glue`] format strings specially interpret content within curly
581+
#' braces. E.g., `glue::glue("ABC{2 + 2}")` evaluates to `"ABC4"`. For `.prefix`
582+
#' and `.suffix`, we provide `glue` with some additional variable bindings:
583+
#'
584+
#' - `{.n}` will be the number of time steps in the computation
585+
#' corresponding to the `.window_size`.
586+
#' - `{.time_unit_abbr}` will be a lower-case letter corresponding to the
587+
#' `time_type` of `.x`
588+
#' - `{.align_abbr}` will be `""` if `.align` is the default of `"right"`;
589+
#' otherwise, it will be the first letter of `.align`
590+
#' - `{.f_abbr}` will be a short string based on what `.f`
565591
#'
566592
#' @importFrom dplyr bind_rows mutate %>% arrange tibble select all_of
567-
#' @importFrom rlang enquo expr_label caller_arg
593+
#' @importFrom rlang enquo expr_label caller_arg quo_get_env
568594
#' @importFrom tidyselect eval_select
595+
#' @importFrom glue glue
569596
#' @importFrom purrr map map_lgl
570597
#' @importFrom data.table frollmean frollsum frollapply
571598
#' @importFrom lubridate as.period
@@ -577,8 +604,7 @@ get_before_after_from_window <- function(window_size, align, time_type) {
577604
#' # Compute a 7-day trailing average on cases.
578605
#' cases_deaths_subset %>%
579606
#' group_by(geo_value) %>%
580-
#' epi_slide_opt(cases, .f = data.table::frollmean, .window_size = 7) %>%
581-
#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases)
607+
#' epi_slide_opt(cases, .f = data.table::frollmean, .window_size = 7)
582608
#'
583609
#' # Same as above, but adjust `frollmean` settings for speed, accuracy, and
584610
#' # to allow partially-missing windows.
@@ -588,11 +614,11 @@ get_before_after_from_window <- function(window_size, align, time_type) {
588614
#' cases,
589615
#' .f = data.table::frollmean, .window_size = 7,
590616
#' algo = "exact", hasNA = TRUE, na.rm = TRUE
591-
#' ) %>%
592-
#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases)
617+
#' )
593618
epi_slide_opt <- function(
594619
.x, .col_names, .f, ...,
595620
.window_size = NULL, .align = c("right", "center", "left"),
621+
.prefix = NULL, .suffix = NULL, .new_col_names = NULL,
596622
.ref_time_values = NULL, .all_rows = FALSE) {
597623
assert_class(.x, "epi_df")
598624

@@ -620,7 +646,7 @@ epi_slide_opt <- function(
620646
if ("new_col_name" %in% provided_args || ".new_col_name" %in% provided_args) {
621647
cli::cli_abort(
622648
"epi_slide_opt: the argument `new_col_name` is not supported for `epi_slide_opt`. If you want to customize
623-
the output column names, use `dplyr::rename` after the slide.",
649+
the output column names, use `.prefix =`, `.suffix =`, or `.new_col_**names** =`.",
624650
class = "epiprocess__epi_slide_opt__new_name_not_supported"
625651
)
626652
}
@@ -644,21 +670,37 @@ epi_slide_opt <- function(
644670
)
645671
}
646672

673+
# The position of a given column can be differ between input `.x` and
674+
# `.data_group` since the grouping step by default drops grouping columns.
675+
# To avoid rerunning `eval_select` for every `.data_group`, convert
676+
# positions of user-provided `col_names` into string column names. We avoid
677+
# using `names(pos)` directly for robustness and in case we later want to
678+
# allow users to rename fields via tidyselection.
679+
col_names_quo <- enquo(.col_names)
680+
pos <- eval_select(col_names_quo, data = .x, allow_rename = FALSE)
681+
col_names_chr <- names(.x)[pos]
682+
647683
# Check that slide function `.f` is one of those short-listed from
648684
# `data.table` and `slider` (or a function that has the exact same
649685
# definition, e.g. if the function has been reexported or defined
650686
# locally).
651-
if (any(map_lgl(
652-
list(frollmean, frollsum, frollapply),
653-
~ identical(.f, .x)
654-
))) {
655-
f_from_package <- "data.table"
656-
} else if (any(map_lgl(
657-
list(slide_sum, slide_prod, slide_mean, slide_min, slide_max, slide_all, slide_any),
658-
~ identical(.f, .x)
659-
))) {
660-
f_from_package <- "slider"
661-
} else {
687+
f_possibilities <-
688+
tibble::tribble(
689+
~f, ~package, ~abbr,
690+
frollmean, "data.table", "av",
691+
frollsum, "data.table", "sum",
692+
frollapply, "data.table", "slide",
693+
slide_sum, "slider", "sum",
694+
slide_prod, "slider", "prod",
695+
slide_mean, "slider", "av",
696+
slide_min, "slider", "min",
697+
slide_max, "slider", "max",
698+
slide_all, "slider", "all",
699+
slide_any, "slider", "any",
700+
)
701+
f_info <- f_possibilities %>%
702+
filter(map_lgl(.data$f, ~ identical(.f, .x)))
703+
if (nrow(f_info) == 0L) {
662704
# `f` is from somewhere else and not supported
663705
cli_abort(
664706
c(
@@ -672,6 +714,7 @@ epi_slide_opt <- function(
672714
epiprocess__f = .f
673715
)
674716
}
717+
f_from_package <- f_info$package
675718

676719
user_provided_rtvs <- !is.null(.ref_time_values)
677720
if (!user_provided_rtvs) {
@@ -702,22 +745,72 @@ epi_slide_opt <- function(
702745
validate_slide_window_arg(.window_size, time_type)
703746
window_args <- get_before_after_from_window(.window_size, .align, time_type)
704747

748+
# Handle output naming
749+
if ((!is.null(.prefix) || !is.null(.suffix)) && !is.null(.new_col_names)) {
750+
cli_abort(
751+
"Can't use both .prefix/.suffix and .new_col_names at the same time.",
752+
class = "epiprocess__epi_slide_opt_incompatible_naming_args"
753+
)
754+
}
755+
assert_string(.prefix, null.ok = TRUE)
756+
assert_string(.suffix, null.ok = TRUE)
757+
assert_character(.new_col_names, len = length(col_names_chr), null.ok = TRUE)
758+
if (is.null(.prefix) && is.null(.suffix) && is.null(.new_col_names)) {
759+
.suffix <- "_{.n}{.time_unit_abbr}{.align_abbr}{.f_abbr}"
760+
# ^ does not account for any arguments specified to underlying functions via
761+
# `...` such as `na.rm =`, nor does it distinguish between functions from
762+
# different packages accomplishing the same type of computation. Those are
763+
# probably only set one way per task, so this probably produces cleaner
764+
# names without clashes (though maybe some confusion if switching between
765+
# code with different settings).
766+
}
767+
if (!is.null(.prefix) || !is.null(.suffix)) {
768+
.prefix <- .prefix %||% ""
769+
.suffix <- .suffix %||% ""
770+
if (identical(.window_size, Inf)) {
771+
n <- "running_"
772+
time_unit_abbr <- ""
773+
align_abbr <- ""
774+
} else {
775+
n <- time_delta_to_n_steps(.window_size, time_type)
776+
time_unit_abbr <- time_type_unit_abbr(time_type)
777+
align_abbr <- c(right = "", center = "c", left = "l")[[.align]]
778+
}
779+
glue_env <- rlang::env(
780+
.n = n,
781+
.time_unit_abbr = time_unit_abbr,
782+
.align_abbr = align_abbr,
783+
.f_abbr = f_info$abbr,
784+
quo_get_env(col_names_quo)
785+
)
786+
.new_col_names <- unclass(
787+
glue(.prefix, .envir = glue_env) +
788+
col_names_chr +
789+
glue(.suffix, .envir = glue_env)
790+
)
791+
} else {
792+
# `.new_col_names` was provided by user; we don't need to do anything.
793+
}
794+
if (any(.new_col_names %in% names(.x))) {
795+
cli_abort(c(
796+
"Naming conflict between new columns and existing columns",
797+
"x" = "Overlapping names: {format_varnames(intersect(.new_col_names, names(.x)))}"
798+
), class = "epiprocess__epi_slide_opt_old_new_name_conflict")
799+
}
800+
if (anyDuplicated(.new_col_names)) {
801+
cli_abort(c(
802+
"New column names contain duplicates",
803+
"x" = "Duplicated names: {format_varnames(unique(.new_col_names[duplicated(.new_col_names)]))}"
804+
), class = "epiprocess__epi_slide_opt_new_name_duplicated")
805+
}
806+
result_col_names <- .new_col_names
807+
705808
# Make a complete date sequence between min(.x$time_value) and max(.x$time_value).
706809
date_seq_list <- full_date_seq(.x, window_args$before, window_args$after, time_type)
707810
all_dates <- date_seq_list$all_dates
708811
pad_early_dates <- date_seq_list$pad_early_dates
709812
pad_late_dates <- date_seq_list$pad_late_dates
710813

711-
# The position of a given column can be differ between input `.x` and
712-
# `.data_group` since the grouping step by default drops grouping columns.
713-
# To avoid rerunning `eval_select` for every `.data_group`, convert
714-
# positions of user-provided `col_names` into string column names. We avoid
715-
# using `names(pos)` directly for robustness and in case we later want to
716-
# allow users to rename fields via tidyselection.
717-
pos <- eval_select(enquo(.col_names), data = .x, allow_rename = FALSE)
718-
col_names_chr <- names(.x)[pos]
719-
# Always rename results to "slide_value_<original column name>".
720-
result_col_names <- paste0("slide_value_", col_names_chr)
721814
slide_one_grp <- function(.data_group, .group_key, ...) {
722815
missing_times <- all_dates[!(all_dates %in% .data_group$time_value)]
723816
# `frollmean` requires a full window to compute a result. Add NA values
@@ -827,8 +920,7 @@ epi_slide_opt <- function(
827920
#' # Compute a 7-day trailing average on cases.
828921
#' cases_deaths_subset %>%
829922
#' group_by(geo_value) %>%
830-
#' epi_slide_mean(cases, .window_size = 7) %>%
831-
#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases)
923+
#' epi_slide_mean(cases, .window_size = 7)
832924
#'
833925
#' # Same as above, but adjust `frollmean` settings for speed, accuracy, and
834926
#' # to allow partially-missing windows.
@@ -838,11 +930,11 @@ epi_slide_opt <- function(
838930
#' cases,
839931
#' .window_size = 7,
840932
#' na.rm = TRUE, algo = "exact", hasNA = TRUE
841-
#' ) %>%
842-
#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases)
933+
#' )
843934
epi_slide_mean <- function(
844935
.x, .col_names, ...,
845936
.window_size = NULL, .align = c("right", "center", "left"),
937+
.prefix = NULL, .suffix = NULL, .new_col_names = NULL,
846938
.ref_time_values = NULL, .all_rows = FALSE) {
847939
# Deprecated argument handling
848940
provided_args <- rlang::call_args_names(rlang::call_match())
@@ -885,6 +977,9 @@ epi_slide_mean <- function(
885977
...,
886978
.window_size = .window_size,
887979
.align = .align,
980+
.prefix = .prefix,
981+
.suffix = .suffix,
982+
.new_col_names = .new_col_names,
888983
.ref_time_values = .ref_time_values,
889984
.all_rows = .all_rows
890985
)
@@ -899,11 +994,11 @@ epi_slide_mean <- function(
899994
#' # Compute a 7-day trailing sum on cases.
900995
#' cases_deaths_subset %>%
901996
#' group_by(geo_value) %>%
902-
#' epi_slide_sum(cases, .window_size = 7) %>%
903-
#' dplyr::select(geo_value, time_value, cases, cases_7dsum = slide_value_cases)
997+
#' epi_slide_sum(cases, .window_size = 7)
904998
epi_slide_sum <- function(
905999
.x, .col_names, ...,
9061000
.window_size = NULL, .align = c("right", "center", "left"),
1001+
.prefix = NULL, .suffix = NULL, .new_col_names = NULL,
9071002
.ref_time_values = NULL, .all_rows = FALSE) {
9081003
# Deprecated argument handling
9091004
provided_args <- rlang::call_args_names(rlang::call_match())
@@ -945,6 +1040,9 @@ epi_slide_sum <- function(
9451040
...,
9461041
.window_size = .window_size,
9471042
.align = .align,
1043+
.prefix = .prefix,
1044+
.suffix = .suffix,
1045+
.new_col_names = .new_col_names,
9481046
.ref_time_values = .ref_time_values,
9491047
.all_rows = .all_rows
9501048
)

0 commit comments

Comments
 (0)