Skip to content

Commit b444a3c

Browse files
authored
Merge pull request #386 from cmu-delphi/ndefries/speedups
Speedups for `epix_slide`
2 parents 31f6319 + c6863ea commit b444a3c

8 files changed

+81
-22
lines changed

NAMESPACE

+4
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,9 @@ importFrom(data.table,as.data.table)
6666
importFrom(data.table,between)
6767
importFrom(data.table,copy)
6868
importFrom(data.table,key)
69+
importFrom(data.table,rbindlist)
6970
importFrom(data.table,set)
71+
importFrom(data.table,setDF)
7072
importFrom(data.table,setkeyv)
7173
importFrom(dplyr,arrange)
7274
importFrom(dplyr,bind_rows)
@@ -114,6 +116,8 @@ importFrom(rlang,syms)
114116
importFrom(stats,cor)
115117
importFrom(stats,median)
116118
importFrom(tibble,as_tibble)
119+
importFrom(tibble,new_tibble)
120+
importFrom(tibble,validate_tibble)
117121
importFrom(tidyr,unnest)
118122
importFrom(tidyselect,eval_select)
119123
importFrom(tidyselect,starts_with)

R/archive.R

+4-1
Original file line numberDiff line numberDiff line change
@@ -495,7 +495,10 @@ epi_archive =
495495
version <= max_version, ] %>%
496496
unique(by = c("geo_value", "time_value", other_keys),
497497
fromLast = TRUE) %>%
498-
tibble::as_tibble() %>%
498+
tibble::as_tibble() %>%
499+
# (`as_tibble` should de-alias the DT and its columns in any edge
500+
# cases where they are aliased. We don't say we guarantee this
501+
# though.)
499502
dplyr::select(-"version") %>%
500503
as_epi_df(geo_type = self$geo_type,
501504
time_type = self$time_type,

R/epi_df.R

+7-1
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,13 @@ new_epi_df = function(x = tibble::tibble(), geo_type, time_type, as_of,
159159

160160
# Reorder columns (geo_value, time_value, ...)
161161
if(sum(dim(x)) != 0){
162-
x = dplyr::relocate(x, "geo_value", "time_value")
162+
cols_to_put_first <- c("geo_value", "time_value")
163+
x <- x[, c(
164+
cols_to_put_first,
165+
# All other columns
166+
names(x)[!(names(x) %in% cols_to_put_first)]
167+
)
168+
]
163169
}
164170

165171
# Apply epi_df class, attach metadata, and return

R/grouped_epi_archive.R

+31-14
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,9 @@ grouped_epi_archive =
185185
#' @description Slides a given function over variables in a `grouped_epi_archive`
186186
#' object. See the documentation for the wrapper function [`epix_slide()`] for
187187
#' details.
188-
#' @importFrom data.table key address
188+
#' @importFrom data.table key address rbindlist setDF
189+
#' @importFrom tibble as_tibble new_tibble validate_tibble
190+
#' @importFrom dplyr group_by groups
189191
#' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms
190192
#' env missing_arg
191193
slide = function(f, ..., before, ref_time_values,
@@ -280,16 +282,19 @@ grouped_epi_archive =
280282
if (! (is.atomic(comp_value) || is.data.frame(comp_value))) {
281283
Abort("The slide computation must return an atomic vector or a data frame.")
282284
}
285+
286+
# Label every result row with the `ref_time_value`
287+
res <- list(time_value = ref_time_value)
288+
283289
# Wrap the computation output in a list and unchop/unnest later if
284290
# `as_list_col = FALSE`. This approach means that we will get a
285291
# list-class col rather than a data.frame-class col when
286292
# `as_list_col = TRUE` and the computations outputs are data
287293
# frames.
288-
comp_value <- list(comp_value)
289-
290-
# Label every result row with the `ref_time_value`:
291-
return(tibble::tibble(time_value = .env$ref_time_value,
292-
!!new_col := .env$comp_value))
294+
res[[new_col]] <- list(comp_value)
295+
296+
# Convert the list to a tibble all at once for speed.
297+
return(validate_tibble(new_tibble(res)))
293298
}
294299

295300
# If `f` is missing, interpret ... as an expression for tidy evaluation
@@ -308,7 +313,7 @@ grouped_epi_archive =
308313
}
309314

310315
f = as_slide_computation(f, ...)
311-
x = purrr::map_dfr(ref_time_values, function(ref_time_value) {
316+
x = lapply(ref_time_values, function(ref_time_value) {
312317
# Ungrouped as-of data; `epi_df` if `all_versions` is `FALSE`,
313318
# `epi_archive` if `all_versions` is `TRUE`:
314319
as_of_raw = private$ungrouped$as_of(ref_time_value, min_time_value = ref_time_value - before, all_versions = all_versions)
@@ -331,6 +336,13 @@ grouped_epi_archive =
331336
# copies.
332337
if (address(as_of_archive$DT) == address(private$ungrouped$DT)) {
333338
# `as_of` aliased its the full `$DT`; copy before mutating:
339+
#
340+
# Note: this step is probably unneeded; we're fine with
341+
# aliasing of the DT or its columns: vanilla operations aren't
342+
# going to mutate them in-place if they are aliases, and we're
343+
# not performing mutation (unlike the situation with
344+
# `fill_through_version` where we do mutate a `DT` and don't
345+
# want aliasing).
334346
as_of_archive$DT <- copy(as_of_archive$DT)
335347
}
336348
dt_key = data.table::key(as_of_archive$DT)
@@ -357,15 +369,20 @@ grouped_epi_archive =
357369
}
358370

359371
return(
360-
dplyr::group_by(as_of_df, dplyr::across(tidyselect::all_of(private$vars)),
361-
.drop=private$drop) %>%
362-
dplyr::group_modify(group_modify_fn,
363-
f = f, ...,
364-
ref_time_value = ref_time_value,
365-
new_col = new_col,
366-
.keep = TRUE)
372+
dplyr::group_modify(
373+
dplyr::group_by(as_of_df, !!!syms(private$vars), .drop=private$drop),
374+
group_modify_fn,
375+
f = f, ...,
376+
ref_time_value = ref_time_value,
377+
new_col = new_col,
378+
.keep = TRUE
379+
)
367380
)
368381
})
382+
# Combine output into a single tibble
383+
x <- as_tibble(setDF(rbindlist(x)))
384+
# Reconstruct groups
385+
x <- group_by(x, !!!syms(private$vars), .drop=private$drop)
369386

370387
# Unchop/unnest if we need to
371388
if (!as_list_col) {

R/methods-epi_archive.R

+17-2
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,18 @@
3232
#' x$as_of(max_version = v)
3333
#' ```
3434
#'
35-
#' @export
35+
#' Mutation and aliasing: `epix_as_of` and `$as_of` will not mutate the input
36+
#' archives, but may in some edge cases alias parts of the inputs, so copy the
37+
#' outputs if needed before using mutating operations like `data.table`'s `:=`
38+
#' operator. Currently, the only situation where there is potentially aliasing
39+
#' is of the `DT` in edge cases with `all_versions = TRUE`, but this may change
40+
#' in the future.
41+
#'
3642
#' @examples
3743
#' # warning message of data latency shown
3844
#' epix_as_of(x = archive_cases_dv_subset,
3945
#' max_version = max(archive_cases_dv_subset$DT$version))
4046
#'
41-
#' @export
4247
#' @examples
4348
#'
4449
#' range(archive_cases_dv_subset$DT$version) # 2020-06-02 -- 2021-12-01
@@ -60,6 +65,8 @@
6065
#' }, epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning"))
6166
#' # Since R 4.0, there is a `globalCallingHandlers` function that can be used
6267
#' # to globally toggle these warnings.
68+
#'
69+
#' @export
6370
epix_as_of = function(x, max_version, min_time_value = -Inf, all_versions = FALSE) {
6471
if (!inherits(x, "epi_archive")) Abort("`x` must be of class `epi_archive`.")
6572
return(x$as_of(max_version, min_time_value, all_versions = all_versions))
@@ -798,6 +805,14 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr
798805
#' x$slide(new_var = comp(old_var), before = 119)
799806
#' ```
800807
#'
808+
#' Mutation and aliasing: `epix_slide` and `$slide` will not perform in-place
809+
#' mutation of the input archives on their own. In some edge cases the inputs it
810+
#' feeds to the slide computations may alias parts of the input archive, so copy
811+
#' the slide computation inputs if needed before using mutating operations like
812+
#' `data.table`'s `:=` operator. Similarly, in some edge cases, the output of
813+
#' the slide operation may alias parts of the input archive, so similarly, make
814+
#' sure to clone and/or copy appropriately before using in-place mutation.
815+
#'
801816
#' @examples
802817
#' library(dplyr)
803818
#'

R/slide.R

+2-4
Original file line numberDiff line numberDiff line change
@@ -300,8 +300,7 @@ epi_slide = function(x, f, ..., before, after, ref_time_values,
300300
# Count the number of appearances of each reference time value (these
301301
# appearances should all be real for now, but if we allow ref time values
302302
# outside of .data_group's time values):
303-
counts = .data_group %>%
304-
dplyr::filter(.data$time_value %in% time_values) %>%
303+
counts = dplyr::filter(.data_group, .data$time_value %in% time_values) %>%
305304
dplyr::count(.data$time_value) %>%
306305
dplyr::pull(n)
307306

@@ -375,8 +374,7 @@ epi_slide = function(x, f, ..., before, after, ref_time_values,
375374
.x$.real <- NULL
376375
f(.x, .group_key, .ref_time_value, ...)
377376
}
378-
x = x %>%
379-
group_modify(slide_one_grp,
377+
x = group_modify(x, slide_one_grp,
380378
f = f_wrapper, ...,
381379
starts = starts,
382380
stops = stops,

man/epix_as_of.Rd

+8
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/epix_slide.Rd

+8
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)