Skip to content

Commit 44e4646

Browse files
authored
Merge pull request #397 from cmu-delphi/ndefries/f-wrapper-speedup-factory
Step through pre-calculated start times for each group using closure rather than using `.real` col in `epi_slide`
2 parents b95685e + abd24d3 commit 44e4646

20 files changed

+332
-248
lines changed

DESCRIPTION

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Type: Package
22
Package: epiprocess
33
Title: Tools for basic signal processing in epidemiology
4-
Version: 0.7.1.9999
4+
Version: 0.7.2.9999
55
Authors@R: c(
66
person("Jacob", "Bien", role = "ctb"),
77
person("Logan", "Brooks", role = "aut"),

NEWS.md

+8
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
1+
# epiprocess 0.7.2.9999
2+
3+
## Improvements
4+
5+
* `epi_slide` computations are now 2-4 times faster after changing how
6+
reference time values, made accessible within sliding functions, are
7+
calculated (#397).
8+
19
# epiprocess 0.7.1.9999
210

311
Note that `epiprocess` uses the [Semantic Versioning

R/slide.R

+32-64
Original file line numberDiff line numberDiff line change
@@ -230,84 +230,56 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values,
230230
after <- time_step(after)
231231
}
232232

233-
min_ref_time_values <- ref_time_values - before
234-
min_ref_time_values_not_in_x <- min_ref_time_values[!(min_ref_time_values %in% unique(x$time_value))]
235-
236-
# Do set up to let us recover `ref_time_value`s later.
237-
# A helper column marking real observations.
238-
x$.real <- TRUE
239-
240-
# Create df containing phony data. Df has the same columns and attributes as
241-
# `x`, but filled with `NA`s aside from grouping columns. Number of rows is
242-
# equal to the number of `min_ref_time_values_not_in_x` we have * the
243-
# number of unique levels seen in the grouping columns.
244-
before_time_values_df <- data.frame(time_value = min_ref_time_values_not_in_x)
245-
if (length(group_vars(x)) != 0) {
246-
before_time_values_df <- dplyr::cross_join(
247-
# Get unique combinations of grouping columns seen in real data.
248-
unique(x[, group_vars(x)]),
249-
before_time_values_df
250-
)
251-
}
252-
# Automatically fill in all other columns from `x` with `NA`s, and carry
253-
# attributes over to new df.
254-
before_time_values_df <- bind_rows(x[0, ], before_time_values_df)
255-
before_time_values_df$.real <- FALSE
256-
257-
x <- bind_rows(before_time_values_df, x)
258-
259233
# Arrange by increasing time_value
260234
x <- arrange(x, time_value)
261235

262236
# Now set up starts and stops for sliding/hopping
263-
time_range <- range(unique(x$time_value))
264-
starts <- in_range(ref_time_values - before, time_range)
265-
stops <- in_range(ref_time_values + after, time_range)
266-
267-
if (length(starts) == 0 || length(stops) == 0) {
268-
Abort("The starting and/or stopping times for sliding are out of bounds with respect to the range of times in your data. Check your settings for ref_time_values and align (and before, if specified).")
269-
}
237+
starts <- ref_time_values - before
238+
stops <- ref_time_values + after
270239

271240
# Symbolize new column name
272241
new_col <- sym(new_col_name)
273242

274243
# Computation for one group, all time values
275244
slide_one_grp <- function(.data_group,
276-
f, ...,
245+
.group_key, # see `?group_modify`
246+
..., # `...` to `epi_slide` forwarded here
247+
f_factory,
277248
starts,
278249
stops,
279-
time_values,
250+
ref_time_values,
280251
all_rows,
281252
new_col) {
282253
# Figure out which reference time values appear in the data group in the
283254
# first place (we need to do this because it could differ based on the
284255
# group, hence the setup/checks for the reference time values based on all
285-
# the data could still be off)
286-
o <- time_values %in% .data_group$time_value
256+
# the data could still be off):
257+
o <- ref_time_values %in% .data_group$time_value
287258
starts <- starts[o]
288259
stops <- stops[o]
289-
time_values <- time_values[o]
260+
kept_ref_time_values <- ref_time_values[o]
261+
262+
f <- f_factory(kept_ref_time_values)
290263

291264
# Compute the slide values
292265
slide_values_list <- slider::hop_index(
293266
.x = .data_group,
294267
.i = .data_group$time_value,
295-
.f = f, ...,
296268
.starts = starts,
297-
.stops = stops
269+
.stops = stops,
270+
.f = f,
271+
.group_key, ...
298272
)
299273

300274
# Now figure out which rows in the data group are in the reference time
301275
# values; this will be useful for all sorts of checks that follow
302-
o <- .data_group$time_value %in% time_values
276+
o <- .data_group$time_value %in% kept_ref_time_values
303277
num_ref_rows <- sum(o)
304278

305-
# Count the number of appearances of each reference time value (these
306-
# appearances should all be real for now, but if we allow ref time values
307-
# outside of .data_group's time values):
308-
counts <- dplyr::filter(.data_group, .data$time_value %in% time_values) %>%
279+
# Count the number of appearances of each kept reference time value.
280+
counts <- dplyr::filter(.data_group, .data$time_value %in% kept_ref_time_values) %>%
309281
dplyr::count(.data$time_value) %>%
310-
dplyr::pull(n)
282+
`[[`("n")
311283

312284
if (!all(purrr::map_lgl(slide_values_list, is.atomic)) &&
313285
!all(purrr::map_lgl(slide_values_list, is.data.frame))) {
@@ -349,7 +321,6 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values,
349321
# fills with NA equivalent.
350322
vctrs::vec_slice(slide_values, o) <- orig_values
351323
} else {
352-
# This implicitly removes phony (`.real` == FALSE) observations.
353324
.data_group <- filter(.data_group, o)
354325
}
355326
return(mutate(.data_group, !!new_col := slide_values))
@@ -372,18 +343,24 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values,
372343

373344
f <- as_slide_computation(f, ...)
374345
# Create a wrapper that calculates and passes `.ref_time_value` to the
375-
# computation.
376-
f_wrapper <- function(.x, .group_key, ...) {
377-
.ref_time_value <- min(.x$time_value) + before
378-
.x <- .x[.x$.real, ]
379-
.x$.real <- NULL
380-
f(.x, .group_key, .ref_time_value, ...)
346+
# computation. `i` is contained in the `f_wrapper_factory` environment such
347+
# that when called within `slide_one_grp` `i` is reset for every group.
348+
f_wrapper_factory <- function(kept_ref_time_values) {
349+
# Use `i` to advance through list of start dates.
350+
i <- 1L
351+
f_wrapper <- function(.x, .group_key, ...) {
352+
.ref_time_value <- kept_ref_time_values[[i]]
353+
i <<- i + 1L
354+
f(.x, .group_key, .ref_time_value, ...)
355+
}
356+
return(f_wrapper)
381357
}
382358
x <- group_modify(x, slide_one_grp,
383-
f = f_wrapper, ...,
359+
...,
360+
f_factory = f_wrapper_factory,
384361
starts = starts,
385362
stops = stops,
386-
time_values = ref_time_values,
363+
ref_time_values = ref_time_values,
387364
all_rows = all_rows,
388365
new_col = new_col,
389366
.keep = FALSE
@@ -394,14 +371,5 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values,
394371
x <- unnest(x, !!new_col, names_sep = names_sep)
395372
}
396373

397-
# Remove any remaining phony observations. When `all_rows` is TRUE, phony
398-
# observations aren't necessarily removed in `slide_one_grp`.
399-
if (all_rows) {
400-
x <- x[x$.real, ]
401-
}
402-
403-
# Drop helper column `.real`.
404-
x$.real <- NULL
405-
406374
return(x)
407375
}

R/utils.R

-4
Original file line numberDiff line numberDiff line change
@@ -361,10 +361,6 @@ as_slide_computation <- function(f, ...) {
361361

362362
##########
363363

364-
in_range <- function(x, rng) pmin(pmax(x, rng[1]), rng[2])
365-
366-
##########
367-
368364
Min <- function(x) min(x, na.rm = TRUE)
369365
Max <- function(x) max(x, na.rm = TRUE)
370366
Sum <- function(x) sum(x, na.rm = TRUE)

man/as_epi_archive.Rd

+34-22
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/as_epi_df.Rd

+26-18
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)