Skip to content

Commit 8985db2

Browse files
committed
Make as_list_col=TRUE consistent for vecs and dfs from slide comps
1 parent de398b1 commit 8985db2

File tree

9 files changed

+226
-84
lines changed

9 files changed

+226
-84
lines changed

NEWS.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,12 @@ inter-release development versions will include an additional ".9999" suffix.
2020
* To keep the old behavior, convert the output of `epix_slide()` to `epi_df`
2121
when desired and set the metadata appropriately.
2222

23+
## Improvements:
24+
25+
* `epi_slide` and `epix_slide` now support `as_list_col = TRUE` when the slide
26+
computations output atomic vectors, and output a list column in "chopped"
27+
format (see `tidyr::chop`).
28+
2329
# epiprocess 0.6.0
2430

2531
## Breaking changes:

R/grouped_epi_archive.R

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -279,11 +279,12 @@ grouped_epi_archive =
279279
if (! (is.atomic(comp_value) || is.data.frame(comp_value))) {
280280
Abort("The slide computation must return an atomic vector or a data frame.")
281281
}
282-
if (is.data.frame(comp_value)) {
283-
# Wrap in a list so that we get a list-type col rather than a
284-
# data.frame-type col when `as_list_col = TRUE`:
285-
comp_value <- list(comp_value)
286-
}
282+
# Wrap the computation output in a list and unchop/unnest later if
283+
# `as_list_col = FALSE`. This approach means that we will get a
284+
# list-class col rather than a data.frame-class col when
285+
# `as_list_col = TRUE` and the computations outputs are data
286+
# frames.
287+
comp_value <- list(comp_value)
287288

288289
# Label every result row with the `ref_time_value`:
289290
return(tibble::tibble(time_value = .env$ref_time_value,
@@ -426,8 +427,8 @@ grouped_epi_archive =
426427
)
427428
})
428429
}
429-
430-
# Unnest if we need to
430+
431+
# Unchop/unnest if we need to
431432
if (!as_list_col) {
432433
x = tidyr::unnest(x, !!new_col, names_sep = names_sep)
433434
}

R/methods-epi_archive.R

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -707,11 +707,12 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr
707707
#' @param new_col_name String indicating the name of the new column that will
708708
#' contain the derivative values. Default is "slide_value"; note that setting
709709
#' `new_col_name` equal to an existing column name will overwrite this column.
710-
#' @param as_list_col If the computations return data frames, should the slide
711-
#' result hold these in a single list column or try to unnest them? Default is
712-
#' `FALSE`, in which case a list object returned by `f` would be unnested
713-
#' (using [`tidyr::unnest()`]), and the names of the resulting columns are given
714-
#' by prepending `new_col_name` to the names of the list elements.
710+
#' @param as_list_col Should the slide results be held in a list column, or be
711+
#' [unchopped][tidyr::unchop]/[unnested][tidyr::unnest]? Default is `FALSE`,
712+
#' in which case a list object returned by `f` would be unnested (using
713+
#' [`tidyr::unnest()`]), and, if the slide computations output data frames,
714+
#' the names of the resulting columns are given by prepending `new_col_name`
715+
#' to the names of the list elements.
715716
#' @param names_sep String specifying the separator to use in `tidyr::unnest()`
716717
#' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix
717718
#' from `new_col_name` entirely.

R/slide.R

Lines changed: 39 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -52,11 +52,12 @@
5252
#' @param new_col_name String indicating the name of the new column that will
5353
#' contain the derivative values. Default is "slide_value"; note that setting
5454
#' `new_col_name` equal to an existing column name will overwrite this column.
55-
#' @param as_list_col If the computations return data frames, should the slide
56-
#' result hold these in a single list column or try to unnest them? Default is
57-
#' `FALSE`, in which case a list object returned by `f` would be unnested
58-
#' (using [`tidyr::unnest()`]), and the names of the resulting columns are given
59-
#' by prepending `new_col_name` to the names of the list elements.
55+
#' @param as_list_col Should the slide results be held in a list column, or be
56+
#' [unchopped][tidyr::unchop]/[unnested][tidyr::unnest]? Default is `FALSE`,
57+
#' in which case a list object returned by `f` would be unnested (using
58+
#' [`tidyr::unnest()`]), and, if the slide computations output data frames,
59+
#' the names of the resulting columns are given by prepending `new_col_name`
60+
#' to the names of the list elements.
6061
#' @param names_sep String specifying the separator to use in `tidyr::unnest()`
6162
#' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix
6263
#' from `new_col_name` entirely.
@@ -248,11 +249,11 @@ epi_slide = function(x, f, ..., before, after, ref_time_values,
248249
time_values = time_values[o]
249250

250251
# Compute the slide values
251-
slide_values = slider::hop_index(.x = .data_group,
252-
.i = .data_group$time_value,
253-
.f = f, ...,
254-
.starts = starts,
255-
.stops = stops)
252+
slide_values_list = slider::hop_index(.x = .data_group,
253+
.i = .data_group$time_value,
254+
.f = f, ...,
255+
.starts = starts,
256+
.stops = stops)
256257

257258
# Now figure out which rows in the data group are in the reference time
258259
# values; this will be useful for all sorts of checks that follow
@@ -265,42 +266,38 @@ epi_slide = function(x, f, ..., before, after, ref_time_values,
265266
dplyr::count(.data$time_value) %>%
266267
dplyr::pull(n)
267268

268-
# If they're all atomic vectors
269-
if (all(sapply(slide_values, is.atomic))) {
270-
if (all(sapply(slide_values, length) == 1)) {
271-
# Recycle to make size stable (one slide value per ref time value)
272-
slide_values = rep(unlist(slide_values), times = counts)
273-
}
274-
else {
275-
# Unlist, then check its length, and abort if not right
276-
slide_values = unlist(slide_values)
277-
if (length(slide_values) != num_ref_rows) {
278-
Abort("If the slide computations return atomic vectors, then they must each have a single element, or else one element per appearance of the reference time value in the local window.")
279-
}
280-
}
269+
if (!all(purrr::map_lgl(slide_values_list, is.atomic)) &&
270+
!all(purrr::map_lgl(slide_values_list, is.data.frame))) {
271+
Abort("The slide computations must return always atomic vectors or data frames (and not a mix of these two structures).")
281272
}
282-
283-
# If they're all data frames
284-
else if (all(sapply(slide_values, is.data.frame))) {
285-
if (all(sapply(slide_values, nrow) == 1)) {
286-
# Recycle to make size stable (one slide value per ref time value)
287-
slide_values = rep(slide_values, times = counts)
273+
274+
# Unlist if appropriate:
275+
slide_values =
276+
if (as_list_col) {
277+
slide_values_list
278+
} else {
279+
vctrs::list_unchop(slide_values_list)
288280
}
289-
else {
290-
# Split (each row on its own), check length, abort if not right
291-
slide_df = dplyr::bind_rows(slide_values)
292-
slide_values = split(slide_df, 1:nrow(slide_df))
293-
if (length(slide_values) != num_ref_rows) {
294-
Abort("If the slide computations return data frames, then they must each have a single row, or else one row per appearance of the reference time value in the local window.")
295-
}
281+
282+
if (all(purrr::map_int(slide_values_list, vctrs::vec_size) == 1L) &&
283+
length(slide_values_list) != 0L) {
284+
# Recycle to make size stable (one slide value per ref time value).
285+
# (Length-0 case also could be handled here, but causes difficulties;
286+
# leave it to the next branch, where it also belongs.)
287+
slide_values = vctrs::vec_rep_each(slide_values, times = counts)
288+
} else {
289+
# Split and flatten if appropriate, perform a (loose) check on number of
290+
# rows.
291+
if (as_list_col) {
292+
slide_values = purrr::list_flatten(purrr::map(
293+
slide_values, ~ vctrs::vec_split(.x, seq_len(vctrs::vec_size(.x)))[["val"]]
294+
))
295+
}
296+
if (vctrs::vec_size(slide_values) != num_ref_rows) {
297+
Abort("The slide computations must either (a) output a single element/row each, or (b) one element/row per appearance of the reference time value in the local window.")
296298
}
297299
}
298-
299-
# If neither all atomic vectors or all data frames, then abort
300-
else {
301-
Abort("The slide computations must return always atomic vectors or data frames (and not a mix of these two structures).")
302-
}
303-
300+
304301
# If all rows, then pad slide values with NAs, else filter down data group
305302
if (all_rows) {
306303
orig_values = slide_values

man/epi_slide.Rd

Lines changed: 6 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/epix_slide.Rd

Lines changed: 6 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-epi_slide.R

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,3 +86,76 @@ test_that("these doesn't produce an error; the error appears only if the ref tim
8686
dplyr::select("geo_value","slide_value_value"),
8787
dplyr::tibble(geo_value = c("ak", "al"), slide_value_value = c(2, -2))) # not out of range for either group
8888
})
89+
90+
test_that("computation output formats x as_list_col", {
91+
toy_edf = tibble::tribble(
92+
~geo_value, ~time_value, ~value ,
93+
"a" , 1:10 , 2L^( 1:10),
94+
"b" , 1:10 , 2L^(11:20),
95+
) %>%
96+
tidyr::unchop(c(time_value, value)) %>%
97+
as_epi_df(as_of = 100)
98+
# We'll try 7d sum with a few formats.
99+
basic_result_from_size1 = tibble::tribble(
100+
~geo_value, ~time_value, ~value , ~slide_value ,
101+
"a" , 1:10 , 2L^( 1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE),
102+
"b" , 1:10 , 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE),
103+
) %>%
104+
tidyr::unchop(c(time_value, value, slide_value)) %>%
105+
dplyr::arrange(time_value) %>%
106+
as_epi_df(as_of = 100)
107+
expect_identical(
108+
toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value)),
109+
basic_result_from_size1
110+
)
111+
expect_identical(
112+
toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value), as_list_col = TRUE),
113+
basic_result_from_size1 %>% dplyr::mutate(slide_value = as.list(slide_value))
114+
)
115+
expect_identical(
116+
toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value))),
117+
basic_result_from_size1 %>% rename(slide_value_value = slide_value)
118+
)
119+
expect_identical(
120+
toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), as_list_col = TRUE),
121+
basic_result_from_size1 %>%
122+
mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x)))
123+
)
124+
# output naming functionality:
125+
expect_identical(
126+
toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)),
127+
new_col_name = "result"),
128+
basic_result_from_size1 %>% rename(result_value = slide_value)
129+
)
130+
expect_identical(
131+
toy_edf %>% epi_slide(before = 6L, ~ data.frame(value_sum = sum(.x$value)),
132+
names_sep = NULL),
133+
basic_result_from_size1 %>% rename(value_sum = slide_value)
134+
)
135+
# trying with non-size-1 computation outputs:
136+
basic_result_from_size2 = tibble::tribble(
137+
~geo_value, ~time_value, ~value , ~slide_value ,
138+
"a" , 1:10 , 2L^( 1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE),
139+
"b" , 1:10 , 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE) + 1L,
140+
) %>%
141+
tidyr::unchop(c(time_value, value, slide_value)) %>%
142+
dplyr::arrange(time_value) %>%
143+
as_epi_df(as_of = 100)
144+
expect_identical(
145+
toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value) + 0:1),
146+
basic_result_from_size2
147+
)
148+
expect_identical(
149+
toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value) + 0:1, as_list_col = TRUE),
150+
basic_result_from_size2 %>% dplyr::mutate(slide_value = as.list(slide_value))
151+
)
152+
expect_identical(
153+
toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value) + 0:1)),
154+
basic_result_from_size2 %>% rename(slide_value_value = slide_value)
155+
)
156+
expect_identical(
157+
toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value) + 0:1), as_list_col = TRUE),
158+
basic_result_from_size2 %>%
159+
mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x)))
160+
)
161+
})

tests/testthat/test-epix_slide.R

Lines changed: 74 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -60,35 +60,94 @@ test_that("epix_slide works as intended",{
6060
})
6161

6262
test_that("epix_slide works as intended with `as_list_col=TRUE`",{
63-
# Note Issue #261.
64-
xx1 <- xx %>%
63+
xx_dfrow1 <- xx %>%
6564
group_by(.data$geo_value) %>%
6665
epix_slide(f = ~ data.frame(bin_sum = sum(.x$binary)),
6766
before = 2,
68-
as_list_col=TRUE)
67+
as_list_col = TRUE)
6968

70-
xx2 <- tibble(geo_value = rep("x",4),
71-
time_value = c(4,5,6,7),
72-
slide_value =
73-
c(2^3+2^2,
74-
2^6+2^3,
75-
2^10+2^9,
76-
2^15+2^14) %>%
77-
purrr::map(~ data.frame(bin_sum = .x))
78-
) %>%
69+
xx_dfrow2 <- tibble(
70+
geo_value = rep("x",4),
71+
time_value = c(4,5,6,7),
72+
slide_value =
73+
c(2^3+2^2,
74+
2^6+2^3,
75+
2^10+2^9,
76+
2^15+2^14) %>%
77+
purrr::map(~ data.frame(bin_sum = .x))
78+
) %>%
7979
group_by(geo_value)
8080

81-
expect_identical(xx1,xx2) # *
81+
expect_identical(xx_dfrow1,xx_dfrow2) # *
8282

83-
xx3 <- (
83+
xx_dfrow3 <- (
8484
xx
8585
$group_by(dplyr::across(dplyr::all_of("geo_value")))
8686
$slide(f = ~ data.frame(bin_sum = sum(.x$binary)),
8787
before = 2,
8888
as_list_col = TRUE)
8989
)
9090

91-
expect_identical(xx1,xx3) # This and * Imply xx2 and xx3 are identical
91+
expect_identical(xx_dfrow1,xx_dfrow3) # This and * Imply xx_dfrow2 and xx_dfrow3 are identical
92+
93+
xx_df1 <- xx %>%
94+
group_by(.data$geo_value) %>%
95+
epix_slide(f = ~ data.frame(bin = .x$binary),
96+
before = 2,
97+
as_list_col = TRUE)
98+
99+
xx_df2 <- tibble(
100+
geo_value = rep("x",4),
101+
time_value = c(4,5,6,7),
102+
slide_value =
103+
list(c(2^3,2^2),
104+
c(2^6,2^3),
105+
c(2^10,2^9),
106+
c(2^15,2^14)) %>%
107+
purrr::map(~ data.frame(bin = rev(.x)))
108+
) %>%
109+
group_by(geo_value)
110+
111+
expect_identical(xx_df1,xx_df2)
112+
113+
xx_scalar1 <- xx %>%
114+
group_by(.data$geo_value) %>%
115+
epix_slide(f = ~ sum(.x$binary),
116+
before = 2,
117+
as_list_col = TRUE)
118+
119+
xx_scalar2 <- tibble(
120+
geo_value = rep("x",4),
121+
time_value = c(4,5,6,7),
122+
slide_value =
123+
list(2^3+2^2,
124+
2^6+2^3,
125+
2^10+2^9,
126+
2^15+2^14)
127+
) %>%
128+
group_by(geo_value)
129+
130+
expect_identical(xx_scalar1,xx_scalar2)
131+
132+
xx_vec1 <- xx %>%
133+
group_by(.data$geo_value) %>%
134+
epix_slide(f = ~ .x$binary,
135+
before = 2,
136+
as_list_col = TRUE)
137+
138+
xx_vec2 <- tibble(
139+
geo_value = rep("x",4),
140+
time_value = c(4,5,6,7),
141+
slide_value =
142+
list(c(2^3,2^2),
143+
c(2^6,2^3),
144+
c(2^10,2^9),
145+
c(2^15,2^14)) %>%
146+
purrr::map(rev)
147+
) %>%
148+
group_by(geo_value)
149+
150+
expect_identical(xx_vec1,xx_vec2)
92151
})
93152

94153
test_that("epix_slide `before` validation works", {

0 commit comments

Comments
 (0)