Make as_list_col=TRUE consistent for vecs and dfs from slide comps

lcbrooks · lcbrooks · commit 8985db2b9066 · 2023-05-24T05:52:07.000-07:00
diff --git a/NEWS.md b/NEWS.md
@@ -20,6 +20,12 @@ inter-release development versions will include an additional ".9999" suffix.
   * To keep the old behavior, convert the output of `epix_slide()` to `epi_df`
     when desired and set the metadata appropriately.
 
+## Improvements:
+
+* `epi_slide` and `epix_slide` now support `as_list_col = TRUE` when the slide
+  computations output atomic vectors; the result is then a list column in
+  "chopped" format (see `tidyr::chop`).
+
 # epiprocess 0.6.0
 
 ## Breaking changes:
diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R
@@ -279,11 +279,12 @@ grouped_epi_archive =
               if (! (is.atomic(comp_value) || is.data.frame(comp_value))) {
                 Abort("The slide computation must return an atomic vector or a data frame.")
               }
-              if (is.data.frame(comp_value)) {
-                # Wrap in a list so that we get a list-type col rather than a
-                # data.frame-type col when `as_list_col = TRUE`:
-                comp_value <- list(comp_value)
-              }
+              # Wrap the computation output in a list and unchop/unnest later if
+              # `as_list_col = FALSE`. This approach means that we will get a
+              # list-class col rather than a data.frame-class col when
+              # `as_list_col = TRUE` and the computation outputs are data
+              # frames.
+              comp_value <- list(comp_value)
               
               # Label every result row with the `ref_time_value`:
               return(tibble::tibble(time_value = .env$ref_time_value,
@@ -426,8 +427,8 @@ grouped_epi_archive =
                 )
               })
             }
-            
-            # Unnest if we need to
+
+            # Unchop/unnest if we need to
             if (!as_list_col) {
               x = tidyr::unnest(x, !!new_col, names_sep = names_sep)
             }
diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R
@@ -707,11 +707,12 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr
 #' @param new_col_name String indicating the name of the new column that will
 #'   contain the derivative values. Default is "slide_value"; note that setting
 #'   `new_col_name` equal to an existing column name will overwrite this column.
-#' @param as_list_col If the computations return data frames, should the slide
-#'   result hold these in a single list column or try to unnest them? Default is
-#'   `FALSE`, in which case a list object returned by `f` would be unnested
-#'   (using [`tidyr::unnest()`]), and the names of the resulting columns are given
-#'   by prepending `new_col_name` to the names of the list elements.
+#' @param as_list_col Should the slide results be held in a list column, or be
+#'   [unchopped][tidyr::unchop]/[unnested][tidyr::unnest]? Default is `FALSE`,
+#'   in which case a list object returned by `f` would be unnested (using
+#'   [`tidyr::unnest()`]), and, if the slide computations output data frames,
+#'   the names of the resulting columns are given by prepending `new_col_name`
+#'   to the names of the list elements.
 #' @param names_sep String specifying the separator to use in `tidyr::unnest()`
 #'   when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix
 #'   from `new_col_name` entirely.
diff --git a/R/slide.R b/R/slide.R
@@ -52,11 +52,12 @@
 #' @param new_col_name String indicating the name of the new column that will
 #'   contain the derivative values. Default is "slide_value"; note that setting
 #'   `new_col_name` equal to an existing column name will overwrite this column.
-#' @param as_list_col If the computations return data frames, should the slide
-#'   result hold these in a single list column or try to unnest them? Default is
-#'   `FALSE`, in which case a list object returned by `f` would be unnested
-#'   (using [`tidyr::unnest()`]), and the names of the resulting columns are given
-#'   by prepending `new_col_name` to the names of the list elements.
+#' @param as_list_col Should the slide results be held in a list column, or be
+#'   [unchopped][tidyr::unchop]/[unnested][tidyr::unnest]? Default is `FALSE`,
+#'   in which case a list object returned by `f` would be unnested (using
+#'   [`tidyr::unnest()`]), and, if the slide computations output data frames,
+#'   the names of the resulting columns are given by prepending `new_col_name`
+#'   to the names of the list elements.
 #' @param names_sep String specifying the separator to use in `tidyr::unnest()`
 #'   when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix
 #'   from `new_col_name` entirely.
@@ -248,11 +249,11 @@ epi_slide = function(x, f, ..., before, after, ref_time_values,
     time_values = time_values[o] 
     
     # Compute the slide values 
-    slide_values = slider::hop_index(.x = .data_group,
-                                     .i = .data_group$time_value,
-                                     .f = f, ...,
-                                     .starts = starts,
-                                     .stops = stops)
+    slide_values_list = slider::hop_index(.x = .data_group,
+                                          .i = .data_group$time_value,
+                                          .f = f, ...,
+                                          .starts = starts,
+                                          .stops = stops)
 
     # Now figure out which rows in the data group are in the reference time
     # values; this will be useful for all sorts of checks that follow
@@ -265,42 +266,38 @@ epi_slide = function(x, f, ..., before, after, ref_time_values,
       dplyr::count(.data$time_value) %>%
       dplyr::pull(n)
 
-    # If they're all atomic vectors 
-    if (all(sapply(slide_values, is.atomic))) {
-      if (all(sapply(slide_values, length) == 1)) {
-        # Recycle to make size stable (one slide value per ref time value)
-        slide_values = rep(unlist(slide_values), times = counts)
-      }
-      else {
-        # Unlist, then check its length, and abort if not right
-        slide_values = unlist(slide_values)
-        if (length(slide_values) != num_ref_rows) {
-          Abort("If the slide computations return atomic vectors, then they must each have a single element, or else one element per appearance of the reference time value in the local window.")
-        }
-      }
+    if (!all(purrr::map_lgl(slide_values_list, is.atomic)) &&
+          !all(purrr::map_lgl(slide_values_list, is.data.frame))) {
+      Abort("The slide computations must return always atomic vectors or data frames (and not a mix of these two structures).")
     }
-      
-    # If they're all data frames
-    else if (all(sapply(slide_values, is.data.frame))) {
-      if (all(sapply(slide_values, nrow) == 1)) {
-        # Recycle to make size stable (one slide value per ref time value)
-        slide_values = rep(slide_values, times = counts)
+    
+    # Unchop (combine the list of outputs) unless keeping a list column:
+    slide_values =
+      if (as_list_col) {
+        slide_values_list
+      } else {
+        vctrs::list_unchop(slide_values_list)
       }
-      else {
-        # Split (each row on its own), check length, abort if not right
-        slide_df = dplyr::bind_rows(slide_values)
-        slide_values = split(slide_df, 1:nrow(slide_df))
-        if (length(slide_values) != num_ref_rows) {
-          Abort("If the slide computations return data frames, then they must each have a single row, or else one row per appearance of the reference time value in the local window.")
-        }
+
+    if (all(purrr::map_int(slide_values_list, vctrs::vec_size) == 1L) &&
+          length(slide_values_list) != 0L) {
+      # Recycle to make size stable (one slide value per ref time value).
+      # (Length-0 case also could be handled here, but causes difficulties;
+      # leave it to the next branch, where it also belongs.)
+      slide_values = vctrs::vec_rep_each(slide_values, times = counts)
+    } else {
+      # Split and flatten if appropriate, perform a (loose) check on number of
+      # rows.
+      if (as_list_col) {
+        slide_values = purrr::list_flatten(purrr::map(
+          slide_values, ~ vctrs::vec_split(.x, seq_len(vctrs::vec_size(.x)))[["val"]]
+        ))
+      }
+      if (vctrs::vec_size(slide_values) != num_ref_rows) {
+        Abort("The slide computations must either (a) output a single element/row each, or (b) one element/row per appearance of the reference time value in the local window.")
       }
     }
-      
-    # If neither all atomic vectors or all data frames, then abort 
-    else {
-      Abort("The slide computations must return always atomic vectors or data frames (and not a mix of these two structures).")
-    }      
-    
+
     # If all rows, then pad slide values with NAs, else filter down data group
     if (all_rows) {
       orig_values = slide_values
diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd
diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd
diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R
@@ -86,3 +86,76 @@ test_that("these doesn't produce an error; the error appears only if the ref tim
                      dplyr::select("geo_value","slide_value_value"), 
                    dplyr::tibble(geo_value = c("ak", "al"), slide_value_value = c(2, -2))) # not out of range for either group
 })
+
+test_that("computation output formats x as_list_col", {
+  toy_edf = tibble::tribble(
+    ~geo_value, ~time_value, ~value    ,
+    "a"       , 1:10       , 2L^( 1:10),
+    "b"       , 1:10       , 2L^(11:20),
+    ) %>%
+    tidyr::unchop(c(time_value, value)) %>%
+    as_epi_df(as_of = 100)
+  # We'll try a 7-day sum with a few computation output formats.
+  basic_result_from_size1 = tibble::tribble(
+    ~geo_value, ~time_value, ~value    , ~slide_value                                    ,
+    "a"       , 1:10       , 2L^( 1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE),
+    "b"       , 1:10       , 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE),
+    ) %>%
+    tidyr::unchop(c(time_value, value, slide_value)) %>%
+    dplyr::arrange(time_value) %>%
+    as_epi_df(as_of = 100)
+  expect_identical(
+    toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value)),
+    basic_result_from_size1
+  )
+  expect_identical(
+    toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value), as_list_col = TRUE),
+    basic_result_from_size1 %>% dplyr::mutate(slide_value = as.list(slide_value))
+  )
+  expect_identical(
+    toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value))),
+    basic_result_from_size1 %>% rename(slide_value_value = slide_value)
+  )
+  expect_identical(
+    toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), as_list_col = TRUE),
+    basic_result_from_size1 %>%
+      mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x)))
+  )
+  # output naming functionality:
+  expect_identical(
+    toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)),
+                          new_col_name = "result"),
+    basic_result_from_size1 %>% rename(result_value = slide_value)
+  )
+  expect_identical(
+    toy_edf %>% epi_slide(before = 6L, ~ data.frame(value_sum = sum(.x$value)),
+                          names_sep = NULL),
+    basic_result_from_size1 %>% rename(value_sum = slide_value)
+  )
+  # trying with non-size-1 computation outputs:
+  basic_result_from_size2 = tibble::tribble(
+    ~geo_value, ~time_value, ~value    , ~slide_value                                    ,
+    "a"       , 1:10       , 2L^( 1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE),
+    "b"       , 1:10       , 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE) + 1L,
+    ) %>%
+    tidyr::unchop(c(time_value, value, slide_value)) %>%
+    dplyr::arrange(time_value) %>%
+    as_epi_df(as_of = 100)
+  expect_identical(
+    toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value) + 0:1),
+    basic_result_from_size2
+  )
+  expect_identical(
+    toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value) + 0:1, as_list_col = TRUE),
+    basic_result_from_size2 %>% dplyr::mutate(slide_value = as.list(slide_value))
+  )
+  expect_identical(
+    toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value) + 0:1)),
+    basic_result_from_size2 %>% rename(slide_value_value = slide_value)
+  )
+  expect_identical(
+    toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value) + 0:1), as_list_col = TRUE),
+    basic_result_from_size2 %>%
+      mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x)))
+  )
+})
diff --git a/tests/testthat/test-epix_slide.R b/tests/testthat/test-epix_slide.R
@@ -60,35 +60,94 @@ test_that("epix_slide works as intended",{
 })
 
 test_that("epix_slide works as intended with `as_list_col=TRUE`",{
-  # Note Issue #261.
-  xx1 <- xx %>%
+  xx_dfrow1 <- xx %>%
     group_by(.data$geo_value) %>%
     epix_slide(f = ~ data.frame(bin_sum = sum(.x$binary)),
                before = 2,
-               as_list_col=TRUE)
+               as_list_col = TRUE)
   
-  xx2 <- tibble(geo_value = rep("x",4),
-                time_value = c(4,5,6,7),
-                slide_value =
-                  c(2^3+2^2,
-                    2^6+2^3,
-                    2^10+2^9,
-                    2^15+2^14) %>%
-                  purrr::map(~ data.frame(bin_sum = .x))
-                ) %>%
+  xx_dfrow2 <- tibble(
+    geo_value = rep("x",4),
+    time_value = c(4,5,6,7),
+    slide_value =
+      c(2^3+2^2,
+        2^6+2^3,
+        2^10+2^9,
+        2^15+2^14) %>%
+      purrr::map(~ data.frame(bin_sum = .x))
+  ) %>%
     group_by(geo_value)
   
-  expect_identical(xx1,xx2) # *
+  expect_identical(xx_dfrow1,xx_dfrow2) # *
   
-  xx3 <- (
+  xx_dfrow3 <- (
     xx
     $group_by(dplyr::across(dplyr::all_of("geo_value")))
     $slide(f = ~ data.frame(bin_sum = sum(.x$binary)),
            before = 2,
            as_list_col = TRUE)
   )
   
-  expect_identical(xx1,xx3) # This and * Imply xx2 and xx3 are identical
+  expect_identical(xx_dfrow1,xx_dfrow3) # This and * Imply xx_dfrow2 and xx_dfrow3 are identical
+  
+  xx_df1 <- xx %>%
+    group_by(.data$geo_value) %>%
+    epix_slide(f = ~ data.frame(bin = .x$binary),
+               before = 2,
+               as_list_col = TRUE)
+  
+  xx_df2 <- tibble(
+    geo_value = rep("x",4),
+    time_value = c(4,5,6,7),
+    slide_value =
+      list(c(2^3,2^2),
+           c(2^6,2^3),
+           c(2^10,2^9),
+           c(2^15,2^14)) %>%
+      purrr::map(~ data.frame(bin = rev(.x)))
+  ) %>%
+    group_by(geo_value)
+  
+  expect_identical(xx_df1,xx_df2)
+
+  xx_scalar1 <- xx %>%
+    group_by(.data$geo_value) %>%
+    epix_slide(f = ~ sum(.x$binary),
+               before = 2,
+               as_list_col = TRUE)
+  
+  xx_scalar2 <- tibble(
+    geo_value = rep("x",4),
+    time_value = c(4,5,6,7),
+    slide_value =
+      list(2^3+2^2,
+           2^6+2^3,
+           2^10+2^9,
+           2^15+2^14)
+  ) %>%
+    group_by(geo_value)
+  
+  expect_identical(xx_scalar1,xx_scalar2)
+  
+  xx_vec1 <- xx %>%
+    group_by(.data$geo_value) %>%
+    epix_slide(f = ~ .x$binary,
+               before = 2,
+               as_list_col = TRUE)
+  
+  xx_vec2 <- tibble(
+    geo_value = rep("x",4),
+    time_value = c(4,5,6,7),
+    slide_value = 
+      list(c(2^3,2^2),
+           c(2^6,2^3),
+           c(2^10,2^9),
+           c(2^15,2^14)) %>%
+      purrr::map(rev)
+  ) %>%
+    group_by(geo_value)
+  
+  expect_identical(xx_vec1,xx_vec2)
 })
 
 test_that("epix_slide `before` validation works", {
diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd