cmu-delphi
diff --git a/‎NAMESPACE
Lines changed: 4 additions & 7 deletions b/‎NAMESPACE
Lines changed: 4 additions & 7 deletions
diff --git a/‎NEWS.md
Lines changed: 19 additions & 0 deletions b/‎NEWS.md
Lines changed: 19 additions & 0 deletions
diff --git a/‎R/grouped_epi_archive.R
Lines changed: 57 additions & 28 deletions b/‎R/grouped_epi_archive.R
Lines changed: 57 additions & 28 deletions
diff --git a/‎R/methods-epi_df.R
Lines changed: 66 additions & 82 deletions b/‎R/methods-epi_df.R
Lines changed: 66 additions & 82 deletions
@@ -1,26 +1,22 @@
 # Generated by roxygen2: do not edit by hand
 
 S3method("[",epi_df)
-S3method(arrange,epi_df)
+S3method("names<-",epi_df)
 S3method(as_epi_df,data.frame)
 S3method(as_epi_df,epi_df)
 S3method(as_epi_df,tbl_df)
 S3method(as_epi_df,tbl_ts)
 S3method(as_tsibble,epi_df)
 S3method(dplyr_col_modify,col_modify_recorder_df)
-S3method(filter,epi_df)
+S3method(dplyr_col_modify,epi_df)
+S3method(dplyr_reconstruct,epi_df)
 S3method(group_by,epi_archive)
 S3method(group_by,epi_df)
 S3method(group_by,grouped_epi_archive)
 S3method(group_by_drop_default,grouped_epi_archive)
-S3method(group_modify,epi_df)
-S3method(mutate,epi_df)
 S3method(next_after,Date)
 S3method(next_after,integer)
 S3method(print,epi_df)
-S3method(relocate,epi_df)
-S3method(rename,epi_df)
-S3method(slice,epi_df)
 S3method(summary,epi_df)
 S3method(ungroup,epi_df)
 S3method(unnest,epi_df)
@@ -66,6 +62,7 @@ importFrom(data.table,set)
 importFrom(data.table,setkeyv)
 importFrom(dplyr,arrange)
 importFrom(dplyr,dplyr_col_modify)
+importFrom(dplyr,dplyr_reconstruct)
 importFrom(dplyr,filter)
 importFrom(dplyr,group_by)
 importFrom(dplyr,group_by_drop_default)
 
@@ -4,9 +4,28 @@ Note that `epiprocess` uses the [Semantic Versioning
 ("semver")](https://semver.org/) scheme for all release versions, but not for
 development versions. A ".9999" suffix indicates a development version.
 
+## Breaking changes:
+
+* `epi_slide` and `epix_slide` now keep the grouping of `x` in their results,
+  like `dplyr::mutate` and `dplyr::group_modify`. To obtain the old behavior,
+  `dplyr::ungroup` immediately after slides.
+
+## Potentially-breaking changes:
+
+* Fixed `[` on grouped `epi_df`s to maintain the grouping if possible when
+  dropping the `epi_df` class (e.g., when removing the `time_value` column).
+* Fixed `group_modify` on `epi_df`s to decay to non-`epi_df`s when an `epi_df`
+  result doesn't make sense (e.g., when removing the `time_value` column).
+
+## Improvements:
+
+* Fixed `epi_slide` and `group_modify` on grouped `epi_df`s to not drop `epi_df`
+  class.
+
 ## Cleanup:
 
 * Added a `NEWS.md` file to track changes to the package.
+* Implemented `?dplyr::dplyr_extending` for `epi_df`s (#223).
 
 # epiprocess 0.5.0:
 
 
@@ -215,24 +215,55 @@ grouped_epi_archive =
             # Symbolize column name
             new_col = sym(new_col_name)
 
-            # Key variable names, apart from time value and version
-            key_vars = setdiff(key(private$ungrouped$DT), c("time_value", "version"))
+            # Each computation is expected to output a data frame with either
+            # one element/row total or one element/row per encountered
+            # nongrouping, nontime, nonversion key value. These nongrouping,
+            # nontime, nonversion key columns can be seen as the "effective" key
+            # of the computation; the computation might return an object that
+            # reports a different key or no key, but the "effective" key should
+            # still be a valid unique key for the data, and is something that we
+            # could use even with `.keep = FALSE`.
+            comp_effective_key_vars =
+              setdiff(key(private$ungrouped$DT),
+                      c(private$vars, "time_value", "version"))
 
             # Computation for one group, one time value
-            comp_one_grp = function(.data_group,
+            comp_one_grp = function(.data_group, .group_key,
                                     f, ..., 
-                                    time_value,
-                                    key_vars,
+                                    ref_time_value,
+                                    comp_effective_key_vars,
                                     new_col) {
               # Carry out the specified computation 
-              comp_value = f(.data_group, ...)
+              comp_value = f(.data_group, .group_key, ...)
 
-              # Count the number of appearances of the reference time value.
-              # Note: ideally, we want to directly count occurrences of the ref
-              # time value but due to latency, this will often not appear in the
-              # data group. So we count the number of unique key values, outside 
-              # of the time value column
-              count = sum(!duplicated(.data_group[, key_vars]))
+              # Calculate the number of output elements/rows we expect the
+              # computation to output: one per distinct "effective computation
+              # key variable" value encountered in the input. Note: this mirrors
+              # how `epi_slide` does things if we're using unique keys, but can
+              # diverge if using nonunique keys. The `epi_slide` approach of
+              # counting occurrences of the `ref_time_value` in the `time_value`
+              # column, which helps lines up the computation results with
+              # corresponding rows of the input data, wouldn't quite apply here:
+              # we'd want to line up with rows (from the same group) with
+              # `version` matching the `ref_time_value`, but would still need to
+              # summarize these rows somehow and drop the `time_value` input
+              # column, but this summarization requires something like a
+              # to-be-unique output key to determine a sensible number of rows
+              # to output (and the contents of those rows).
+              count =
+                if (length(comp_effective_key_vars) != 0L) {
+                  sum(!duplicated(.data_group[, comp_effective_key_vars]))
+                } else {
+                  # Same idea as above, but accounting for `duplicated` not
+                  # working as we want on 0 columns. (Should be the same as if
+                  # we were counting distinct values of a column defined as
+                  # `rep(val, target_n_rows)`.)
+                  if (nrow(.data_group) == 0L) {
+                    0L
+                  } else {
+                    1L
+                  }
+                }
 
               # If we get back an atomic vector
               if (is.atomic(comp_value)) {
@@ -241,7 +272,7 @@ grouped_epi_archive =
                 }
                 # If not a singleton, should be the right length, else abort
                 else if (length(comp_value) != count) {
-                  Abort("If the slide computation returns an atomic vector, then it must have a single element, or else one element per appearance of the reference time value in the local window.")
+                  Abort('If the slide computation returns an atomic vector, then it must have either (a) a single element, or (b) one element per distinct combination of key variables, excluding the `time_value`, `version`, and grouping variables, that is present in the first argument to the computation.')
                 }
               }
 
@@ -256,7 +287,7 @@ grouped_epi_archive =
                 }
                 # Make into a list
                 else {
-                  comp_value = split(comp_value, 1:nrow(comp_value))
+                  comp_value = split(comp_value, seq_len(nrow(comp_value)))
                 }
               }
 
@@ -265,10 +296,9 @@ grouped_epi_archive =
                 Abort("The slide computation must return an atomic vector or a data frame.")
               }
 
-              # Note that we've already recycled comp value to make size stable,
-              # so tibble() will just recycle time value appropriately
-              return(tibble::tibble(time_value = time_value, 
-                                    !!new_col := comp_value))
+              # Label every result row with the `ref_time_value`:
+              return(tibble::tibble(time_value = rep(.env$ref_time_value, count),
+                                    !!new_col := .env$comp_value))
             }
 
             # If f is not missing, then just go ahead, slide by group
@@ -278,12 +308,12 @@ grouped_epi_archive =
                 private$ungrouped$as_of(ref_time_value, min_time_value = ref_time_value - before) %>%
                   dplyr::group_by(dplyr::across(tidyselect::all_of(private$vars)),
                                   .drop=private$drop) %>%
-                  dplyr::summarize(comp_one_grp(dplyr::cur_data_all(),
-                                                f = f, ...,
-                                                time_value = ref_time_value,
-                                                key_vars = key_vars,
-                                                new_col = new_col),
-                                   .groups = groups)
+                  dplyr::group_modify(comp_one_grp,
+                                      f = f, ...,
+                                      ref_time_value = ref_time_value,
+                                      comp_effective_key_vars = comp_effective_key_vars,
+                                      new_col = new_col,
+                                      .keep = TRUE)
               })
             }
 
@@ -306,11 +336,10 @@ grouped_epi_archive =
                   dplyr::group_by(dplyr::across(tidyselect::all_of(private$vars))) %>%
                   dplyr::group_modify(comp_one_grp,
                                       f = f, quo = quo,
-                                      time_value = ref_time_value,
-                                      key_vars = key_vars,
+                                      ref_time_value = ref_time_value,
+                                      comp_effective_key_vars = comp_effective_key_vars,
                                       new_col = new_col,
-                                      .keep = TRUE) %>%
-                  dplyr::ungroup()
+                                      .keep = TRUE)
               })
             }
 
 
@@ -19,8 +19,7 @@ as_tsibble.epi_df = function(x, key, ...) {
 
 #' Base S3 methods for an `epi_df` object
 #'
-#' Print, summary, and `dplyr` verbs (that preserve class and attributes) for an
-#' `epi_df` object.
+#' Print and summary functions for an `epi_df` object.
 #'
 #' @param x The `epi_df` object.
 #' @param ... Additional arguments passed to methods.
@@ -65,119 +64,104 @@ summary.epi_df = function(object, ...) {
                          dplyr::summarize(mean(.data$num)))))
 }
 
+#' Drop any `epi_df` metadata and class on a data frame
+#'
+#' Useful in implementing `?dplyr_extending` when manipulations cause invariants
+#' of `epi_df`s to be violated and we need to return some other class. Note that
+#' this will maintain any grouping (keeping the `grouped_df` class and
+#' associated attributes, if present).
+#'
+#' @param x an `epi_df` or other data frame
+#' @return `x` with any metadata dropped and the `"epi_df"` class, if previously
+#'   present, dropped
+#'
+#' @noRd
+decay_epi_df = function(x) {
+  attributes(x)$metadata <- NULL
+  class(x) <- class(x)[class(x) != "epi_df"]
+  x
+}
+
+# Implementing `dplyr_extending`: `geo_type` and `time_type` are scalar
+# attributes dependent on columns, and `other_keys` acts like an attribute
+# vectorized over columns; `dplyr_extending` advice says to implement
+# `dplyr_reconstruct`, 1d `[`, `dplyr_col_modify`, and `names<-`, but not
+# `dplyr_row_slice`.  We'll implement `[` to allow either 1d or 2d.
+
+#' @param data tibble or `epi_df` (`dplyr` feeds in former, but we may
+#'   directly feed in latter from our other methods)
+#' @param template `epi_df` template to use to restore
+#' @return `epi_df` or degrade into `tbl_df`
+#' @importFrom dplyr dplyr_reconstruct
 #' @export
-`[.epi_df` <- function(x, i, j, drop = FALSE) { 
-  res <- NextMethod() 
-  
-  if (!is.data.frame(res)) return(res)
-  
-  if (missing(i)) {
-    i <- NULL
-  } 
-  
-  if (missing(j)) {
-    j <- NULL
-  } 
+#' @noRd
+dplyr_reconstruct.epi_df = function(data, template) {
+  # Start from a reconstruction for the backing S3 classes; this ensures that we
+  # keep any grouping that has been applied:
+  res <- NextMethod()
 
   cn <- names(res)
-  
+
   # Duplicate columns, Abort
   dup_col_names = cn[duplicated(cn)]
   if (length(dup_col_names) != 0) {
     Abort(paste0("Column name(s) ", 
                  paste(unique(dup_col_names), 
                        collapse = ", "), " must not be duplicated."))
-  } 
+  }
 
   not_epi_df <- !("time_value" %in% cn) || !("geo_value" %in% cn)
 
   if (not_epi_df) {
-    attributes(res)$metadata <- NULL 
-    return(tibble::as_tibble(res))
+    # If we're calling on an `epi_df` from one of our own functions, we need to
+    # decay to a non-`epi_df` result. If `dplyr` is calling, `x` is a tibble,
+    # `res` is not an `epi_df` yet (but might, e.g., be a `grouped_df`), and we
+    # simply need to skip adding the metadata & class. Current `decay_epi_df`
+    # should work in both cases.
+    return(decay_epi_df(res))
   }
 
-  # Use reclass as safeguard (in case class &/or metadata are dropped)
-  res <- reclass(res, attr(x, "metadata"))
+  res <- reclass(res, attr(template, "metadata"))
+
+  # XXX we may want verify the `geo_type` and `time_type` here. If it's
+  # significant overhead, we may also want to keep this less strict version
+  # around and implement some extra S3 methods that use it, when appropriate.
 
   # Amend additional metadata if some other_keys cols are dropped in the subset
-  old_other_keys = attr(x, "metadata")$other_keys
+  old_other_keys = attr(template, "metadata")$other_keys
   attr(res, "metadata")$other_keys <- old_other_keys[old_other_keys %in% cn]
 
   res
 }
 
-#' `dplyr` verbs
-#'
-#' `dplyr` verbs for `epi_df` objects, preserving class and attributes. 
-#'
-#' @method arrange epi_df
-#' @param .data The `epi_df` object.
-#' @rdname print.epi_df
-#' @export
-arrange.epi_df = function(.data, ...) {
-  metadata = attributes(.data)$metadata
-  .data = NextMethod()
-  reclass(.data, metadata)
-}
-
-#' @method filter epi_df
-#' @rdname print.epi_df
-#' @export
-filter.epi_df = function(.data, ...) {
-  metadata = attributes(.data)$metadata
-  .data = NextMethod()
-  reclass(.data, metadata)
-}
-
-#' @method group_by epi_df
-#' @rdname print.epi_df
 #' @export
-group_by.epi_df = function(.data, ...) {
-  metadata = attributes(.data)$metadata
-  .data = NextMethod()
-  reclass(.data, metadata)
-}
-
-#' @method group_modify epi_df
-#' @rdname print.epi_df
-#' @export
-group_modify.epi_df = function(.data, ...) {
-  metadata = attributes(.data)$metadata
-  .data = NextMethod()
-  reclass(.data, metadata)
-}
-
-#' @method mutate epi_df
-#' @rdname print.epi_df
-#' @export
-mutate.epi_df = function(.data, ...) {
-  metadata = attributes(.data)$metadata
-  .data = NextMethod()
-  reclass(.data, metadata)
+`[.epi_df` <- function(x, i, j, drop = FALSE) {
+  res <- NextMethod()
+  
+  if (!is.data.frame(res)) return(res)
+  
+  dplyr::dplyr_reconstruct(res, x)
 }
 
-#' @method relocate epi_df
-#' @rdname print.epi_df
+#' @importFrom dplyr dplyr_col_modify
 #' @export
-relocate.epi_df = function(.data, ...) {
-  metadata = attributes(.data)$metadata
-  .data = NextMethod()
-  reclass(.data, metadata)
+dplyr_col_modify.epi_df = function(data, cols) {
+  dplyr::dplyr_reconstruct(NextMethod(), data)
 }
 
-#' @method rename epi_df
-#' @rdname print.epi_df
 #' @export
-rename.epi_df = function(.data, ...) {
-  metadata = attributes(.data)$metadata
-  .data = NextMethod()
-  reclass(.data, metadata)
+`names<-.epi_df` = function(x, value) {
+  old_names = names(x)
+  old_other_keys = attributes(x)$metadata$other_keys
+  result = NextMethod()
+  attributes(x)$metadata$other_keys <- value[match(old_other_keys, old_names)]
+  dplyr::dplyr_reconstruct(result, result)
 }
 
-#' @method slice epi_df
+#' @method group_by epi_df
 #' @rdname print.epi_df
 #' @export
-slice.epi_df = function(.data, ...) {
+group_by.epi_df = function(.data, ...) {
   metadata = attributes(.data)$metadata
   .data = NextMethod()
   reclass(.data, metadata)