cmu-delphi · dsweber2 · Aug 13, 2024 · Jul 23, 2024 · Jul 24, 2024 · Jul 24, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: epiprocess
 Title: Tools for basic signal processing in epidemiology
-Version: 0.8.2
+Version: 0.8.3
 Authors@R: c(
     person("Jacob", "Bien", role = "ctb"),
     person("Logan", "Brooks", email = "[email protected]", role = c("aut", "cre")),
@@ -31,6 +31,7 @@ Imports:
     data.table,
     dplyr (>= 1.0.8),
     genlasso,
+    glue,
     ggplot2,
     glue,
     lifecycle (>= 1.0.1),
@@ -83,6 +84,7 @@ Collate:
     'methods-epi_df.R'
     'outliers.R'
     'reexports.R'
+    'revision_analysis.R'
     'slide.R'
     'utils.R'
     'utils_pipe.R'
diff --git a/NAMESPACE b/NAMESPACE
@@ -81,6 +81,7 @@ export(new_epi_df)
 export(next_after)
 export(relocate)
 export(rename)
+export(revision_summary)
 export(slice)
 export(time_column_names)
 export(ungroup)
@@ -111,6 +112,7 @@ importFrom(checkmate,vname)
 importFrom(cli,cat_line)
 importFrom(cli,cli_abort)
 importFrom(cli,cli_inform)
+importFrom(cli,cli_li)
 importFrom(cli,cli_vec)
 importFrom(cli,cli_warn)
 importFrom(cli,format_message)
@@ -128,8 +130,11 @@ importFrom(data.table,set)
 importFrom(data.table,setDF)
 importFrom(data.table,setkeyv)
 importFrom(dplyr,"%>%")
+importFrom(dplyr,across)
+importFrom(dplyr,all_of)
 importFrom(dplyr,arrange)
 importFrom(dplyr,bind_rows)
+importFrom(dplyr,c_across)
 importFrom(dplyr,dplyr_col_modify)
 importFrom(dplyr,dplyr_reconstruct)
 importFrom(dplyr,dplyr_row_slice)
@@ -142,11 +147,17 @@ importFrom(dplyr,group_vars)
 importFrom(dplyr,groups)
 importFrom(dplyr,if_all)
 importFrom(dplyr,if_any)
+importFrom(dplyr,if_else)
+importFrom(dplyr,lag)
 importFrom(dplyr,mutate)
+importFrom(dplyr,near)
+importFrom(dplyr,pick)
+importFrom(dplyr,pull)
 importFrom(dplyr,relocate)
 importFrom(dplyr,rename)
 importFrom(dplyr,select)
 importFrom(dplyr,slice)
+importFrom(dplyr,summarize)
 importFrom(dplyr,tibble)
 importFrom(dplyr,ungroup)
 importFrom(ggplot2,autoplot)
@@ -178,6 +189,7 @@ importFrom(rlang,is_formula)
 importFrom(rlang,is_function)
 importFrom(rlang,is_missing)
 importFrom(rlang,is_quosure)
+importFrom(rlang,list2)
 importFrom(rlang,missing_arg)
 importFrom(rlang,new_function)
 importFrom(rlang,quo_get_expr)

diff --git a/NEWS.md b/NEWS.md
@@ -8,6 +8,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
 
 - Added `complete.epi_df`, which fills in missing values in an `epi_df` with
   `NA`s. Uses `tidyr::complete` underneath and preserves `epi_df` metadata.
+- Inclusion of the function `revision_summary` to provide basic revision information for `epi_archive`s out of the box. (#492)
 
 ## Bug fixes
 

diff --git a/R/archive.R b/R/archive.R
@@ -240,6 +240,7 @@ NULL
 #'   value of `clobberable_versions_start` does not fully trust these empty
 #'   updates, and assumes that any version `>= max(x$version)` could be
 #'   clobbered.) If `nrow(x) == 0`, then this argument is mandatory.
+#' @param compactify_tol double. the tolerance used to detect approximate equality for compactification
 #' @return An `epi_archive` object.
 #'
 #' @importFrom data.table as.data.table key setkeyv
@@ -295,15 +296,16 @@ new_epi_archive <- function(
     additional_metadata,
     compactify,
     clobberable_versions_start,
-    versions_end) {
+    versions_end,
+    compactify_tol = .Machine$double.eps^0.5) {
   # Create the data table; if x was an un-keyed data.table itself,
   # then the call to as.data.table() will fail to set keys, so we
   # need to check this, then do it manually if needed
   key_vars <- c("geo_value", "time_value", other_keys, "version")
-  DT <- as.data.table(x, key = key_vars) # nolint: object_name_linter
-  if (!identical(key_vars, key(DT))) setkeyv(DT, cols = key_vars)
+  data_table <- as.data.table(x, key = key_vars) # nolint: object_name_linter
+  if (!identical(key_vars, key(data_table))) setkeyv(data_table, cols = key_vars)
 
-  if (anyDuplicated(DT, by = key(DT)) != 0L) {
+  if (anyDuplicated(data_table, by = key(data_table)) != 0L) {
     cli_abort("`x` must have one row per unique combination of the key variables. If you
             have additional key variables other than `geo_value`, `time_value`, and
             `version`, such as an age group column, please specify them in `other_keys`.
@@ -313,38 +315,17 @@ new_epi_archive <- function(
     )
   }
 
-  # Checks to see if a value in a vector is LOCF
-  is_locf <- function(vec) { # nolint: object_usage_linter
-    dplyr::if_else(!is.na(vec) & !is.na(dplyr::lag(vec)),
-      vec == dplyr::lag(vec),
-      is.na(vec) & is.na(dplyr::lag(vec))
-    )
-  }
-
-  # LOCF is defined by a row where all values except for the version
-  # differ from their respective lag values
-
-  # Checks for LOCF's in a data frame
-  rm_locf <- function(df) {
-    dplyr::filter(df, if_any(c(everything(), -version), ~ !is_locf(.))) # nolint: object_usage_linter
-  }
-
-  # Keeps LOCF values, such as to be printed
-  keep_locf <- function(df) {
-    dplyr::filter(df, if_all(c(everything(), -version), ~ is_locf(.))) # nolint: object_usage_linter
-  }
-
+  nrow_before_compactify <- nrow(data_table)
   # Runs compactify on data frame
   if (is.null(compactify) || compactify == TRUE) {
-    elim <- keep_locf(DT)
-    DT <- rm_locf(DT) # nolint: object_name_linter
+    compactified <- apply_compactify(data_table, key_vars, compactify_tol)
   } else {
-    # Create empty data frame for nrow(elim) to be 0
-    elim <- tibble::tibble()
+    compactified <- data_table
   }
-
-  # Warns about redundant rows
-  if (is.null(compactify) && nrow(elim) > 0) {
+  # Warns about redundant rows if the number of rows decreased, and we didn't
+  # explicitly say to compactify
+  if (is.null(compactify) && nrow(compactified) < nrow_before_compactify) {
+    elim <- removed_by_compactify(data_table, key_vars, compactify_tol)
     warning_intro <- cli::format_inline(
       "Found rows that appear redundant based on
             last (version of each) observation carried forward;
@@ -366,7 +347,7 @@ new_epi_archive <- function(
 
   structure(
     list(
-      DT = DT,
+      DT = compactified,
       geo_type = geo_type,
       time_type = time_type,
       additional_metadata = additional_metadata,
@@ -377,6 +358,63 @@ new_epi_archive <- function(
   )
 }
 
+#' given a tibble as would be found in an epi_archive, remove duplicate entries.
+#' @description
+#' works by shifting all rows except the version, then comparing values to see
+#'   if they've changed. We need to arrange in descending order, but note that
+#'   we don't need to group, since at least one column other than version has
+#'   changed, and so is kept.
+#' @keywords internal
+#' @importFrom dplyr filter
+apply_compactify <- function(df, keys, tolerance = .Machine$double.eps^.5) {
+  df %>%
+    arrange(!!!keys) %>%
+    filter(if_any(
+      c(everything(), -version), # all non-version columns
+      ~ !is_locf(., tolerance)
+    ))
+}
+
+#' get the entries that `compactify` would remove
+#' @keywords internal
+#' @importFrom dplyr filter if_all everything
+removed_by_compactify <- function(df, keys, tolerance) {
+  df %>%
+    arrange(!!!keys) %>%
+    filter(if_all(
+      c(everything(), -version),
+      ~ is_locf(., tolerance)
+    )) # nolint: object_usage_linter
+}
+
+#' Checks to see if a value in a vector is LOCF
+#' @description
+#' LOCF meaning last observation carried forward. lags the vector by 1, then
+#'   compares with itself. For doubles it uses float comparison via
+#'   [`dplyr::near`], otherwise it uses equality.  `NA`'s and `NaN`'s are
+#'   considered equal to themselves and each other.
+#' @importFrom dplyr lag if_else near
+#' @keywords internal
+is_locf <- function(vec, tolerance) { # nolint: object_usage_linter
+  lag_vec <- dplyr::lag(vec)
+  if (typeof(vec) == "double") {
+    res <- if_else(
+      !is.na(vec) & !is.na(lag_vec),
+      near(vec, lag_vec, tol = tolerance),
+      is.na(vec) & is.na(lag_vec)
+    )
+    return(res)
+  } else {
+    res <- if_else(
+      !is.na(vec) & !is.na(lag_vec),
+      vec == lag_vec,
+      is.na(vec) & is.na(lag_vec)
+    )
+    return(res)
+  }
+}
+
+
 #' `validate_epi_archive` ensures correctness of arguments fed to `as_epi_archive`.
 #'
 #' @rdname epi_archive

diff --git a/R/epi_df.R b/R/epi_df.R
@@ -242,7 +242,6 @@ as_epi_df.tbl_df <- function(
        must be present in `x`."
     )
   }
-
   if (lifecycle::is_present(geo_type)) {
     cli_warn("epi_archive constructor argument `geo_type` is now ignored. Consider removing.")
   }
-Original file line number
+Diff line change
@@ Expand Up / @@ -242,7 +242,6 @@ as_epi_df.tbl_df <- function( @@
            must be present in `x`."
         )
       }
       if (lifecycle::is_present(geo_type)) {
         cli_warn("epi_archive constructor argument `geo_type` is now ignored. Consider removing.")
       }
@@ Expand Down @@