better docs, separate compactify, improved printing

dsweber2 · dsweber2 · commit c08760cb60c5 · 2024-08-13T11:11:23.000-05:00
diff --git a/R/epi_df.R b/R/epi_df.R
@@ -242,7 +242,6 @@ as_epi_df.tbl_df <- function(
        must be present in `x`."
     )
   }
-
   if (lifecycle::is_present(geo_type)) {
     cli_warn("epi_archive constructor argument `geo_type` is now ignored. Consider removing.")
   }
diff --git a/R/revision_analysis.R b/R/revision_analysis.R
@@ -2,19 +2,19 @@
 #' @description
 #' `revision_summary` removes all missing values (if requested), and then
 #'   computes some basic statistics about the revision behavior of an archive,
-#'   returning a tibble of a per-epi-key (so time_value, geo_value pair,
-#'   possibly others based on the metadata). If `print_inform` is true, it
+#'   returning a tibble that summarizes the revisions for each combination of `time_value` and the epi key columns. If `print_inform` is true, it
 #'   prints a concise summary. The columns returned are:
-#'  1. `min_lag`: the minimum time to any value (if `drop_nas=FALSE`, this
+#'  1. `n_revisions`: the total number of revisions for that entry
+#'  2. `min_lag`: the minimum time to any value (if `drop_nas=FALSE`, this
 #'   includes `NA`'s)
-#'  2. `max_lag`: the amount of time until the final (new) version (same caveat
+#'  3. `max_lag`: the amount of time until the final (new) version (same caveat
 #'   for `drop_nas=FALSE`, though it is far less likely to matter)
-#'  3. `spread`: the difference between the smallest and largest values (this
+#'  4. `spread`: the difference between the smallest and largest values (this
 #'   always excludes `NA` values)
-#'  4. `rel_spread`: `spread` divided by the largest value (so it will
+#'  5. `rel_spread`: `spread` divided by the largest value (so it will
 #'   always be less than 1). Note that this need not be the final value. It will
 #'   be `NA` whenever `spread` is 0.
-#'  5. `time_near_latest`: This gives the lag when the value is within
+#'  6. `time_near_latest`: This gives the lag when the value is within
 #'   `within_latest` (default 20%) of the value at the latest time. For example,
 #'   consider the series (0,20, 99, 150, 102, 100); then `time_near_latest` is
 #'   the 5th index, since even though 99 is within 20%, it is outside the window
@@ -65,7 +65,8 @@ revision_summary <- function(epi_arch,
                              few_revisions = 3,
                              rel_spread_threshold = 0.1,
                              abs_spread_threshold = NULL,
-                             compactify_tol = .Machine$double.eps^0.5) {
+                             compactify_tol = .Machine$double.eps^0.5,
+                             should_compactify = TRUE) {
   arg <- names(eval_select(rlang::expr(c(...)), allow_rename = FALSE, data = epi_arch$DT))
   if (length(arg) == 0) {
     first_non_key <- !(names(epi_arch$DT) %in% c(key_colnames(epi_arch), "version"))
@@ -95,12 +96,15 @@ revision_summary <- function(epi_arch,
     # if we're dropping NA's, we should recompactify
     revision_behavior <-
       revision_behavior %>%
-      filter(!is.na(c_across(!!arg))) %>%
-      arrange(across(c(geo_value, time_value, all_of(keys), version))) %>% # need to sort before compactifying
-      compactify(c(keys, version), compactify_tol)
+      filter(!is.na(c_across(!!arg)))
   } else {
     revision_behavior <- epi_arch$DT
   }
+  if (should_compactify) {
+    revision_behavior <- revision_behavior %>%
+      arrange(across(c(geo_value, time_value, all_of(keys), version))) %>% # need to sort before compactifying
+      compactify(c(keys, version), compactify_tol)
+  }
   revision_behavior <-
     revision_behavior %>%
     mutate(lag = as.integer(version) - as.integer(time_value)) %>% # nolint: object_usage_linter
@@ -122,35 +126,36 @@ revision_summary <- function(epi_arch,
     ) %>%
     select(-time_to)
   if (print_inform) {
-    cli_inform("Number of revisions:")
     cli_inform("Min lag (time to first version):")
     difftime_summary(revision_behavior$min_lag) %>% print()
     if (!drop_nas) {
       total_na <- epi_arch$DT %>%
         filter(is.na(c_across(!!arg))) %>% # nolint: object_usage_linter
         nrow()
       cli_inform("Fraction of all versions that are `NA`:")
-      cli_li(num_percent(total_na, nrow(epi_arch$DT), "versions"))
+      cli_li(num_percent(total_na, nrow(epi_arch$DT), ""))
+      cli_inform("")
     }
+    cli_inform("Fraction of epi_key+time_values with")
     total_num <- nrow(revision_behavior) # nolint: object_usage_linter
     total_num_unrevised <- sum(revision_behavior$n_revisions == 0) # nolint: object_usage_linter
     cli_inform("No revisions:")
-    cli_li(num_percent(total_num_unrevised, total_num, "entries"))
+    cli_li(num_percent(total_num_unrevised, total_num, ""))
     total_quickly_revised <- sum( # nolint: object_usage_linter
       revision_behavior$max_lag <=
         as.difftime(quick_revision, units = "days")
     )
     cli_inform("Quick revisions (last revision within {quick_revision}
 {units(quick_revision)} of the `time_value`):")
-    cli_li(num_percent(total_quickly_revised, total_num, "entries"))
+    cli_li(num_percent(total_quickly_revised, total_num, ""))
     total_barely_revised <- sum( # nolint: object_usage_linter
       revision_behavior$n_revisions <=
         few_revisions
     )
     cli_inform("Few revisions (At most {few_revisions} revisions for that `time_value`):")
-    cli_li(num_percent(total_barely_revised, total_num, "entries"))
+    cli_li(num_percent(total_barely_revised, total_num, ""))
     cli_inform("")
-    cli_inform("Changes in Value:")
+    cli_inform("Fraction of revised epi_key+time_values which have:")
 
     real_revisions <- revision_behavior %>% filter(n_revisions > 0) # nolint: object_usage_linter
     n_real_revised <- nrow(real_revisions) # nolint: object_usage_linter
@@ -159,17 +164,17 @@ revision_summary <- function(epi_arch,
         rel_spread_threshold,
       na.rm = TRUE
     ) + sum(is.na(real_revisions$rel_spread))
-    cli_inform("Less than {rel_spread_threshold} spread in relative value (only from the revised subset):")
-    cli_li(num_percent(rel_spread, n_real_revised, "revised entries"))
-    na_rel_spread <- sum(is.na(real_revisions$rel_spread)) # nolint: object_usage_linter
-    cli_inform("{units(quick_revision)} until within {within_latest*100}% of the latest value:")
-    difftime_summary(revision_behavior[["time_near_latest"]]) %>% print()
+    cli_inform("Less than {rel_spread_threshold} spread in relative value:")
+    cli_li(num_percent(rel_spread, n_real_revised, ""))
     abs_spread <- sum( # nolint: object_usage_linter
       real_revisions$spread >
         abs_spread_threshold
     ) # nolint: object_usage_linter
     cli_inform("Spread of more than {abs_spread_threshold} in actual value (when revised):")
-    cli_li(num_percent(abs_spread, n_real_revised, "revised entries"))
+    cli_li(num_percent(abs_spread, n_real_revised, ""))
+
+    cli_inform("{units(quick_revision)} until within {within_latest*100}% of the latest value:")
+    difftime_summary(revision_behavior[["time_near_latest"]]) %>% print()
   }
   return(revision_behavior)
 }
diff --git a/tests/testthat/_snaps/revision-latency-functions.md b/tests/testthat/_snaps/revision-latency-functions.md
@@ -3,29 +3,27 @@
     Code
       dummy_ex %>% revision_summary() %>% print(n = 10, width = 300)
     Message
-      Number of revisions:
       Min lag (time to first version):
     Output
            min median     mean    max
         0 days 1 days 1.6 days 4 days
     Message
+      Fraction of epi_key+time_values with
       No revisions:
       * 3 out of 7 (42.86%)
       Quick revisions (last revision within 3 days of the `time_value`):
       * 4 out of 7 (57.14%)
       Few revisions (At most 3 revisions for that `time_value`):
       * 6 out of 7 (85.71%)
-      Changes in Value:
-      Less than 0.1 spread in relative value (only from the revised subset):
+      Fraction of revised epi_key+time_values which have:
+      Less than 0.1 spread in relative value:
       * 1 out of 4 (25%)
+      Spread of more than 5.1 in actual value (when revised):
+      * 3 out of 4 (75%)
       days until within 20% of the latest value:
     Output
            min median     mean     max
         0 days 3 days 6.9 days 19 days
-    Message
-      Spread of more than 5.1 in actual value (when revised):
-      * 3 out of 4 (75%)
-    Output
       # A tibble: 7 x 8
         time_value geo_value n_revisions min_lag max_lag spread rel_spread
         <date>     <chr>           <dbl> <drtn>  <drtn>   <dbl>      <dbl>
@@ -51,31 +49,29 @@
     Code
       dummy_ex %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, width = 300)
     Message
-      Number of revisions:
       Min lag (time to first version):
     Output
            min median     mean    max
         0 days 1 days 1.4 days 4 days
     Message
       Fraction of all versions that are `NA`:
       * 2 out of 19 (10.53%)
+      Fraction of epi_key+time_values with
       No revisions:
       * 1 out of 7 (14.29%)
       Quick revisions (last revision within 3 days of the `time_value`):
       * 3 out of 7 (42.86%)
       Few revisions (At most 3 revisions for that `time_value`):
       * 6 out of 7 (85.71%)
-      Changes in Value:
-      Less than 0.1 spread in relative value (only from the revised subset):
+      Fraction of revised epi_key+time_values which have:
+      Less than 0.1 spread in relative value:
+      * 3 out of 6 (50%)
+      Spread of more than 5.1 in actual value (when revised):
       * 3 out of 6 (50%)
       days until within 20% of the latest value:
     Output
            min median     mean     max
         0 days 3 days 6.9 days 19 days
-    Message
-      Spread of more than 5.1 in actual value (when revised):
-      * 3 out of 6 (50%)
-    Output
       # A tibble: 7 x 8
         time_value geo_value n_revisions min_lag max_lag spread rel_spread
         <date>     <chr>           <dbl> <drtn>  <drtn>   <dbl>      <dbl>
diff --git a/tests/testthat/test-revision-latency-functions.R b/tests/testthat/test-revision-latency-functions.R
@@ -35,9 +35,6 @@ test_that("revision_summary works for a dummy dataset", {
     revision_summary() %>%
     print(n = 10, width = 300)
   expect_snapshot(dummy_ex %>% revision_summary() %>% print(n = 10, width = 300))
-  dummy_ex %>%
-    revision_summary(drop_nas = FALSE) %>%
-    print(n = 10, width = 300)
   expect_snapshot(dummy_ex %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, width = 300))
 })
 test_that("tidyselect is functional", {

Original file line number	Diff line number	Diff line change
`@@ -242,7 +242,6 @@ as_epi_df.tbl_df <- function(`
`242`	`242`	must be present in `x`."
`243`	`243`	`)`
`244`	`244`	`}`
`245`		`-`
`246`	`245`	`if (lifecycle::is_present(geo_type)) {`
`247`	`246`	cli_warn("epi_archive constructor argument `geo_type` is now ignored. Consider removing.")
`248`	`247`	`}`