Skip to content

Commit c08760c

Browse files
committed
better docs, separate compactify, improved printing
1 parent d56689d commit c08760c

File tree

4 files changed

+38
-41
lines changed

4 files changed

+38
-41
lines changed

R/epi_df.R

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,6 @@ as_epi_df.tbl_df <- function(
242242
must be present in `x`."
243243
)
244244
}
245-
246245
if (lifecycle::is_present(geo_type)) {
247246
cli_warn("epi_archive constructor argument `geo_type` is now ignored. Consider removing.")
248247
}

R/revision_analysis.R

Lines changed: 28 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -2,19 +2,19 @@
22
#' @description
33
#' `revision_summary` removes all missing values (if requested), and then
44
#' computes some basic statistics about the revision behavior of an archive,
5-
#' returning a tibble of a per-epi-key (so time_value, geo_value pair,
6-
#' possibly others based on the metadata). If `print_inform` is true, it
5+
#' returning a tibble summarizing the revisions per time_value+epi_key features. If `print_inform` is true, it
76
#' prints a concise summary. The columns returned are:
8-
#' 1. `min_lag`: the minimum time to any value (if `drop_nas=FALSE`, this
7+
#' 1. `n_revisions`: the total number of revisions for that entry
8+
#' 2. `min_lag`: the minimum time to any value (if `drop_nas=FALSE`, this
99
#' includes `NA`'s)
10-
#' 2. `max_lag`: the amount of time until the final (new) version (same caveat
10+
#' 3. `max_lag`: the amount of time until the final (new) version (same caveat
1111
#' for `drop_nas=FALSE`, though it is far less likely to matter)
12-
#' 3. `spread`: the difference between the smallest and largest values (this
12+
#' 4. `spread`: the difference between the smallest and largest values (this
1313
#' always excludes `NA` values)
14-
#' 4. `rel_spread`: `spread` divided by the largest value (so it will
14+
#' 5. `rel_spread`: `spread` divided by the largest value (so it will
1515
#' always be less than 1). Note that this need not be the final value. It will
1616
#' be `NA` whenever `spread` is 0.
17-
#' 5. `time_near_latest`: This gives the lag when the value is within
17+
#' 6. `time_near_latest`: This gives the lag when the value is within
1818
#' `within_latest` (default 20%) of the value at the latest time. For example,
1919
#' consider the series (0,20, 99, 150, 102, 100); then `time_near_latest` is
2020
#' the 5th index, since even though 99 is within 20%, it is outside the window
@@ -65,7 +65,8 @@ revision_summary <- function(epi_arch,
6565
few_revisions = 3,
6666
rel_spread_threshold = 0.1,
6767
abs_spread_threshold = NULL,
68-
compactify_tol = .Machine$double.eps^0.5) {
68+
compactify_tol = .Machine$double.eps^0.5,
69+
should_compactify = TRUE) {
6970
arg <- names(eval_select(rlang::expr(c(...)), allow_rename = FALSE, data = epi_arch$DT))
7071
if (length(arg) == 0) {
7172
first_non_key <- !(names(epi_arch$DT) %in% c(key_colnames(epi_arch), "version"))
@@ -95,12 +96,15 @@ revision_summary <- function(epi_arch,
9596
# if we're dropping NA's, we should recompactify
9697
revision_behavior <-
9798
revision_behavior %>%
98-
filter(!is.na(c_across(!!arg))) %>%
99-
arrange(across(c(geo_value, time_value, all_of(keys), version))) %>% # need to sort before compactifying
100-
compactify(c(keys, version), compactify_tol)
99+
filter(!is.na(c_across(!!arg)))
101100
} else {
102101
revision_behavior <- epi_arch$DT
103102
}
103+
if (should_compactify) {
104+
revision_behavior <- revision_behavior %>%
105+
arrange(across(c(geo_value, time_value, all_of(keys), version))) %>% # need to sort before compactifying
106+
compactify(c(keys, version), compactify_tol)
107+
}
104108
revision_behavior <-
105109
revision_behavior %>%
106110
mutate(lag = as.integer(version) - as.integer(time_value)) %>% # nolint: object_usage_linter
@@ -122,35 +126,36 @@ revision_summary <- function(epi_arch,
122126
) %>%
123127
select(-time_to)
124128
if (print_inform) {
125-
cli_inform("Number of revisions:")
126129
cli_inform("Min lag (time to first version):")
127130
difftime_summary(revision_behavior$min_lag) %>% print()
128131
if (!drop_nas) {
129132
total_na <- epi_arch$DT %>%
130133
filter(is.na(c_across(!!arg))) %>% # nolint: object_usage_linter
131134
nrow()
132135
cli_inform("Fraction of all versions that are `NA`:")
133-
cli_li(num_percent(total_na, nrow(epi_arch$DT), "versions"))
136+
cli_li(num_percent(total_na, nrow(epi_arch$DT), ""))
137+
cli_inform("")
134138
}
139+
cli_inform("Fraction of epi_key+time_values with")
135140
total_num <- nrow(revision_behavior) # nolint: object_usage_linter
136141
total_num_unrevised <- sum(revision_behavior$n_revisions == 0) # nolint: object_usage_linter
137142
cli_inform("No revisions:")
138-
cli_li(num_percent(total_num_unrevised, total_num, "entries"))
143+
cli_li(num_percent(total_num_unrevised, total_num, ""))
139144
total_quickly_revised <- sum( # nolint: object_usage_linter
140145
revision_behavior$max_lag <=
141146
as.difftime(quick_revision, units = "days")
142147
)
143148
cli_inform("Quick revisions (last revision within {quick_revision}
144149
{units(quick_revision)} of the `time_value`):")
145-
cli_li(num_percent(total_quickly_revised, total_num, "entries"))
150+
cli_li(num_percent(total_quickly_revised, total_num, ""))
146151
total_barely_revised <- sum( # nolint: object_usage_linter
147152
revision_behavior$n_revisions <=
148153
few_revisions
149154
)
150155
cli_inform("Few revisions (At most {few_revisions} revisions for that `time_value`):")
151-
cli_li(num_percent(total_barely_revised, total_num, "entries"))
156+
cli_li(num_percent(total_barely_revised, total_num, ""))
152157
cli_inform("")
153-
cli_inform("Changes in Value:")
158+
cli_inform("Fraction of revised epi_key+time_values which have:")
154159

155160
real_revisions <- revision_behavior %>% filter(n_revisions > 0) # nolint: object_usage_linter
156161
n_real_revised <- nrow(real_revisions) # nolint: object_usage_linter
@@ -159,17 +164,17 @@ revision_summary <- function(epi_arch,
159164
rel_spread_threshold,
160165
na.rm = TRUE
161166
) + sum(is.na(real_revisions$rel_spread))
162-
cli_inform("Less than {rel_spread_threshold} spread in relative value (only from the revised subset):")
163-
cli_li(num_percent(rel_spread, n_real_revised, "revised entries"))
164-
na_rel_spread <- sum(is.na(real_revisions$rel_spread)) # nolint: object_usage_linter
165-
cli_inform("{units(quick_revision)} until within {within_latest*100}% of the latest value:")
166-
difftime_summary(revision_behavior[["time_near_latest"]]) %>% print()
167+
cli_inform("Less than {rel_spread_threshold} spread in relative value:")
168+
cli_li(num_percent(rel_spread, n_real_revised, ""))
167169
abs_spread <- sum( # nolint: object_usage_linter
168170
real_revisions$spread >
169171
abs_spread_threshold
170172
) # nolint: object_usage_linter
171173
cli_inform("Spread of more than {abs_spread_threshold} in actual value (when revised):")
172-
cli_li(num_percent(abs_spread, n_real_revised, "revised entries"))
174+
cli_li(num_percent(abs_spread, n_real_revised, ""))
175+
176+
cli_inform("{units(quick_revision)} until within {within_latest*100}% of the latest value:")
177+
difftime_summary(revision_behavior[["time_near_latest"]]) %>% print()
173178
}
174179
return(revision_behavior)
175180
}

tests/testthat/_snaps/revision-latency-functions.md

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,29 +3,27 @@
33
Code
44
dummy_ex %>% revision_summary() %>% print(n = 10, width = 300)
55
Message
6-
Number of revisions:
76
Min lag (time to first version):
87
Output
98
min median mean max
109
0 days 1 days 1.6 days 4 days
1110
Message
11+
Fraction of epi_key+time_values with
1212
No revisions:
1313
* 3 out of 7 (42.86%)
1414
Quick revisions (last revision within 3 days of the `time_value`):
1515
* 4 out of 7 (57.14%)
1616
Few revisions (At most 3 revisions for that `time_value`):
1717
* 6 out of 7 (85.71%)
18-
Changes in Value:
19-
Less than 0.1 spread in relative value (only from the revised subset):
18+
Fraction of revised epi_key+time_values which have:
19+
Less than 0.1 spread in relative value:
2020
* 1 out of 4 (25%)
21+
Spread of more than 5.1 in actual value (when revised):
22+
* 3 out of 4 (75%)
2123
days until within 20% of the latest value:
2224
Output
2325
min median mean max
2426
0 days 3 days 6.9 days 19 days
25-
Message
26-
Spread of more than 5.1 in actual value (when revised):
27-
* 3 out of 4 (75%)
28-
Output
2927
# A tibble: 7 x 8
3028
time_value geo_value n_revisions min_lag max_lag spread rel_spread
3129
<date> <chr> <dbl> <drtn> <drtn> <dbl> <dbl>
@@ -51,31 +49,29 @@
5149
Code
5250
dummy_ex %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, width = 300)
5351
Message
54-
Number of revisions:
5552
Min lag (time to first version):
5653
Output
5754
min median mean max
5855
0 days 1 days 1.4 days 4 days
5956
Message
6057
Fraction of all versions that are `NA`:
6158
* 2 out of 19 (10.53%)
59+
Fraction of epi_key+time_values with
6260
No revisions:
6361
* 1 out of 7 (14.29%)
6462
Quick revisions (last revision within 3 days of the `time_value`):
6563
* 3 out of 7 (42.86%)
6664
Few revisions (At most 3 revisions for that `time_value`):
6765
* 6 out of 7 (85.71%)
68-
Changes in Value:
69-
Less than 0.1 spread in relative value (only from the revised subset):
66+
Fraction of revised epi_key+time_values which have:
67+
Less than 0.1 spread in relative value:
68+
* 3 out of 6 (50%)
69+
Spread of more than 5.1 in actual value (when revised):
7070
* 3 out of 6 (50%)
7171
days until within 20% of the latest value:
7272
Output
7373
min median mean max
7474
0 days 3 days 6.9 days 19 days
75-
Message
76-
Spread of more than 5.1 in actual value (when revised):
77-
* 3 out of 6 (50%)
78-
Output
7975
# A tibble: 7 x 8
8076
time_value geo_value n_revisions min_lag max_lag spread rel_spread
8177
<date> <chr> <dbl> <drtn> <drtn> <dbl> <dbl>

tests/testthat/test-revision-latency-functions.R

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,6 @@ test_that("revision_summary works for a dummy dataset", {
3535
revision_summary() %>%
3636
print(n = 10, width = 300)
3737
expect_snapshot(dummy_ex %>% revision_summary() %>% print(n = 10, width = 300))
38-
dummy_ex %>%
39-
revision_summary(drop_nas = FALSE) %>%
40-
print(n = 10, width = 300)
4138
expect_snapshot(dummy_ex %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, width = 300))
4239
})
4340
test_that("tidyselect is functional", {

0 commit comments

Comments
 (0)