2
2
# ' @description
3
3
# ' `revision_summary` removes all missing values (if requested), and then
4
4
# ' computes some basic statistics about the revision behavior of an archive,
5
- # ' returning a tibble of a per-epi-key (so time_value, geo_value pair,
6
- # ' possibly others based on the metadata). If `print_inform` is true, it
5
+ # ' returning a tibble summarizing the revisions per time_value+epi_key features. If `print_inform` is true, it
7
6
# ' prints a concise summary. The columns returned are:
8
- # ' 1. `min_lag`: the minimum time to any value (if `drop_nas=FALSE`, this
7
+ # ' 1. `n_revisions`: the total number of revisions for that entry
8
+ # ' 2. `min_lag`: the minimum time to any value (if `drop_nas=FALSE`, this
9
9
# ' includes `NA`'s)
10
- # ' 2 . `max_lag`: the amount of time until the final (new) version (same caveat
10
+ # ' 3 . `max_lag`: the amount of time until the final (new) version (same caveat
11
11
# ' for `drop_nas=FALSE`, though it is far less likely to matter)
12
- # ' 3 . `spread`: the difference between the smallest and largest values (this
12
+ # ' 4 . `spread`: the difference between the smallest and largest values (this
13
13
# ' always excludes `NA` values)
14
- # ' 4 . `rel_spread`: `spread` divided by the largest value (so it will
14
+ # ' 5 . `rel_spread`: `spread` divided by the largest value (so it will
15
15
# ' always be less than 1). Note that this need not be the final value. It will
16
16
# ' be `NA` whenever `spread` is 0.
17
- # ' 5 . `time_near_latest`: This gives the lag when the value is within
17
+ # ' 6 . `time_near_latest`: This gives the lag when the value is within
18
18
# ' `within_latest` (default 20%) of the value at the latest time. For example,
19
19
# ' consider the series (0,20, 99, 150, 102, 100); then `time_near_latest` is
20
20
# ' the 5th index, since even though 99 is within 20%, it is outside the window
@@ -65,7 +65,8 @@ revision_summary <- function(epi_arch,
65
65
few_revisions = 3 ,
66
66
rel_spread_threshold = 0.1 ,
67
67
abs_spread_threshold = NULL ,
68
- compactify_tol = .Machine $ double.eps ^ 0.5 ) {
68
+ compactify_tol = .Machine $ double.eps ^ 0.5 ,
69
+ should_compactify = TRUE ) {
69
70
arg <- names(eval_select(rlang :: expr(c(... )), allow_rename = FALSE , data = epi_arch $ DT ))
70
71
if (length(arg ) == 0 ) {
71
72
first_non_key <- ! (names(epi_arch $ DT ) %in% c(key_colnames(epi_arch ), " version" ))
@@ -95,12 +96,15 @@ revision_summary <- function(epi_arch,
95
96
# if we're dropping NA's, we should recompactify
96
97
revision_behavior <-
97
98
revision_behavior %> %
98
- filter(! is.na(c_across(!! arg ))) %> %
99
- arrange(across(c(geo_value , time_value , all_of(keys ), version ))) %> % # need to sort before compactifying
100
- compactify(c(keys , version ), compactify_tol )
99
+ filter(! is.na(c_across(!! arg )))
101
100
} else {
102
101
revision_behavior <- epi_arch $ DT
103
102
}
103
+ if (should_compactify ) {
104
+ revision_behavior <- revision_behavior %> %
105
+ arrange(across(c(geo_value , time_value , all_of(keys ), version ))) %> % # need to sort before compactifying
106
+ compactify(c(keys , version ), compactify_tol )
107
+ }
104
108
revision_behavior <-
105
109
revision_behavior %> %
106
110
mutate(lag = as.integer(version ) - as.integer(time_value )) %> % # nolint: object_usage_linter
@@ -122,35 +126,36 @@ revision_summary <- function(epi_arch,
122
126
) %> %
123
127
select(- time_to )
124
128
if (print_inform ) {
125
- cli_inform(" Number of revisions:" )
126
129
cli_inform(" Min lag (time to first version):" )
127
130
difftime_summary(revision_behavior $ min_lag ) %> % print()
128
131
if (! drop_nas ) {
129
132
total_na <- epi_arch $ DT %> %
130
133
filter(is.na(c_across(!! arg ))) %> % # nolint: object_usage_linter
131
134
nrow()
132
135
cli_inform(" Fraction of all versions that are `NA`:" )
133
- cli_li(num_percent(total_na , nrow(epi_arch $ DT ), " versions" ))
136
+ cli_li(num_percent(total_na , nrow(epi_arch $ DT ), " " ))
137
+ cli_inform(" " )
134
138
}
139
+ cli_inform(" Fraction of epi_key+time_values with" )
135
140
total_num <- nrow(revision_behavior ) # nolint: object_usage_linter
136
141
total_num_unrevised <- sum(revision_behavior $ n_revisions == 0 ) # nolint: object_usage_linter
137
142
cli_inform(" No revisions:" )
138
- cli_li(num_percent(total_num_unrevised , total_num , " entries " ))
143
+ cli_li(num_percent(total_num_unrevised , total_num , " " ))
139
144
total_quickly_revised <- sum( # nolint: object_usage_linter
140
145
revision_behavior $ max_lag < =
141
146
as.difftime(quick_revision , units = " days" )
142
147
)
143
148
cli_inform(" Quick revisions (last revision within {quick_revision}
144
149
{units(quick_revision)} of the `time_value`):" )
145
- cli_li(num_percent(total_quickly_revised , total_num , " entries " ))
150
+ cli_li(num_percent(total_quickly_revised , total_num , " " ))
146
151
total_barely_revised <- sum( # nolint: object_usage_linter
147
152
revision_behavior $ n_revisions < =
148
153
few_revisions
149
154
)
150
155
cli_inform(" Few revisions (At most {few_revisions} revisions for that `time_value`):" )
151
- cli_li(num_percent(total_barely_revised , total_num , " entries " ))
156
+ cli_li(num_percent(total_barely_revised , total_num , " " ))
152
157
cli_inform(" " )
153
- cli_inform(" Changes in Value :" )
158
+ cli_inform(" Fraction of revised epi_key+time_values which have :" )
154
159
155
160
real_revisions <- revision_behavior %> % filter(n_revisions > 0 ) # nolint: object_usage_linter
156
161
n_real_revised <- nrow(real_revisions ) # nolint: object_usage_linter
@@ -159,17 +164,17 @@ revision_summary <- function(epi_arch,
159
164
rel_spread_threshold ,
160
165
na.rm = TRUE
161
166
) + sum(is.na(real_revisions $ rel_spread ))
162
- cli_inform(" Less than {rel_spread_threshold} spread in relative value (only from the revised subset):" )
163
- cli_li(num_percent(rel_spread , n_real_revised , " revised entries" ))
164
- na_rel_spread <- sum(is.na(real_revisions $ rel_spread )) # nolint: object_usage_linter
165
- cli_inform(" {units(quick_revision)} until within {within_latest*100}% of the latest value:" )
166
- difftime_summary(revision_behavior [[" time_near_latest" ]]) %> % print()
167
+ cli_inform(" Less than {rel_spread_threshold} spread in relative value:" )
168
+ cli_li(num_percent(rel_spread , n_real_revised , " " ))
167
169
abs_spread <- sum( # nolint: object_usage_linter
168
170
real_revisions $ spread >
169
171
abs_spread_threshold
170
172
) # nolint: object_usage_linter
171
173
cli_inform(" Spread of more than {abs_spread_threshold} in actual value (when revised):" )
172
- cli_li(num_percent(abs_spread , n_real_revised , " revised entries" ))
174
+ cli_li(num_percent(abs_spread , n_real_revised , " " ))
175
+
176
+ cli_inform(" {units(quick_revision)} until within {within_latest*100}% of the latest value:" )
177
+ difftime_summary(revision_behavior [[" time_near_latest" ]]) %> % print()
173
178
}
174
179
return (revision_behavior )
175
180
}
0 commit comments