Skip to content

Commit 8ff000e

Browse files
committed
Merge branch 'dev' into arrange-cannonical
2 parents 08ac040 + 1f44295 commit 8ff000e

22 files changed

+776
-53
lines changed

DESCRIPTION

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ Imports:
3131
data.table,
3232
dplyr (>= 1.0.8),
3333
genlasso,
34+
glue,
3435
ggplot2,
3536
glue,
3637
lifecycle (>= 1.0.1),
@@ -83,6 +84,7 @@ Collate:
8384
'methods-epi_df.R'
8485
'outliers.R'
8586
'reexports.R'
87+
'revision_analysis.R'
8688
'slide.R'
8789
'utils.R'
8890
'utils_pipe.R'

NAMESPACE

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ export(new_epi_df)
8484
export(next_after)
8585
export(relocate)
8686
export(rename)
87+
export(revision_summary)
8788
export(slice)
8889
export(time_column_names)
8990
export(ungroup)
@@ -114,6 +115,7 @@ importFrom(checkmate,vname)
114115
importFrom(cli,cat_line)
115116
importFrom(cli,cli_abort)
116117
importFrom(cli,cli_inform)
118+
importFrom(cli,cli_li)
117119
importFrom(cli,cli_vec)
118120
importFrom(cli,cli_warn)
119121
importFrom(cli,format_message)
@@ -131,8 +133,11 @@ importFrom(data.table,set)
131133
importFrom(data.table,setDF)
132134
importFrom(data.table,setkeyv)
133135
importFrom(dplyr,"%>%")
136+
importFrom(dplyr,across)
137+
importFrom(dplyr,all_of)
134138
importFrom(dplyr,arrange)
135139
importFrom(dplyr,bind_rows)
140+
importFrom(dplyr,c_across)
136141
importFrom(dplyr,dplyr_col_modify)
137142
importFrom(dplyr,dplyr_reconstruct)
138143
importFrom(dplyr,dplyr_row_slice)
@@ -145,11 +150,17 @@ importFrom(dplyr,group_vars)
145150
importFrom(dplyr,groups)
146151
importFrom(dplyr,if_all)
147152
importFrom(dplyr,if_any)
153+
importFrom(dplyr,if_else)
154+
importFrom(dplyr,lag)
148155
importFrom(dplyr,mutate)
156+
importFrom(dplyr,near)
157+
importFrom(dplyr,pick)
158+
importFrom(dplyr,pull)
149159
importFrom(dplyr,relocate)
150160
importFrom(dplyr,rename)
151161
importFrom(dplyr,select)
152162
importFrom(dplyr,slice)
163+
importFrom(dplyr,summarize)
153164
importFrom(dplyr,tibble)
154165
importFrom(dplyr,ungroup)
155166
importFrom(ggplot2,autoplot)
@@ -181,6 +192,7 @@ importFrom(rlang,is_formula)
181192
importFrom(rlang,is_function)
182193
importFrom(rlang,is_missing)
183194
importFrom(rlang,is_quosure)
195+
importFrom(rlang,list2)
184196
importFrom(rlang,missing_arg)
185197
importFrom(rlang,new_function)
186198
importFrom(rlang,quo_get_expr)

NEWS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
88

99
- Added `complete.epi_df`, which fills in missing values in an `epi_df` with
1010
`NA`s. Uses `tidyr::complete` underneath and preserves `epi_df` metadata.
11+
- Inclusion of the function `revision_summary` to provide basic revision information for `epi_archive`s out of the box. (#492)
1112

1213
## Bug fixes
1314

R/archive.R

Lines changed: 71 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,7 @@ NULL
240240
#' value of `clobberable_versions_start` does not fully trust these empty
241241
#' updates, and assumes that any version `>= max(x$version)` could be
242242
#' clobbered.) If `nrow(x) == 0`, then this argument is mandatory.
243+
#' @param compactify_tol double. the tolerance used to detect approximate equality for compactification
243244
#' @return An `epi_archive` object.
244245
#'
245246
#' @importFrom data.table as.data.table key setkeyv
@@ -295,15 +296,16 @@ new_epi_archive <- function(
295296
additional_metadata,
296297
compactify,
297298
clobberable_versions_start,
298-
versions_end) {
299+
versions_end,
300+
compactify_tol = .Machine$double.eps^0.5) {
299301
# Create the data table; if x was an un-keyed data.table itself,
300302
# then the call to as.data.table() will fail to set keys, so we
301303
# need to check this, then do it manually if needed
302304
key_vars <- c("geo_value", "time_value", other_keys, "version")
303-
DT <- as.data.table(x, key = key_vars) # nolint: object_name_linter
304-
if (!identical(key_vars, key(DT))) setkeyv(DT, cols = key_vars)
305+
data_table <- as.data.table(x, key = key_vars) # nolint: object_name_linter
306+
if (!identical(key_vars, key(data_table))) setkeyv(data_table, cols = key_vars)
305307

306-
if (anyDuplicated(DT, by = key(DT)) != 0L) {
308+
if (anyDuplicated(data_table, by = key(data_table)) != 0L) {
307309
cli_abort("`x` must have one row per unique combination of the key variables. If you
308310
have additional key variables other than `geo_value`, `time_value`, and
309311
`version`, such as an age group column, please specify them in `other_keys`.
@@ -313,38 +315,17 @@ new_epi_archive <- function(
313315
)
314316
}
315317

316-
# Checks to see if a value in a vector is LOCF
317-
is_locf <- function(vec) { # nolint: object_usage_linter
318-
dplyr::if_else(!is.na(vec) & !is.na(dplyr::lag(vec)),
319-
vec == dplyr::lag(vec),
320-
is.na(vec) & is.na(dplyr::lag(vec))
321-
)
322-
}
323-
324-
# LOCF is defined by a row where all values except for the version
325-
# differ from their respective lag values
326-
327-
# Checks for LOCF's in a data frame
328-
rm_locf <- function(df) {
329-
dplyr::filter(df, if_any(c(everything(), -version), ~ !is_locf(.))) # nolint: object_usage_linter
330-
}
331-
332-
# Keeps LOCF values, such as to be printed
333-
keep_locf <- function(df) {
334-
dplyr::filter(df, if_all(c(everything(), -version), ~ is_locf(.))) # nolint: object_usage_linter
335-
}
336-
318+
nrow_before_compactify <- nrow(data_table)
337319
# Runs compactify on data frame
338320
if (is.null(compactify) || compactify == TRUE) {
339-
elim <- keep_locf(DT)
340-
DT <- rm_locf(DT) # nolint: object_name_linter
321+
compactified <- apply_compactify(data_table, key_vars, compactify_tol)
341322
} else {
342-
# Create empty data frame for nrow(elim) to be 0
343-
elim <- tibble::tibble()
323+
compactified <- data_table
344324
}
345-
346-
# Warns about redundant rows
347-
if (is.null(compactify) && nrow(elim) > 0) {
325+
# Warns about redundant rows if the number of rows decreased, and we didn't
326+
# explicitly say to compactify
327+
if (is.null(compactify) && nrow(compactified) < nrow_before_compactify) {
328+
elim <- removed_by_compactify(data_table, key_vars, compactify_tol)
348329
warning_intro <- cli::format_inline(
349330
"Found rows that appear redundant based on
350331
last (version of each) observation carried forward;
@@ -366,7 +347,7 @@ new_epi_archive <- function(
366347

367348
structure(
368349
list(
369-
DT = DT,
350+
DT = compactified,
370351
geo_type = geo_type,
371352
time_type = time_type,
372353
additional_metadata = additional_metadata,
@@ -377,6 +358,63 @@ new_epi_archive <- function(
377358
)
378359
}
379360

361+
#' given a tibble as would be found in an epi_archive, remove duplicate entries.
362+
#' @description
363+
#' works by shifting all rows except the version, then comparing values to see
364+
#' if they've changed. We need to arrange in descending order, but note that
365+
#' we don't need to group, since at least one column other than version has
366+
#' changed, and so is kept.
367+
#' @keywords internal
368+
#' @importFrom dplyr filter
369+
apply_compactify <- function(df, keys, tolerance = .Machine$double.eps^.5) {
370+
df %>%
371+
arrange(!!!keys) %>%
372+
filter(if_any(
373+
c(everything(), -version), # all non-version columns
374+
~ !is_locf(., tolerance)
375+
))
376+
}
377+
378+
#' get the entries that `compactify` would remove
379+
#' @keywords internal
380+
#' @importFrom dplyr filter if_all everything
381+
removed_by_compactify <- function(df, keys, tolerance) {
382+
df %>%
383+
arrange(!!!keys) %>%
384+
filter(if_all(
385+
c(everything(), -version),
386+
~ is_locf(., tolerance)
387+
)) # nolint: object_usage_linter
388+
}
389+
390+
#' Checks to see if a value in a vector is LOCF
391+
#' @description
392+
#' LOCF meaning last observation carried forward. lags the vector by 1, then
393+
#' compares with itself. For doubles it uses float comparison via
394+
#' [`dplyr::near`], otherwise it uses equality. `NA`'s and `NaN`'s are
395+
#' considered equal to themselves and each other.
396+
#' @importFrom dplyr lag if_else near
397+
#' @keywords internal
398+
is_locf <- function(vec, tolerance) { # nolint: object_usage_linter
399+
lag_vec <- dplyr::lag(vec)
400+
if (typeof(vec) == "double") {
401+
res <- if_else(
402+
!is.na(vec) & !is.na(lag_vec),
403+
near(vec, lag_vec, tol = tolerance),
404+
is.na(vec) & is.na(lag_vec)
405+
)
406+
return(res)
407+
} else {
408+
res <- if_else(
409+
!is.na(vec) & !is.na(lag_vec),
410+
vec == lag_vec,
411+
is.na(vec) & is.na(lag_vec)
412+
)
413+
return(res)
414+
}
415+
}
416+
417+
380418
#' `validate_epi_archive` ensures correctness of arguments fed to `as_epi_archive`.
381419
#'
382420
#' @rdname epi_archive

R/epi_df.R

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,6 @@ as_epi_df.tbl_df <- function(
242242
must be present in `x`."
243243
)
244244
}
245-
246245
if (lifecycle::is_present(geo_type)) {
247246
cli_warn("epi_archive constructor argument `geo_type` is now ignored. Consider removing.")
248247
}

0 commit comments

Comments
 (0)