Skip to content

Commit d6a5d2b

Browse files
committed
review recs from logan
1 parent 7ae7a5d commit d6a5d2b

File tree

9 files changed

+243
-171
lines changed

9 files changed

+243
-171
lines changed

R/archive.R

Lines changed: 28 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,7 @@ NULL
240240
#' value of `clobberable_versions_start` does not fully trust these empty
241241
#' updates, and assumes that any version `>= max(x$version)` could be
242242
#' clobbered.) If `nrow(x) == 0`, then this argument is mandatory.
243+
#' @param compactify_tol double. the tolerance used to detect approximate equality for compactification
243244
#' @return An `epi_archive` object.
244245
#'
245246
#' @importFrom data.table as.data.table key setkeyv
@@ -295,15 +296,16 @@ new_epi_archive <- function(
295296
additional_metadata,
296297
compactify,
297298
clobberable_versions_start,
298-
versions_end) {
299+
versions_end,
300+
compactify_tol = .Machine$double.eps^0.5) {
299301
# Create the data table; if x was an un-keyed data.table itself,
300302
# then the call to as.data.table() will fail to set keys, so we
301303
# need to check this, then do it manually if needed
302304
key_vars <- c("geo_value", "time_value", other_keys, "version")
303-
DT <- as.data.table(x, key = key_vars) # nolint: object_name_linter
304-
if (!identical(key_vars, key(DT))) setkeyv(DT, cols = key_vars)
305+
data_table <- as.data.table(x, key = key_vars) # nolint: object_name_linter
306+
if (!identical(key_vars, key(data_table))) setkeyv(data_table, cols = key_vars)
305307

306-
if (anyDuplicated(DT, by = key(DT)) != 0L) {
308+
if (anyDuplicated(data_table, by = key(data_table)) != 0L) {
307309
cli_abort("`x` must have one row per unique combination of the key variables. If you
308310
have additional key variables other than `geo_value`, `time_value`, and
309311
`version`, such as an age group column, please specify them in `other_keys`.
@@ -313,15 +315,15 @@ new_epi_archive <- function(
313315
)
314316
}
315317

316-
nrow_before_compactify <- nrow(DT)
318+
nrow_before_compactify <- nrow(data_table)
317319
# Runs compactify on data frame
318320
if (is.null(compactify) || compactify == TRUE) {
319-
DT <- compactify_tibble(DT, key_vars) # nolint: object_name_linter
321+
data_table <- compactify_tibble(data_table, key_vars, compactify_tol)
320322
}
321323
# Warns about redundant rows if the number of rows decreased, and we didn't
322324
# explicitly say to compactify
323-
if (is.null(compactify) && nrow(DT) < nrow_before_compactify) {
324-
elim <- removed_by_compactify(DT, key_vars)
325+
if (is.null(compactify) && nrow(data_table) < nrow_before_compactify) {
326+
elim <- removed_by_compactify(data_table, key_vars, compactify_tol)
325327
warning_intro <- cli::format_inline(
326328
"Found rows that appear redundant based on
327329
last (version of each) observation carried forward;
@@ -343,7 +345,7 @@ new_epi_archive <- function(
343345

344346
structure(
345347
list(
346-
DT = DT,
348+
DT = data_table,
347349
geo_type = geo_type,
348350
time_type = time_type,
349351
additional_metadata = additional_metadata,
@@ -362,51 +364,51 @@ new_epi_archive <- function(
362364
#' changed, and so is kept.
363365
#' @keywords internal
364366
#' @importFrom dplyr filter
365-
compactify_tibble <- function(df, keys) {
367+
compactify_tibble <- function(df, keys, tolerance = .Machine$double.eps^.5) {
366368
df %>%
367369
arrange(!!!keys) %>%
368370
filter(if_any(
369371
c(everything(), -version), # all non-version columns
370-
~ !is_locf(.)
372+
~ !is_locf(., tolerance)
371373
))
372374
}
373375

374376
#' get the entries that `compactify_tibble` would remove
375377
#' @keywords internal
376378
#' @importFrom dplyr filter if_all everything
377-
removed_by_compactify <- function(df, keys) {
379+
removed_by_compactify <- function(df, keys, tolerance) {
378380
df %>%
379381
arrange(!!!keys) %>%
380-
filter(if_all(
382+
filter(if_any(
381383
c(everything(), -version),
382-
~ is_locf(.)
384+
~ is_locf(., tolerance)
383385
)) # nolint: object_usage_linter
384386
}
385387

386388
#' Checks to see if a value in a vector is LOCF
387389
#' @description
388390
#' LOCF meaning last observation carried forward. lags the vector by 1, then
389-
#' compares with itself. For floats it uses float comparison, otherwise it
390-
#' uses equality.
391-
#' `NA`'s are considered equal to `NA`'s, while nan's are not.
391+
#' compares with itself. For doubles it uses float comparison via
392+
#' [`dplyr::near`], otherwise it uses equality. `NA`'s and `NaN`'s are
393+
#' considered equal to themselves and each other.
392394
#' @importFrom dplyr lag if_else near
393395
#' @keywords internal
394-
is_locf <- function(vec) { # nolint: object_usage_linter
395-
lag_vec <- lag(vec)
396+
is_locf <- function(vec, tolerance) { # nolint: object_usage_linter
397+
lag_vec <- dplyr::lag(vec)
396398
if (typeof(vec) == "double") {
397-
if_else(
399+
res <- if_else(
398400
!is.na(vec) & !is.na(lag_vec),
399-
near(vec, lag_vec),
401+
near(vec, lag_vec, tol = tolerance),
400402
is.na(vec) & is.na(lag_vec)
401-
) %>%
402-
return()
403+
)
404+
return(res)
403405
} else {
404-
if_else(
406+
res <- if_else(
405407
!is.na(vec) & !is.na(lag_vec),
406408
vec == lag_vec,
407409
is.na(vec) & is.na(lag_vec)
408-
) %>%
409-
return()
410+
)
411+
return(res)
410412
}
411413
}
412414

0 commit comments

Comments
 (0)