Skip to content

revision analysis first draft #492

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
Aug 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: epiprocess
Title: Tools for basic signal processing in epidemiology
Version: 0.8.2
Version: 0.8.3
Authors@R: c(
person("Jacob", "Bien", role = "ctb"),
person("Logan", "Brooks", email = "[email protected]", role = c("aut", "cre")),
Expand Down Expand Up @@ -31,6 +31,7 @@ Imports:
data.table,
dplyr (>= 1.0.8),
genlasso,
glue,
ggplot2,
glue,
lifecycle (>= 1.0.1),
Expand Down Expand Up @@ -83,6 +84,7 @@ Collate:
'methods-epi_df.R'
'outliers.R'
'reexports.R'
'revision_analysis.R'
'slide.R'
'utils.R'
'utils_pipe.R'
12 changes: 12 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ export(new_epi_df)
export(next_after)
export(relocate)
export(rename)
export(revision_summary)
export(slice)
export(time_column_names)
export(ungroup)
Expand Down Expand Up @@ -111,6 +112,7 @@ importFrom(checkmate,vname)
importFrom(cli,cat_line)
importFrom(cli,cli_abort)
importFrom(cli,cli_inform)
importFrom(cli,cli_li)
importFrom(cli,cli_vec)
importFrom(cli,cli_warn)
importFrom(cli,format_message)
Expand All @@ -128,8 +130,11 @@ importFrom(data.table,set)
importFrom(data.table,setDF)
importFrom(data.table,setkeyv)
importFrom(dplyr,"%>%")
importFrom(dplyr,across)
importFrom(dplyr,all_of)
importFrom(dplyr,arrange)
importFrom(dplyr,bind_rows)
importFrom(dplyr,c_across)
importFrom(dplyr,dplyr_col_modify)
importFrom(dplyr,dplyr_reconstruct)
importFrom(dplyr,dplyr_row_slice)
Expand All @@ -142,11 +147,17 @@ importFrom(dplyr,group_vars)
importFrom(dplyr,groups)
importFrom(dplyr,if_all)
importFrom(dplyr,if_any)
importFrom(dplyr,if_else)
importFrom(dplyr,lag)
importFrom(dplyr,mutate)
importFrom(dplyr,near)
importFrom(dplyr,pick)
importFrom(dplyr,pull)
importFrom(dplyr,relocate)
importFrom(dplyr,rename)
importFrom(dplyr,select)
importFrom(dplyr,slice)
importFrom(dplyr,summarize)
importFrom(dplyr,tibble)
importFrom(dplyr,ungroup)
importFrom(ggplot2,autoplot)
Expand Down Expand Up @@ -178,6 +189,7 @@ importFrom(rlang,is_formula)
importFrom(rlang,is_function)
importFrom(rlang,is_missing)
importFrom(rlang,is_quosure)
importFrom(rlang,list2)
importFrom(rlang,missing_arg)
importFrom(rlang,new_function)
importFrom(rlang,quo_get_expr)
Expand Down
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat

- Added `complete.epi_df`, which fills in missing values in an `epi_df` with
`NA`s. Uses `tidyr::complete` underneath and preserves `epi_df` metadata.
- Inclusion of the function `revision_summary` to provide basic revision information for `epi_archive`s out of the box. (#492)

## Bug fixes

Expand Down
104 changes: 71 additions & 33 deletions R/archive.R
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,7 @@ NULL
#' value of `clobberable_versions_start` does not fully trust these empty
#' updates, and assumes that any version `>= max(x$version)` could be
#' clobbered.) If `nrow(x) == 0`, then this argument is mandatory.
#' @param compactify_tol double. the tolerance used to detect approximate equality for compactification
#' @return An `epi_archive` object.
#'
#' @importFrom data.table as.data.table key setkeyv
Expand Down Expand Up @@ -295,15 +296,16 @@ new_epi_archive <- function(
additional_metadata,
compactify,
clobberable_versions_start,
versions_end) {
versions_end,
compactify_tol = .Machine$double.eps^0.5) {
# Create the data table; if x was an un-keyed data.table itself,
# then the call to as.data.table() will fail to set keys, so we
# need to check this, then do it manually if needed
key_vars <- c("geo_value", "time_value", other_keys, "version")
DT <- as.data.table(x, key = key_vars) # nolint: object_name_linter
if (!identical(key_vars, key(DT))) setkeyv(DT, cols = key_vars)
data_table <- as.data.table(x, key = key_vars) # nolint: object_name_linter
if (!identical(key_vars, key(data_table))) setkeyv(data_table, cols = key_vars)

if (anyDuplicated(DT, by = key(DT)) != 0L) {
if (anyDuplicated(data_table, by = key(data_table)) != 0L) {
cli_abort("`x` must have one row per unique combination of the key variables. If you
have additional key variables other than `geo_value`, `time_value`, and
`version`, such as an age group column, please specify them in `other_keys`.
Expand All @@ -313,38 +315,17 @@ new_epi_archive <- function(
)
}

# Checks to see if a value in a vector is LOCF
is_locf <- function(vec) { # nolint: object_usage_linter
dplyr::if_else(!is.na(vec) & !is.na(dplyr::lag(vec)),
vec == dplyr::lag(vec),
is.na(vec) & is.na(dplyr::lag(vec))
)
}

# LOCF is defined by a row where all values except for the version
# differ from their respective lag values

# Checks for LOCF's in a data frame
rm_locf <- function(df) {
dplyr::filter(df, if_any(c(everything(), -version), ~ !is_locf(.))) # nolint: object_usage_linter
}

# Keeps LOCF values, such as to be printed
keep_locf <- function(df) {
dplyr::filter(df, if_all(c(everything(), -version), ~ is_locf(.))) # nolint: object_usage_linter
}

nrow_before_compactify <- nrow(data_table)
# Runs compactify on data frame
if (is.null(compactify) || compactify == TRUE) {
elim <- keep_locf(DT)
DT <- rm_locf(DT) # nolint: object_name_linter
compactified <- apply_compactify(data_table, key_vars, compactify_tol)
} else {
# Create empty data frame for nrow(elim) to be 0
elim <- tibble::tibble()
compactified <- data_table
}

# Warns about redundant rows
if (is.null(compactify) && nrow(elim) > 0) {
# Warns about redundant rows if the number of rows decreased, and we didn't
# explicitly say to compactify
if (is.null(compactify) && nrow(compactified) < nrow_before_compactify) {
elim <- removed_by_compactify(data_table, key_vars, compactify_tol)
warning_intro <- cli::format_inline(
"Found rows that appear redundant based on
last (version of each) observation carried forward;
Expand All @@ -366,7 +347,7 @@ new_epi_archive <- function(

structure(
list(
DT = DT,
DT = compactified,
geo_type = geo_type,
time_type = time_type,
additional_metadata = additional_metadata,
Expand All @@ -377,6 +358,63 @@ new_epi_archive <- function(
)
}

#' given a tibble as would be found in an epi_archive, remove duplicate entries.
#' @description
#' works by shifting all rows except the version, then comparing values to see
#' if they've changed. We need to arrange in descending order, but note that
#' we don't need to group, since at least one column other than version has
#' changed, and so is kept.
#' @keywords internal
#' @importFrom dplyr filter
apply_compactify <- function(df, keys, tolerance = .Machine$double.eps^.5) {
df %>%
arrange(!!!keys) %>%
filter(if_any(
c(everything(), -version), # all non-version columns
~ !is_locf(., tolerance)
))
}

#' get the entries that `compactify` would remove
#' @keywords internal
#' @importFrom dplyr filter if_all everything
removed_by_compactify <- function(df, keys, tolerance) {
df %>%
arrange(!!!keys) %>%
filter(if_all(
c(everything(), -version),
~ is_locf(., tolerance)
)) # nolint: object_usage_linter
}

#' Checks to see if a value in a vector is LOCF
#' @description
#' LOCF meaning last observation carried forward. lags the vector by 1, then
#' compares with itself. For doubles it uses float comparison via
#' [`dplyr::near`], otherwise it uses equality. `NA`'s and `NaN`'s are
#' considered equal to themselves and each other.
#' @importFrom dplyr lag if_else near
#' @keywords internal
is_locf <- function(vec, tolerance) { # nolint: object_usage_linter
lag_vec <- dplyr::lag(vec)
if (typeof(vec) == "double") {
res <- if_else(
!is.na(vec) & !is.na(lag_vec),
near(vec, lag_vec, tol = tolerance),
is.na(vec) & is.na(lag_vec)
)
return(res)
} else {
res <- if_else(
!is.na(vec) & !is.na(lag_vec),
vec == lag_vec,
is.na(vec) & is.na(lag_vec)
)
return(res)
}
}


#' `validate_epi_archive` ensures correctness of arguments fed to `as_epi_archive`.
#'
#' @rdname epi_archive
Expand Down
1 change: 0 additions & 1 deletion R/epi_df.R
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,6 @@ as_epi_df.tbl_df <- function(
must be present in `x`."
)
}

if (lifecycle::is_present(geo_type)) {
cli_warn("epi_archive constructor argument `geo_type` is now ignored. Consider removing.")
}
Expand Down
Loading
Loading