diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 886055ae7..1381682d4 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -28,7 +28,7 @@ jobs: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} DELPHI_EPIDATA_KEY: ${{ secrets.SECRET_EPIPROCESS_GHACTIONS_DELPHI_EPIDATA_KEY }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: r-lib/actions/setup-pandoc@v2 @@ -42,19 +42,14 @@ jobs: needs: website - name: Build site - # - target_ref gets the ref from a different variable, depending on the event - # - override allows us to set the pkgdown mode and version_label - # - mode: release is the standard build mode, devel places the site in /dev - # - version_label: 'light' and 'success' are CSS labels for Bootswatch: Cosmo - # https://bootswatch.com/cosmo/ - # - we use pkgdown:::build_github_pages to build the site because of an issue in pkgdown - # https://github.com/r-lib/pkgdown/issues/2257 + # our versioning system+dev branch doesn't match the requirements for + # develop mode = auto run: | target_ref <- "${{ github.event_name == 'pull_request' && github.base_ref || github.ref }}" override <- if (target_ref == "main" || target_ref == "refs/heads/main") { - list(development = list(mode = "release", version_label = "light")) + list(development = list(mode = "release")) } else if (target_ref == "dev" || target_ref == "refs/heads/dev") { - list(development = list(mode = "devel", version_label = "success")) + list(development = list(mode = "devel")) } else { stop("Unexpected target_ref: ", target_ref) } @@ -67,7 +62,7 @@ jobs: - name: Deploy to GitHub pages 🚀 if: github.event_name != 'pull_request' - uses: JamesIves/github-pages-deploy-action@v4.4.1 + uses: JamesIves/github-pages-deploy-action@v4.5.0 with: clean: false branch: gh-pages diff --git a/DESCRIPTION b/DESCRIPTION index fd09aa57e..e894f0999 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,10 +1,10 @@ Package: epiprocess Type: Package Title: Tools for basic signal processing in epidemiology -Version: 0.9.6 +Version: 0.10.0 Authors@R: c( person("Jacob", "Bien", role = "ctb"), - person("Logan", "Brooks", , "lcbrooks@andrew.cmu.edu", role = c("aut", "cre")), + person("Logan", "Brooks", , "lcbrooks+github@andrew.cmu.edu", role = c("aut", "cre")), person("Rafael", "Catoia", role = "ctb"), person("Nat", "DeFries", role = "ctb"), person("Daniel", "McDonald", role = "aut"), @@ -13,8 +13,9 @@ Authors@R: c( person("Chloe", "You", role = "ctb"), person("Quang", "Nguyen", role = "ctb"), person("Evan", "Ray", role = "aut"), - person("Dmitry", "Shemetov", role = "ctb"), + person("Dmitry", "Shemetov", role = "aut"), person("Ryan", "Tibshirani", role = "aut"), + person("David", "Weber", , "davidweb@andrew.cmu.edu", role = "ctb"), person("Lionel", "Henry", role = "ctb", comment = "Author of included rlang fragments"), person("Hadley", "Wickham", role = "ctb", @@ -72,10 +73,12 @@ Suggests: VignetteBuilder: knitr Remotes: + cmu-delphi/delphidocs, cmu-delphi/epidatasets, cmu-delphi/epidatr, glmgen/genlasso, reconverse/outbreaks +Config/Needs/website: cmu-delphi/delphidocs Config/testthat/edition: 3 Config/testthat/parallel: true Copyright: file inst/COPYRIGHTS diff --git a/NAMESPACE b/NAMESPACE index 1fd65d371..1f5180fb7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -11,6 +11,7 @@ S3method(arrange_row_canonical,default) S3method(arrange_row_canonical,epi_df) S3method(as_epi_df,data.frame) S3method(as_epi_df,epi_df) +S3method(as_epi_df,grouped_df) S3method(as_epi_df,tbl_df) S3method(as_epi_df,tbl_ts) S3method(as_tibble,epi_df) @@ -108,12 +109,16 @@ importFrom(checkmate,assert) importFrom(checkmate,assert_character) importFrom(checkmate,assert_class) importFrom(checkmate,assert_data_frame) +importFrom(checkmate,assert_false) importFrom(checkmate,assert_function) importFrom(checkmate,assert_int) importFrom(checkmate,assert_list) importFrom(checkmate,assert_logical) importFrom(checkmate,assert_numeric) importFrom(checkmate,assert_scalar) +importFrom(checkmate,assert_string) +importFrom(checkmate,assert_subset) +importFrom(checkmate,assert_tibble) importFrom(checkmate,checkInt) importFrom(checkmate,check_atomic) importFrom(checkmate,check_data_frame) @@ -163,6 +168,7 @@ importFrom(dplyr,groups) importFrom(dplyr,if_all) importFrom(dplyr,if_any) importFrom(dplyr,if_else) +importFrom(dplyr,is_grouped_df) importFrom(dplyr,lag) importFrom(dplyr,mutate) importFrom(dplyr,near) @@ -176,6 +182,7 @@ importFrom(dplyr,summarize) importFrom(dplyr,tibble) importFrom(dplyr,ungroup) importFrom(ggplot2,autoplot) +importFrom(glue,glue) importFrom(lifecycle,deprecated) importFrom(lubridate,as.period) importFrom(lubridate,days) @@ -189,7 +196,6 @@ importFrom(rlang,"%||%") importFrom(rlang,.data) importFrom(rlang,.env) importFrom(rlang,arg_match) -importFrom(rlang,as_label) importFrom(rlang,caller_arg) importFrom(rlang,caller_env) importFrom(rlang,check_dots_empty) @@ -199,6 +205,7 @@ importFrom(rlang,env) importFrom(rlang,expr_label) importFrom(rlang,f_env) importFrom(rlang,f_rhs) +importFrom(rlang,is_bare_integerish) importFrom(rlang,is_environment) importFrom(rlang,is_formula) importFrom(rlang,is_function) @@ -207,7 +214,7 @@ importFrom(rlang,is_quosure) importFrom(rlang,list2) importFrom(rlang,missing_arg) importFrom(rlang,new_function) -importFrom(rlang,quo_get_expr) +importFrom(rlang,quo_get_env) importFrom(rlang,quo_is_missing) importFrom(rlang,sym) importFrom(rlang,syms) @@ -232,3 +239,5 @@ importFrom(tidyselect,starts_with) importFrom(tsibble,as_tsibble) importFrom(utils,capture.output) importFrom(utils,tail) +importFrom(vctrs,vec_data) +importFrom(vctrs,vec_equal) diff --git a/NEWS.md b/NEWS.md index 8ab77ee47..ba6826da2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -14,12 +14,11 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat with `covid`. The data set previously named `jhu_confirmed_cumulative_num` has been removed from the package, but a renamed version is has been removed from the package, but a renamed version is still available in `epidatasets`. - -## Bug fixes - -- Removed `.window_size = 1` default from `epi_slide_{mean,sum,opt}`; this - argument is now mandatory, and should nearly always be greater than 1 except - for testing purposes. +- `epi_slide_{sum,mean,opt}` have improved default output column names, and + additional arguments for specifying names: `.prefix`, `.suffix`, + `.new_col_names`. To obtain the old naming behavior, use `.prefix = + "slide_value_"`. +- `as_epi_df` now removes any grouping that `x` had applied. ## Improvements @@ -29,6 +28,19 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat - Improved validation of `.window_size` arguments. - Rewrote a lot of the package documentation to be more consistent and informative. Simplified and streamlined the vignettes. +- `epi_slide_{sum,mean,opt}` on ungrouped `epi_df`s will now temporarily group + by `geo_value` and any `other_keys` for the slide operation rather than raise + an error about duplicated time values. `epi_slide`'s analogous automatic + grouping has been made temporary in order to match. +- Improved speed of key-uniqueness checks. + +## Bug fixes + +- Removed `.window_size = 1` default from `epi_slide_{mean,sum,opt}`; this + argument is now mandatory, and should nearly always be greater than 1 except + for testing purposes. +- Fixed `epi_slide_{sum,mean,opt}` raising an error on certain tidyselect + expressions. ## Cleanup diff --git a/R/epi_df.R b/R/epi_df.R index bcf9e56fd..6cae22dd2 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -174,7 +174,7 @@ NULL #' @param other_keys If your tibble has additional keys, be sure to specify them #' as a character vector here (typical examples are "age" or sub-geographies). #' @param ... Additional arguments passed to methods. -#' @return An `epi_df` object. +#' @return * Of `new_epi_df()`: an `epi_df` #' #' @export new_epi_df <- function(x = tibble::tibble(geo_value = character(), time_value = as.Date(integer())), @@ -205,6 +205,8 @@ new_epi_df <- function(x = tibble::tibble(geo_value = character(), time_value = #' to be converted #' @param ... used for specifying column names, as in [`dplyr::rename`]. For #' example, `geo_value = STATEFP, time_value = end_date`. +#' @return * Of `as_epi_df()`: an (ungrouped) `epi_df` +#' #' @export as_epi_df <- function(x, ...) { UseMethod("as_epi_df") @@ -215,6 +217,7 @@ as_epi_df <- function(x, ...) { #' @method as_epi_df epi_df #' @export as_epi_df.epi_df <- function(x, ...) { + x <- ungroup(x) return(x) } @@ -232,7 +235,6 @@ as_epi_df.tbl_df <- function( as_of, other_keys = character(), ...) { - # possible standard substitutions for time_value x <- rename(x, ...) x <- guess_column_name(x, "time_value", time_column_names()) x <- guess_column_name(x, "geo_value", geo_column_names()) @@ -277,26 +279,32 @@ as_epi_df.tbl_df <- function( } assert_character(other_keys) + assert_subset(other_keys, names(x)) + # Fix up if given more than just other keys, at least until epipredict#428 + # merged: + other_keys <- other_keys[!other_keys %in% c("geo_value", "time_value")] if (".time_value_counts" %in% other_keys) { cli_abort("as_epi_df: `other_keys` can't include \".time_value_counts\"") } - duplicated_time_values <- x %>% - group_by(across(all_of(c("geo_value", "time_value", other_keys)))) %>% - filter(dplyr::n() > 1) %>% - ungroup() - if (nrow(duplicated_time_values) > 0) { - bad_data <- capture.output(duplicated_time_values) - cli_abort( - "as_epi_df: some groups in the data have duplicated time values. epi_df requires a unique time_value per group.", - body = c("Sample groups:", bad_data) - ) - } + assert(check_ukey_unique(x, c("geo_value", other_keys, "time_value"), c( + ">" = "If this is line list data, convert it to counts/rates first.", + ">" = "If this contains a demographic breakdown, check that you have + specified appropriate `other_keys`" # . from checkmate + ))) new_epi_df(x, geo_type, time_type, as_of, other_keys) } +#' @rdname epi_df +#' @order 1 +#' @method as_epi_df grouped_df +#' @export +as_epi_df.grouped_df <- function(x, ...) { + as_epi_df(ungroup(x), ...) +} + #' @rdname epi_df #' @order 1 #' @method as_epi_df data.frame @@ -320,9 +328,11 @@ as_epi_df.tbl_ts <- function(x, as_of, other_keys = character(), ...) { #' Test for `epi_df` format #' #' @param x An object. -#' @return `TRUE` if the object inherits from `epi_df`. +#' @return * Of `is_epi_df`: `TRUE` if the object inherits from `epi_df`, +#' otherwise `FALSE`. #' #' @rdname epi_df +#' @order 1 #' @export is_epi_df <- function(x) { inherits(x, "epi_df") diff --git a/R/epiprocess-package.R b/R/epiprocess-package.R index e28cec0f3..675d000db 100644 --- a/R/epiprocess-package.R +++ b/R/epiprocess-package.R @@ -5,7 +5,11 @@ #' @import epidatasets #' @importFrom checkmate anyInfinite anyMissing assert assert_character #' @importFrom checkmate assert_class assert_data_frame assert_int assert_list +#' @importFrom checkmate assert_false #' @importFrom checkmate assert_logical assert_numeric assert_scalar checkInt +#' @importFrom checkmate assert_string +#' @importFrom checkmate assert_subset +#' @importFrom checkmate assert_tibble #' @importFrom checkmate check_atomic check_data_frame expect_class test_int #' @importFrom checkmate check_names #' @importFrom checkmate test_subset test_set_equal vname @@ -13,9 +17,14 @@ #' @importFrom data.table as.data.table #' @importFrom data.table key #' @importFrom data.table setkeyv +#' @importFrom dplyr arrange +#' @importFrom dplyr is_grouped_df #' @importFrom dplyr select #' @importFrom lifecycle deprecated #' @importFrom rlang %||% +#' @importFrom rlang is_bare_integerish +#' @importFrom vctrs vec_data +#' @importFrom vctrs vec_equal ## usethis namespace: end NULL @@ -24,5 +33,5 @@ utils::globalVariables(c( "fitted", ".response", "geo_value", "time_value", "value", ".real", "lag", "max_value", "min_value", "median_value", "spread", "rel_spread", "time_to", - "time_near_latest", "n_revisions" + "time_near_latest", "n_revisions", "min_lag", "max_lag" )) diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 04bb841f0..9ad456735 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -688,10 +688,10 @@ epix_detailed_restricted_mutate <- function(.data, ...) { #' requested `.versions`) for rows having a `time_value` of at least `.version #' - before`. Otherwise, the slide computation will be passed only the most #' recent `version` for every unique `time_value`. Default is `FALSE`. -#' @return A tibble whose columns are: the grouping variables, `time_value`, -#' containing the reference time values for the slide computation, and a -#' column named according to the `.new_col_name` argument, containing the slide -#' values. +#' @return A tibble whose columns are: the grouping variables (if any), +#' `time_value`, containing the reference time values for the slide +#' computation, and a column named according to the `.new_col_name` argument, +#' containing the slide values. It will be grouped by the grouping variables. #' #' @details A few key distinctions between the current function and `epi_slide()`: #' 1. In `.f` functions for `epix_slide`, one should not assume that the input diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index 84a75e460..6e19f7531 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -256,9 +256,9 @@ group_modify.epi_df <- function(.data, .f, ..., .keep = FALSE) { dplyr::dplyr_reconstruct(NextMethod(), .data) } -#' Complete epi_df +#' "Complete" an `epi_df`, adding missing rows and/or replacing `NA`s #' -#' A `tidyr::complete()` analogue for `epi_df`` objects. This function +#' A `tidyr::complete()` analogue for `epi_df` objects. This function #' can be used, for example, to add rows for missing combinations #' of `geo_value` and `time_value`, filling other columns with `NA`s. #' See the examples for usage details. diff --git a/R/reexports.R b/R/reexports.R index e091ce120..9a33e94bf 100644 --- a/R/reexports.R +++ b/R/reexports.R @@ -79,104 +79,36 @@ ggplot2::autoplot # epidatasets ------------------------------------------------------------------- -#' @inherit epidatasets::cases_deaths_subset description source references title -#' @inheritSection epidatasets::cases_deaths_subset Data dictionary -#' @examples -#' # Since this is a re-exported dataset, it cannot be loaded using -#' # the `data()` function. `data()` looks for a file of the same name -#' # in the `data/` directory, which doesn't exist in this package. -#' # works -#' epiprocess::cases_deaths_subset +#' @rdname epidatasets_reexports #' -#' # works -#' library(epiprocess) -#' cases_deaths_subset +#' @title Selected example data sets from `epidatasets` #' -#' # fails -#' \dontrun{ -#' data(cases_deaths_subset, package = "epiprocess") -#' } +#' @description Data sets re-exported from `epidatasets`; please see +#' documentation for each of these objects in `epidatasets`. +#' +#' A brief description of the format of each of the objects above are described +#' in matching order below. +#' +#' @keywords internal #' @export delayedAssign("cases_deaths_subset", epidatasets::cases_deaths_subset) -#' @inherit epidatasets::covid_incidence_county_subset description source references title -#' @inheritSection epidatasets::covid_incidence_county_subset Data dictionary -#' @examples -#' # Since this is a re-exported dataset, it cannot be loaded using -#' # the `data()` function. `data()` looks for a file of the same name -#' # in the `data/` directory, which doesn't exist in this package. -#' # works -#' epiprocess::covid_incidence_county_subset -#' -#' # works -#' library(epiprocess) -#' covid_incidence_county_subset -#' -#' # fails -#' \dontrun{ -#' data(covid_incidence_county_subset, package = "epiprocess") -#' } +#' @rdname epidatasets_reexports +#' @keywords internal #' @export delayedAssign("covid_incidence_county_subset", epidatasets::covid_incidence_county_subset) -#' @inherit epidatasets::covid_incidence_outliers description source references title -#' @inheritSection epidatasets::covid_incidence_outliers Data dictionary -#' @examples -#' # Since this is a re-exported dataset, it cannot be loaded using -#' # the `data()` function. `data()` looks for a file of the same name -#' # in the `data/` directory, which doesn't exist in this package. -#' # works -#' epiprocess::covid_incidence_outliers -#' -#' # works -#' library(epiprocess) -#' covid_incidence_outliers -#' -#' # fails -#' \dontrun{ -#' data(covid_incidence_outliers, package = "epiprocess") -#' } +#' @rdname epidatasets_reexports +#' @keywords internal #' @export delayedAssign("covid_incidence_outliers", epidatasets::covid_incidence_outliers) -#' @inherit epidatasets::archive_cases_dv_subset description source references title -#' @inheritSection epidatasets::archive_cases_dv_subset Data dictionary -#' @examples -#' # Since this is a re-exported dataset, it cannot be loaded using -#' # the `data()` function. `data()` looks for a file of the same name -#' # in the `data/` directory, which doesn't exist in this package. -#' # works -#' epiprocess::archive_cases_dv_subset -#' -#' # works -#' library(epiprocess) -#' archive_cases_dv_subset -#' -#' # fails -#' \dontrun{ -#' data(archive_cases_dv_subset, package = "epiprocess") -#' } -#' +#' @rdname epidatasets_reexports +#' @keywords internal #' @export delayedAssign("archive_cases_dv_subset", epidatasets::archive_cases_dv_subset) -#' @inherit epidatasets::covid_case_death_rates_extended description source references title -#' @inheritSection epidatasets::covid_case_death_rates_extended Data dictionary -#' @examples -#' # Since this is a re-exported dataset, it cannot be loaded using -#' # the `data()` function. `data()` looks for a file of the same name -#' # in the `data/` directory, which doesn't exist in this package. -#' # works -#' epiprocess::covid_case_death_rates_extended -#' -#' # works -#' library(epiprocess) -#' covid_case_death_rates_extended -#' -#' # fails -#' \dontrun{ -#' data(covid_case_death_rates_extended, package = "epiprocess") -#' } -#' +#' @rdname epidatasets_reexports +#' @keywords internal #' @export delayedAssign("covid_case_death_rates_extended", epidatasets::covid_case_death_rates_extended) diff --git a/R/slide.R b/R/slide.R index f936916eb..761639d44 100644 --- a/R/slide.R +++ b/R/slide.R @@ -6,8 +6,8 @@ #' as follows: #' #' ``` -#' # Create new column `cases_7dm` that contains a 7-day trailing median of cases -#' epi_slide(edf, cases_7dav = median(cases), .window_size = 7) +#' # Create new column `cases_7dmed` that contains a 7-day trailing median of cases +#' epi_slide(edf, cases_7dmed = median(cases), .window_size = 7) #' ``` #' #' For two very common use cases, we provide optimized functions that are much @@ -70,6 +70,8 @@ #' @export #' @seealso [`epi_slide_opt`] for optimized slide functions #' @examples +#' library(dplyr) +#' #' # Get the 7-day trailing standard deviation of cases and the 7-day trailing mean of cases #' cases_deaths_subset %>% #' epi_slide( @@ -77,44 +79,72 @@ #' cases_7dav = mean(cases, na.rm = TRUE), #' .window_size = 7 #' ) %>% -#' dplyr::select(geo_value, time_value, cases, cases_7sd, cases_7dav) +#' select(geo_value, time_value, cases, cases_7sd, cases_7dav) +#' # Note that epi_slide_mean could be used to more quickly calculate cases_7dav. +#' +#' # In addition to the [`dplyr::mutate`]-like syntax, you can feed in a function or +#' # formula in a way similar to [`dplyr::group_modify`]: +#' my_summarizer <- function(window_data) { +#' window_data %>% +#' summarize( +#' cases_7sd = sd(cases, na.rm = TRUE), +#' cases_7dav = mean(cases, na.rm = TRUE) +#' ) +#' } +#' cases_deaths_subset %>% +#' epi_slide( +#' ~ my_summarizer(.x), +#' .window_size = 7 +#' ) %>% +#' select(geo_value, time_value, cases, cases_7sd, cases_7dav) +#' #' -#' # The same as above, but unpacking using an unnamed data.frame with a formula +#' +#' +#' +#' #### Advanced: #### +#' +#' # The tidyverse supports ["packing"][tidyr::pack] multiple columns into a +#' # single tibble-type column contained within some larger tibble. Like dplyr, +#' # we normally don't pack output columns together. However, packing behavior can be turned on +#' # by providing a name for a tibble-type output: #' cases_deaths_subset %>% #' epi_slide( -#' ~ data.frame( +#' slide_packed = tibble( #' cases_7sd = sd(.x$cases, na.rm = TRUE), #' cases_7dav = mean(.x$cases, na.rm = TRUE) #' ), #' .window_size = 7 #' ) %>% -#' dplyr::select(geo_value, time_value, cases, cases_7sd, cases_7dav) -#' -#' # The same as above, but packing using a named data.frame with a tidy evaluation -#' # expression +#' select(geo_value, time_value, cases, slide_packed) #' cases_deaths_subset %>% #' epi_slide( -#' slide_packed = data.frame( +#' ~ tibble( #' cases_7sd = sd(.x$cases, na.rm = TRUE), #' cases_7dav = mean(.x$cases, na.rm = TRUE) #' ), +#' .new_col_name = "slide_packed", #' .window_size = 7 #' ) %>% -#' dplyr::select(geo_value, time_value, cases, slide_packed) +#' select(geo_value, time_value, cases, slide_packed) #' -#' # nested new columns +#' # You can also get ["nested"][tidyr::nest] format by wrapping your results in +#' # a list: #' cases_deaths_subset %>% #' group_by(geo_value) %>% #' epi_slide( #' function(x, g, t) { -#' data.frame( +#' list(tibble( #' cases_7sd = sd(x$cases, na.rm = TRUE), #' cases_7dav = mean(x$cases, na.rm = TRUE) -#' ) +#' )) #' }, #' .window_size = 7 #' ) %>% -#' dplyr::select(geo_value, time_value, cases, cases_7sd, cases_7dav) +#' ungroup() %>% +#' select(geo_value, time_value, slide_value) +#' +#' #' #' # Use the geo_value or the ref_time_value in the slide computation #' cases_deaths_subset %>% @@ -156,15 +186,16 @@ epi_slide <- function( # Validate arguments assert_class(.x, "epi_df") - if (checkmate::test_class(.x, "grouped_df")) { + .x_orig_groups <- groups(.x) + if (inherits(.x, "grouped_df")) { expected_group_keys <- .x %>% key_colnames(exclude = "time_value") %>% sort() if (!identical(.x %>% group_vars() %>% sort(), expected_group_keys)) { cli_abort( - "epi_slide: `.x` must be either grouped by {expected_group_keys}. (Or you can just ungroup - `.x` and we'll do this grouping automatically.) You may need to aggregate your data first, - see aggregate_epi_df().", + "`.x` must be either grouped by {expected_group_keys} or ungrouped; if the latter, + we'll temporarily group by {expected_group_keys} for this operation. You may need + to aggregate your data first; see sum_groups_epi_df().", class = "epiprocess__epi_slide__invalid_grouping" ) } @@ -228,18 +259,7 @@ epi_slide <- function( assert_logical(.all_rows, len = 1) # Check for duplicated time values within groups - duplicated_time_values <- .x %>% - group_epi_df() %>% - filter(dplyr::n() > 1) %>% - ungroup() - if (nrow(duplicated_time_values) > 0) { - bad_data <- capture.output(duplicated_time_values) - cli_abort( - "as_epi_df: some groups in a resulting dplyr computation have duplicated time values. - epi_df requires a unique time_value per group.", - body = c("Sample groups:", bad_data) - ) - } + assert(check_ukey_unique(ungroup(.x), c(group_vars(.x), "time_value"))) # Begin handling completion. This will create a complete time index between # the smallest and largest time values in the data. This is used to ensure @@ -270,7 +290,6 @@ epi_slide <- function( # `epi_slide_one_group`. # - `...` from top of `epi_slide` are forwarded to `.f` here through # group_modify and through the lambda. - .x_groups <- groups(.x) result <- group_map( .x, .f = function(.data_group, .group_key, ...) { @@ -294,7 +313,7 @@ epi_slide <- function( filter(.real) %>% select(-.real) %>% arrange_col_canonical() %>% - group_by(!!!.x_groups) + group_by(!!!.x_orig_groups) # If every group in epi_slide_one_group takes the # length(available_ref_time_values) == 0 branch then we end up here. @@ -537,7 +556,7 @@ get_before_after_from_window <- function(window_size, align, time_type) { #' #' @template basic-slide-params #' @param .col_names <[`tidy-select`][dplyr_tidy_select]> An unquoted column -#' name(e.g., `cases`), multiple column names (e.g., `c(cases, deaths)`), +#' name (e.g., `cases`), multiple column names (e.g., `c(cases, deaths)`), #' [other tidy-select expression][tidyselect::language], or a vector of #' characters (e.g. `c("cases", "deaths")`). Variable names can be used as if #' they were positions in the data frame, so expressions like `x:y` can be @@ -559,13 +578,41 @@ get_before_after_from_window <- function(window_size, align, time_type) { #' `epi_slide_mean` and `epi_slide_sum`) take care of window completion #' automatically to prevent associated errors. #' @param ... Additional arguments to pass to the slide computation `.f`, for -#' example, `algo` or `na.rm` in data.table functions. You don't need to -#' specify `.x`, `.window_size`, or `.align` (or `before`/`after` for slider -#' functions). +#' example, `algo` or `na.rm` in data.table functions. You don't need to +#' specify `.x`, `.window_size`, or `.align` (or `before`/`after` for slider +#' functions). +#' @param .prefix Optional [`glue::glue`] format string; name the slide result +#' column(s) by attaching this prefix to the corresponding input column(s). +#' Some shorthand is supported for basing the output names on `.window_size` +#' or other arguments; see "Prefix and suffix shorthand" below. +#' @param .suffix Optional [`glue::glue`] format string; like `.prefix`. The +#' default naming behavior is equivalent to `.suffix = +#' "_{.n}{.time_unit_abbr}{.align_abbr}{.f_abbr}"`. Can be used in combination +#' with `.prefix`. +#' @param .new_col_names Optional character vector with length matching the +#' number of input columns from `.col_names`; name the slide result column(s) +#' with these names. Cannot be used in combination with `.prefix` and/or +#' `.suffix`. +#' +#' @section Prefix and suffix shorthand: +#' +#' [`glue::glue`] format strings specially interpret content within curly +#' braces. E.g., `glue::glue("ABC{2 + 2}")` evaluates to `"ABC4"`. For `.prefix` +#' and `.suffix`, we provide `glue` with some additional variable bindings: +#' +#' - `{.n}` will be the number of time steps in the computation +#' corresponding to the `.window_size`. +#' - `{.time_unit_abbr}` will be a lower-case letter corresponding to the +#' `time_type` of `.x` +#' - `{.align_abbr}` will be `""` if `.align` is the default of `"right"`; +#' otherwise, it will be the first letter of `.align` +#' - `{.f_abbr}` will be a character vector containing a short abbreviation +#' for `.f` factoring in the input column type(s) for `.col_names` #' #' @importFrom dplyr bind_rows mutate %>% arrange tibble select all_of -#' @importFrom rlang enquo quo_get_expr as_label expr_label caller_arg +#' @importFrom rlang enquo expr_label caller_arg quo_get_env #' @importFrom tidyselect eval_select +#' @importFrom glue glue #' @importFrom purrr map map_lgl #' @importFrom data.table frollmean frollsum frollapply #' @importFrom lubridate as.period @@ -574,25 +621,59 @@ get_before_after_from_window <- function(window_size, align, time_type) { #' @export #' @seealso [`epi_slide`] for the more general slide function #' @examples -#' # Compute a 7-day trailing average on cases. +#' library(dplyr) +#' +#' # Add a column (`cases_7dsum`) containing a 7-day trailing sum on `cases`: #' cases_deaths_subset %>% -#' group_by(geo_value) %>% -#' epi_slide_opt(cases, .f = data.table::frollmean, .window_size = 7) %>% -#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) +#' select(geo_value, time_value, cases) %>% +#' epi_slide_sum(cases, .window_size = 7) +#' +#' # Add a column (`cases_rate_7dav`) containing a 7-day trailing average on `case_rate`: +#' covid_case_death_rates_extended %>% +#' epi_slide_mean(case_rate, .window_size = 7) +#' +#' # Use a less common specialized slide function: +#' cases_deaths_subset %>% +#' epi_slide_opt(cases, slider::slide_min, .window_size = 7) #' -#' # Same as above, but adjust `frollmean` settings for speed, accuracy, and -#' # to allow partially-missing windows. +#' # Specify output column names and/or a naming scheme: +#' cases_deaths_subset %>% +#' select(geo_value, time_value, cases) %>% +#' group_by(geo_value) %>% +#' epi_slide_sum(cases, .window_size = 7, .new_col_names = "case_sum") %>% +#' ungroup() #' cases_deaths_subset %>% +#' select(geo_value, time_value, cases) %>% #' group_by(geo_value) %>% -#' epi_slide_opt( -#' cases, -#' .f = data.table::frollmean, .window_size = 7, -#' algo = "exact", hasNA = TRUE, na.rm = TRUE +#' epi_slide_sum(cases, .window_size = 7, .prefix = "sum_") %>% +#' ungroup() +#' +#' # Additional settings can be sent to the {data.table} and {slider} functions +#' # via `...`. This example passes some arguments to `frollmean` settings for +#' # speed, accuracy, and to allow partially-missing windows: +#' covid_case_death_rates_extended %>% +#' epi_slide_mean( +#' case_rate, +#' .window_size = 7, +#' na.rm = TRUE, algo = "exact", hasNA = TRUE +#' ) +#' +#' # If the more specialized possibilities for `.f` don't cover your needs, you +#' # can use `epi_slide_opt` with `.f = data.table::frollapply` to apply a +#' # custom function at the cost of more computation time. See also `epi_slide` +#' # if you need something even more general. +#' cases_deaths_subset %>% +#' select(geo_value, time_value, case_rate_7d_av, death_rate_7d_av) %>% +#' epi_slide_opt(c(case_rate_7d_av, death_rate_7d_av), +#' data.table::frollapply, +#' FUN = median, .window_size = 28, +#' .suffix = "_{.n}{.time_unit_abbr}_median" #' ) %>% -#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) +#' print(n = 40) epi_slide_opt <- function( .x, .col_names, .f, ..., .window_size = NULL, .align = c("right", "center", "left"), + .prefix = NULL, .suffix = NULL, .new_col_names = NULL, .ref_time_values = NULL, .all_rows = FALSE) { assert_class(.x, "epi_df") @@ -620,7 +701,7 @@ epi_slide_opt <- function( if ("new_col_name" %in% provided_args || ".new_col_name" %in% provided_args) { cli::cli_abort( "epi_slide_opt: the argument `new_col_name` is not supported for `epi_slide_opt`. If you want to customize - the output column names, use `dplyr::rename` after the slide.", + the output column names, use `.prefix =`, `.suffix =`, or `.new_col_**names** =`.", class = "epiprocess__epi_slide_opt__new_name_not_supported" ) } @@ -632,33 +713,71 @@ epi_slide_opt <- function( ) } + assert_class(.x, "epi_df") + .x_orig_groups <- groups(.x) + if (inherits(.x, "grouped_df")) { + expected_group_keys <- .x %>% + key_colnames(exclude = "time_value") %>% + sort() + if (!identical(.x %>% group_vars() %>% sort(), expected_group_keys)) { + cli_abort( + "`.x` must be either grouped by {expected_group_keys} or ungrouped; if the latter, + we'll temporarily group by {expected_group_keys} for this operation. You may need + to aggregate your data first; see sum_groups_epi_df().", + class = "epiprocess__epi_slide_opt__invalid_grouping" + ) + } + } else { + .x <- group_epi_df(.x, exclude = "time_value") + } if (nrow(.x) == 0L) { cli_abort( c( "input data `.x` unexpectedly has 0 rows", "i" = "If this computation is occuring within an `epix_slide` call, - check that `epix_slide` `.versions` argument was set appropriately" + check that `epix_slide` `.versions` argument was set appropriately + so that you don't get any completely-empty snapshots" ), class = "epiprocess__epi_slide_opt__0_row_input", epiprocess__x = .x ) } + # Check for duplicated time values within groups + assert(check_ukey_unique(ungroup(.x), c(group_vars(.x), "time_value"))) + + # The position of a given column can be differ between input `.x` and + # `.data_group` since the grouping step by default drops grouping columns. + # To avoid rerunning `eval_select` for every `.data_group`, convert + # positions of user-provided `col_names` into string column names. We avoid + # using `names(pos)` directly for robustness and in case we later want to + # allow users to rename fields via tidyselection. + col_names_quo <- enquo(.col_names) + pos <- eval_select(col_names_quo, data = .x, allow_rename = FALSE) + col_names_chr <- names(.x)[pos] + # Check that slide function `.f` is one of those short-listed from - # `data.table` and `slider` (or a function that has the exact same - # definition, e.g. if the function has been reexported or defined - # locally). - if (any(map_lgl( - list(frollmean, frollsum, frollapply), - ~ identical(.f, .x) - ))) { - f_from_package <- "data.table" - } else if (any(map_lgl( - list(slide_sum, slide_prod, slide_mean, slide_min, slide_max, slide_all, slide_any), - ~ identical(.f, .x) - ))) { - f_from_package <- "slider" - } else { + # `data.table` and `slider` (or a function that has the exact same definition, + # e.g. if the function has been reexported or defined locally). Extract some + # metadata. `namer` will be mapped over columns (.x will be a column, not the + # entire edf). + f_possibilities <- + tibble::tribble( + ~f, ~package, ~namer, + frollmean, "data.table", ~ if (is.logical(.x)) "prop" else "av", + frollsum, "data.table", ~ if (is.logical(.x)) "count" else "sum", + frollapply, "data.table", ~"slide", + slide_sum, "slider", ~ if (is.logical(.x)) "count" else "sum", + slide_prod, "slider", ~"prod", + slide_mean, "slider", ~ if (is.logical(.x)) "prop" else "av", + slide_min, "slider", ~"min", + slide_max, "slider", ~"max", + slide_all, "slider", ~"all", + slide_any, "slider", ~"any", + ) + f_info <- f_possibilities %>% + filter(map_lgl(.data$f, ~ identical(.f, .x))) + if (nrow(f_info) == 0L) { # `f` is from somewhere else and not supported cli_abort( c( @@ -672,6 +791,13 @@ epi_slide_opt <- function( epiprocess__f = .f ) } + if (nrow(f_info) > 1L) { + cli_abort('epiprocess internal error: looking up `.f` in table of possible + functions yielded multiple matches. Please report it using "New + issue" at https://github.com/cmu-delphi/epiprocess/issues, using + reprex::reprex to provide a minimal reproducible example.') + } + f_from_package <- f_info$package user_provided_rtvs <- !is.null(.ref_time_values) if (!user_provided_rtvs) { @@ -702,26 +828,72 @@ epi_slide_opt <- function( validate_slide_window_arg(.window_size, time_type) window_args <- get_before_after_from_window(.window_size, .align, time_type) + # Handle output naming + if ((!is.null(.prefix) || !is.null(.suffix)) && !is.null(.new_col_names)) { + cli_abort( + "Can't use both .prefix/.suffix and .new_col_names at the same time.", + class = "epiprocess__epi_slide_opt_incompatible_naming_args" + ) + } + assert_string(.prefix, null.ok = TRUE) + assert_string(.suffix, null.ok = TRUE) + assert_character(.new_col_names, len = length(col_names_chr), null.ok = TRUE) + if (is.null(.prefix) && is.null(.suffix) && is.null(.new_col_names)) { + .suffix <- "_{.n}{.time_unit_abbr}{.align_abbr}{.f_abbr}" + # ^ does not account for any arguments specified to underlying functions via + # `...` such as `na.rm =`, nor does it distinguish between functions from + # different packages accomplishing the same type of computation. Those are + # probably only set one way per task, so this probably produces cleaner + # names without clashes (though maybe some confusion if switching between + # code with different settings). + } + if (!is.null(.prefix) || !is.null(.suffix)) { + .prefix <- .prefix %||% "" + .suffix <- .suffix %||% "" + if (identical(.window_size, Inf)) { + n <- "running_" + time_unit_abbr <- "" + align_abbr <- "" + } else { + n <- time_delta_to_n_steps(.window_size, time_type) + time_unit_abbr <- time_type_unit_abbr(time_type) + align_abbr <- c(right = "", center = "c", left = "l")[[.align]] + } + glue_env <- rlang::env( + .n = n, + .time_unit_abbr = time_unit_abbr, + .align_abbr = align_abbr, + .f_abbr = purrr::map_chr(.x[col_names_chr], unwrap(f_info$namer)), + quo_get_env(col_names_quo) + ) + .new_col_names <- unclass( + glue(.prefix, .envir = glue_env) + + col_names_chr + + glue(.suffix, .envir = glue_env) + ) + } else { + # `.new_col_names` was provided by user; we don't need to do anything. + } + if (any(.new_col_names %in% names(.x))) { + cli_abort(c( + "Naming conflict between new columns and existing columns", + "x" = "Overlapping names: {format_varnames(intersect(.new_col_names, names(.x)))}" + ), class = "epiprocess__epi_slide_opt_old_new_name_conflict") + } + if (anyDuplicated(.new_col_names)) { + cli_abort(c( + "New column names contain duplicates", + "x" = "Duplicated names: {format_varnames(unique(.new_col_names[duplicated(.new_col_names)]))}" + ), class = "epiprocess__epi_slide_opt_new_name_duplicated") + } + result_col_names <- .new_col_names + # Make a complete date sequence between min(.x$time_value) and max(.x$time_value). date_seq_list <- full_date_seq(.x, window_args$before, window_args$after, time_type) all_dates <- date_seq_list$all_dates pad_early_dates <- date_seq_list$pad_early_dates pad_late_dates <- date_seq_list$pad_late_dates - # The position of a given column can be differ between input `.x` and - # `.data_group` since the grouping step by default drops grouping columns. - # To avoid rerunning `eval_select` for every `.data_group`, convert - # positions of user-provided `col_names` into string column names. We avoid - # using `names(pos)` directly for robustness and in case we later want to - # allow users to rename fields via tidyselection. - if (inherits(quo_get_expr(enquo(.col_names)), "character")) { - pos <- eval_select(dplyr::all_of(.col_names), data = .x, allow_rename = FALSE) - } else { - pos <- eval_select(enquo(.col_names), data = .x, allow_rename = FALSE) - } - col_names_chr <- names(.x)[pos] - # Always rename results to "slide_value_". - result_col_names <- paste0("slide_value_", col_names_chr) slide_one_grp <- function(.data_group, .group_key, ...) { missing_times <- all_dates[!(all_dates %in% .data_group$time_value)] # `frollmean` requires a full window to compute a result. Add NA values @@ -734,27 +906,9 @@ epi_slide_opt <- function( arrange(.data$time_value) if (f_from_package == "data.table") { - # If a group contains duplicate time values, `frollmean` will still only - # use the last `k` obs. It isn't looking at dates, it just goes in row - # order. So if the computation is aggregating across multiple obs for the - # same date, `epi_slide_opt` and derivates will produce incorrect results; - # `epi_slide` should be used instead. - if (anyDuplicated(.data_group$time_value) != 0L) { - cli_abort( - c( - "group contains duplicate time values. Using `epi_slide_[opt/mean/sum]` on this - group will result in incorrect results", - "i" = "Please change the grouping structure of the input data so that - each group has non-duplicate time values (e.g. `x %>% group_by(geo_value) - %>% epi_slide_opt(.f = frollmean)`)", - "i" = "Use `epi_slide` to aggregate across groups" - ), - class = "epiprocess__epi_slide_opt__duplicate_time_values", - epiprocess__data_group = .data_group, - epiprocess__group_key = .group_key - ) - } - + # Grouping should ensure that we don't have duplicate time values. + # Completion above should ensure we have at least .window_size rows. Check + # that we don't have more than .window_size rows (or fewer somehow): if (nrow(.data_group) != length(c(all_dates, pad_early_dates, pad_late_dates))) { cli_abort( c( @@ -805,7 +959,8 @@ epi_slide_opt <- function( group_modify(slide_one_grp, ..., .keep = FALSE) %>% filter(.data$.real) %>% select(-.real) %>% - arrange_col_canonical() + arrange_col_canonical() %>% + group_by(!!!.x_orig_groups) if (.all_rows) { result[!(result$time_value %in% ref_time_values), result_col_names] <- NA @@ -827,26 +982,10 @@ epi_slide_opt <- function( #' datatable::frollmean`. #' #' @export -#' @examples -#' # Compute a 7-day trailing average on cases. -#' cases_deaths_subset %>% -#' group_by(geo_value) %>% -#' epi_slide_mean(cases, .window_size = 7) %>% -#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) -#' -#' # Same as above, but adjust `frollmean` settings for speed, accuracy, and -#' # to allow partially-missing windows. -#' cases_deaths_subset %>% -#' group_by(geo_value) %>% -#' epi_slide_mean( -#' cases, -#' .window_size = 7, -#' na.rm = TRUE, algo = "exact", hasNA = TRUE -#' ) %>% -#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) epi_slide_mean <- function( .x, .col_names, ..., .window_size = NULL, .align = c("right", "center", "left"), + .prefix = NULL, .suffix = NULL, .new_col_names = NULL, .ref_time_values = NULL, .all_rows = FALSE) { # Deprecated argument handling provided_args <- rlang::call_args_names(rlang::call_match()) @@ -871,8 +1010,9 @@ epi_slide_mean <- function( } if ("new_col_name" %in% provided_args || ".new_col_name" %in% provided_args) { cli::cli_abort( - "epi_slide_mean: the argument `new_col_name` is not supported. If you want to customize - the output column names, use `dplyr::rename` after the slide." + "epi_slide_opt: the argument `new_col_name` is not supported for `epi_slide_opt`. If you want to customize + the output column names, use `.prefix =`, `.suffix =`, or `.new_col_**names** =`.", + class = "epiprocess__epi_slide_opt__new_name_not_supported" ) } if ("names_sep" %in% provided_args || ".names_sep" %in% provided_args) { @@ -889,6 +1029,9 @@ epi_slide_mean <- function( ..., .window_size = .window_size, .align = .align, + .prefix = .prefix, + .suffix = .suffix, + .new_col_names = .new_col_names, .ref_time_values = .ref_time_values, .all_rows = .all_rows ) @@ -899,15 +1042,10 @@ epi_slide_mean <- function( #' datatable::frollsum`. #' #' @export -#' @examples -#' # Compute a 7-day trailing sum on cases. -#' cases_deaths_subset %>% -#' group_by(geo_value) %>% -#' epi_slide_sum(cases, .window_size = 7) %>% -#' dplyr::select(geo_value, time_value, cases, cases_7dsum = slide_value_cases) epi_slide_sum <- function( .x, .col_names, ..., .window_size = NULL, .align = c("right", "center", "left"), + .prefix = NULL, .suffix = NULL, .new_col_names = NULL, .ref_time_values = NULL, .all_rows = FALSE) { # Deprecated argument handling provided_args <- rlang::call_args_names(rlang::call_match()) @@ -932,8 +1070,9 @@ epi_slide_sum <- function( } if ("new_col_name" %in% provided_args || ".new_col_name" %in% provided_args) { cli::cli_abort( - "epi_slide_sum: the argument `new_col_name` is not supported. If you want to customize - the output column names, use `dplyr::rename` after the slide." + "epi_slide_opt: the argument `new_col_name` is not supported for `epi_slide_opt`. If you want to customize + the output column names, use `.prefix =`, `.suffix =`, or `.new_col_**names** =`.", + class = "epiprocess__epi_slide_opt__new_name_not_supported" ) } if ("names_sep" %in% provided_args || ".names_sep" %in% provided_args) { @@ -949,6 +1088,9 @@ epi_slide_sum <- function( ..., .window_size = .window_size, .align = .align, + .prefix = .prefix, + .suffix = .suffix, + .new_col_names = .new_col_names, .ref_time_values = .ref_time_values, .all_rows = .all_rows ) diff --git a/R/utils.R b/R/utils.R index 1bfd21292..e350ade25 100644 --- a/R/utils.R +++ b/R/utils.R @@ -640,7 +640,7 @@ guess_time_type <- function(time_value, time_value_arg = rlang::caller_arg(time_ return("day") } else if (inherits(time_value, "yearmonth")) { return("yearmonth") - } else if (rlang::is_integerish(time_value)) { + } else if (is_bare_integerish(time_value)) { return("integer") } @@ -1109,3 +1109,147 @@ validate_slide_window_arg <- function(arg, time_type, lower = 1, allow_inf = TRU ) } } + + +#' Convert a time delta to a integerish number of "unit" steps between time values +#' +#' @param time_delta a vector that can be added to time values of time type +#' `time_type` to arrive at other time values of that time type, or +#' `r lifecycle::badge("experimental")` such a vector with Inf/-Inf entries mixed +#' in, if supported by the class of `time_delta`, even if `time_type` doesn't +#' necessarily support Inf/-Inf entries. Basically a slide window arg but +#' without sign and length restrictions. +#' @param time_type as in `validate_slide_window_arg` +#' @return [bare integerish][rlang::is_integerish] vector (with possible +#' infinite values) that produces the same result as `time_delta` when +#' multiplied by the natural [`unit_time_delta`] for +#' that time type and added to time values of time type `time_type`. If the +#' given time type does not support infinite values, then it should produce +#' +Inf or -Inf for analogous entries of `time_delta`, and match the addition +#' result match the addition result for non-infinite entries. +#' +#' @keywords internal +time_delta_to_n_steps <- function(time_delta, time_type) { + # could be S3 if we're willing to export + if (inherits(time_delta, "difftime")) { + output_units <- switch(time_type, + day = "days", + week = "weeks", + cli_abort("difftime objects not supported for time_type {format_chr_with_quotes(time_type)}") + ) + units(time_delta) <- output_units # converts number to represent same duration; not just attr<- + n_steps <- vec_data(time_delta) + if (!is_bare_integerish(n_steps)) { + cli_abort("`time_delta` did not appear to contain only integerish numbers + of steps between time values of time type {format_chr_with_quotes(time_type)}") + } + n_steps + } else if (is_bare_integerish(time_delta)) { # (allows infinite values) + switch(time_type, + day = , + week = , + yearmonth = , + integer = time_delta, + cli_abort("Invalid or unsupported time_type {format_chr_with_quotes(time_type)}") + ) + } else { + cli_abort("Invalid or unsupported kind of `time_delta`") + } +} + +#' Object that, added to time_values of time_type, advances by one time step/interval +#' +#' @param time_type string; `epi_df`'s or `epi_archive`'s `time_type` +#' @return an object `u` such that `time_values + u` represents advancing by one +#' time step / moving to the subsequent time interval for any `time_values` +#' object of time type `time_type`, and such that `time_values + k * u` for +#' integerish vector `k` advances by `k` steps (with vectorization, +#' recycling). +#' +#' @keywords internal +unit_time_delta <- function(time_type) { + switch(time_type, + day = as.difftime(1, units = "days"), + week = as.difftime(1, units = "weeks"), + yearmonth = 1, + integer = 1L, + cli_abort("Unsupported time_type: {time_type}") + ) +} + +# Using these unit abbreviations happens to make our automatic slide output +# naming look like taking ISO-8601 duration designations, removing the P, and +# lowercasing any characters. Fortnightly or sub-daily time types would need an +# adjustment to remain consistent. +time_type_unit_abbrs <- c( + day = "d", + week = "w", + yearmonth = "m" +) + +time_type_unit_abbr <- function(time_type) { + maybe_unit_abbr <- time_type_unit_abbrs[time_type] + if (is.na(maybe_unit_abbr)) { + cli_abort("Cannot determine the units of time type {format_chr_with_quotes(time_type)}") + } + maybe_unit_abbr +} + +#' Extract singular element of a length-1 unnamed list (validated) +#' +#' Inverse of `list(elt)`. +#' +#' @param x a length-1 list +#' @return x[[1L]], if x actually was a length-1 list; error otherwise +#' +#' @keywords internal +unwrap <- function(x) { + checkmate::assert_list(x, len = 1L, names = "unnamed") + x[[1L]] +} + +#' Check that a unique key is indeed unique in a tibble (TRUE/str) +#' +#' A `checkmate`-style check function. +#' +#' @param x a tibble, with no particular row or column order (if you have a +#' guaranteed row order based on the ukey you can probably do something more +#' efficient) +#' @param ukey_names character vector; subset of column names of `x` denoting a +#' unique key. +#' @param end_cli_message optional character vector, a cli message format +#' string/vector; information/advice to tack onto any error messages. +#' @return `TRUE` if no ukey is duplicated (i.e., `x[ukey_names]` has no +#' duplicated rows); string with an error message if there are errors. +#' +#' @keywords internal +check_ukey_unique <- function(x, ukey_names, end_cli_message = character()) { + assert_tibble(x) # to not have to think about `data.table` perf, xface + assert_false(is_grouped_df(x)) # to not have to think about `grouped_df` perf, xface + assert_character(ukey_names) + assert_subset(ukey_names, names(x)) + # + if (nrow(x) <= 1L) { + TRUE + } else { + # Fast check, slow error message. + arranged_ukeys <- arrange(x[ukey_names], across(all_of(ukey_names))) + if (!any(vec_equal(arranged_ukeys[-1L, ], arranged_ukeys[-nrow(arranged_ukeys), ]))) { + TRUE + } else { + bad_data <- x %>% + group_by(across(all_of(ukey_names))) %>% + filter(dplyr::n() > 1) %>% + ungroup() + lines <- c( + cli::format_error(" + There cannot be more than one row with the same combination of + {format_varnames(ukey_names)}. Problematic rows: + "), + capture.output(bad_data), + cli::format_message(end_cli_message) + ) + paste(collapse = "\n", lines) + } + } +} diff --git a/README.Rmd b/README.Rmd index 82d60fc75..0e8756d3d 100644 --- a/README.Rmd +++ b/README.Rmd @@ -113,8 +113,7 @@ Compute the 7 day moving average of the confirmed daily cases for each geo_value ```{r} edf <- edf %>% group_by(geo_value) %>% - epi_slide_mean(cases_daily, .window_size = 7, na.rm = TRUE) %>% - rename(smoothed_cases_daily = slide_value_cases_daily) + epi_slide_mean(cases_daily, .window_size = 7, na.rm = TRUE, .prefix = "smoothed_") edf ``` diff --git a/README.md b/README.md index 7c14443da..af8c24e90 100644 --- a/README.md +++ b/README.md @@ -136,8 +136,7 @@ geo\_value ``` r edf <- edf %>% group_by(geo_value) %>% - epi_slide_mean(cases_daily, .window_size = 7, na.rm = TRUE) %>% - rename(smoothed_cases_daily = slide_value_cases_daily) + epi_slide_mean(cases_daily, .window_size = 7, na.rm = TRUE, .prefix = "smoothed_") edf #> An `epi_df` object, 2,808 x 5 with metadata: #> * geo_type = state diff --git a/_pkgdown.yml b/_pkgdown.yml index 2214df7c3..e8a2c8c3b 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,40 +1,25 @@ -# Colors should stay consistent across epipredict, epiprocess, and epidatr, -# using Carnegie Red -# https://www.cmu.edu/brand/brand-guidelines/visual-identity/colors.html - -# This is to give a default value to the `mode` parameter in the -# `pkgdown::build_site` function. This is useful when building the site locally, -# as it will default to `devel` mode. In practice, this should all be handled -# dynamically by the CI/CD pipeline. development: mode: devel - version_label: success template: - bootstrap: 5 - bootswatch: cosmo - bslib: - font_scale: 1.0 - primary: "#C41230" - success: "#B4D43C" - link-color: "#C41230" - -navbar: - bg: primary - type: light + package: delphidocs url: https://cmu-delphi.github.io/epiprocess/ home: links: - - text: Introduction to Delphi's Tooling Work - href: https://cmu-delphi.github.io/delphi-tooling-book/ - - text: Get the epipredict R package + - text: The epipredict package href: https://cmu-delphi.github.io/epipredict/ - - text: Get the epidatr R package - href: https://github.com/cmu-delphi/epidatr - - text: Get the epidatasets R package - href: https://cmu-delphi.github.io/epidatasets/ + sidebar: + structure: [links, license, community, citation, authors, dev, related] + components: + related: + title: Delphi packages and resources + text: | + * [The epidatr package](https://github.com/cmu-delphi/epidatr/) + * [The epipredict package](https://cmu-delphi.github.io/epipredict/) + * [The epidatasets package](https://cmu-delphi.github.io/epidatasets/) + * [Introduction to Delphi's Tooling Work](https://cmu-delphi.github.io/delphi-tooling-book/) articles: - title: Using the package @@ -54,10 +39,9 @@ articles: repo: url: - home: https://github.com/cmu-delphi/epiprocess/tree/main/ - source: https://github.com/cmu-delphi/epiprocess/blob/main/ + home: https://github.com/cmu-delphi/epiprocess/ + source: https://github.com/cmu-delphi/epiprocess/ issue: https://github.com/cmu-delphi/epiprocess/issues - user: https://github.com/ reference: - title: "`epi_df` basics" @@ -103,11 +87,11 @@ reference: - title: Example data - contents: - - cases_deaths_subset - - archive_cases_dv_subset - - covid_incidence_county_subset - - covid_incidence_outliers - - covid_case_death_rates_extended + - epidatasets::cases_deaths_subset + - epidatasets::archive_cases_dv_subset + - epidatasets::covid_incidence_county_subset + - epidatasets::covid_incidence_outliers + - epidatasets::covid_case_death_rates_extended - title: internal - contents: diff --git a/man-roxygen/basic-slide-params.R b/man-roxygen/basic-slide-params.R index 638307d6a..0e5f28ae6 100644 --- a/man-roxygen/basic-slide-params.R +++ b/man-roxygen/basic-slide-params.R @@ -1,5 +1,5 @@ -#' @param .x An `epi_df` object. If ungrouped, we group by `geo_value` and any -#' columns in `other_keys`. If grouped, we make sure the grouping is by +#' @param .x An `epi_df` object. If ungrouped, we temporarily group by `geo_value` +#' and any columns in `other_keys`. If grouped, we make sure the grouping is by #' `geo_value` and `other_keys`. #' @param .window_size The size of the sliding window. The accepted values #' depend on the type of the `time_value` column in `.x`: @@ -32,4 +32,5 @@ #' a missing value marker (typically NA, but more technically the result of #' `vctrs::vec_cast`-ing `NA` to the type of the slide computation output). #' @return An `epi_df` object with one or more new slide computation columns -#' added. +#' added. It will be ungrouped if `.x` was ungrouped, and have the same groups +#' as `.x` if `.x` was grouped. diff --git a/man/archive_cases_dv_subset.Rd b/man/archive_cases_dv_subset.Rd deleted file mode 100644 index 207bb025e..000000000 --- a/man/archive_cases_dv_subset.Rd +++ /dev/null @@ -1,84 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/reexports.R -\docType{data} -\name{archive_cases_dv_subset} -\alias{archive_cases_dv_subset} -\title{Subset of daily COVID-19 doctor visits and cases from 6 states in archive format} -\format{ -An object of class \code{epi_archive} of length 6. -} -\source{ -This object contains a modified part of the \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. This data set is licensed under the terms of the -\href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license} -by Johns Hopkins University on behalf of its Center for Systems Science in Engineering. -Copyright Johns Hopkins University 2020. - -Modifications: -\itemize{ -\item \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: \code{case_rate_7d_av} signal was computed by Delphi from the original JHU-CSSE data by calculating moving averages of the preceding 7 days, so the signal for June 7 is the average of the underlying data for June 1 through 7, inclusive. -\item Furthermore, the data has been limited to a very small number of rows, the -signal names slightly altered, and formatted into an \code{epi_archive}. -} - -This object contains a modified part of the -\href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html}{Delphi \code{doctor-visits} indicator}. -This data source is computed by the Delphi -Group from information about outpatient visits, provided to Delphi by -health system partners, and published in the COVIDcast Epidata API. This -data set is licensed under the terms of the -\href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license} -by the Delphi group. - -Modifications: -\itemize{ -\item \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html}{From the COVIDcast Doctor Visits signal}: The signal \code{smoothed_adj_cli} is taken directly from the API without changes. -\item Furthermore, the data has been limited to a very small number of rows, the -signal names slightly altered, and formatted into an \code{epi_archive}. -} -} -\usage{ -archive_cases_dv_subset -} -\description{ -This data source is based on information about outpatient visits, provided -to us by health system partners, and also contains confirmed COVID-19 -cases based on reports made available by the Center for Systems Science -and Engineering at Johns Hopkins University. This example data ranges from -June 1, 2020 to December 1, 2021, issued on dates from June 1, 2020 to December 1, -2021. It is limited to California, Florida, Texas, and New York. - -It is used in the {epiprocess} \code{compactify}, \code{epi_archive}, and -advanced-use (\code{advanced}) vignettes. -} -\section{Data dictionary}{ - - -The data in the \code{epi_archive$DT} attribute has columns: -\describe{ -\item{geo_value}{the geographic value associated with each row of measurements.} -\item{time_value}{the time value associated with each row of measurements.} -\item{version}{the time value specifying the version for each row of measurements. } -\item{percent_cli}{percentage of doctor’s visits with CLI (COVID-like illness) computed from medical insurance claims} -\item{case_rate_7d_av}{7-day average signal of number of new confirmed cases due to COVID-19 per 100,000 population, daily} -} - -} - -\examples{ -# Since this is a re-exported dataset, it cannot be loaded using -# the `data()` function. `data()` looks for a file of the same name -# in the `data/` directory, which doesn't exist in this package. -# works -epiprocess::archive_cases_dv_subset - -# works -library(epiprocess) -archive_cases_dv_subset - -# fails -\dontrun{ -data(archive_cases_dv_subset, package = "epiprocess") -} - -} -\keyword{datasets} diff --git a/man/cases_deaths_subset.Rd b/man/cases_deaths_subset.Rd deleted file mode 100644 index 45e8dd4cb..000000000 --- a/man/cases_deaths_subset.Rd +++ /dev/null @@ -1,79 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/reexports.R -\docType{data} -\name{cases_deaths_subset} -\alias{cases_deaths_subset} -\title{Subset of JHU daily state COVID-19 cases and deaths from 6 states} -\format{ -An object of class \code{epi_df} (inherits from \code{tbl_df}, \code{tbl}, \code{data.frame}) with 4026 rows and 6 columns. -} -\source{ -This object contains a modified part of the -\href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} -as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. -This data set is licensed under the terms of the -\href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license} -by the Johns Hopkins University on behalf of its Center for Systems Science -in Engineering. Copyright Johns Hopkins University 2020. - -Modifications: -\itemize{ -\item \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: -The case signal is taken directly from the JHU CSSE -\href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository}. -The rate signals were computed by Delphi using Census population data. -The 7-day average signals were computed by Delphi by calculating moving -averages of the preceding 7 days, so the signal for June 7 is the -average of the underlying data for June 1 through 7, inclusive. -\item Furthermore, the data has been limited to a very small number of rows, the -signal names slightly altered, and formatted into an \code{epi_df}. -} -} -\usage{ -cases_deaths_subset -} -\description{ -This data source of confirmed COVID-19 cases and deaths is based on reports -made available by the Center for Systems Science and Engineering at Johns -Hopkins University. This example data is a snapshot as of March 20, 2024, and -ranges from March 1, 2020 to December 31, 2021. It is limited to California, -Florida, Texas, New York, Georgia, and Pennsylvania. - -It is used in the {epiprocess} growth rate and \code{epi_slide} vignettes. -} -\section{Data dictionary}{ - - -The data has columns: -\describe{ -\item{geo_value}{the geographic value associated with each row -of measurements.} -\item{time_value}{the time value associated with each row of measurements.} -\item{case_rate_7d_av}{7-day average signal of number of new -confirmed COVID-19 cases per 100,000 population, daily} -\item{death_rate_7d_av}{7-day average signal of number of new confirmed -deaths due to COVID-19 per 100,000 population, daily} -\item{cases}{Number of new confirmed COVID-19 cases, daily} -\item{cases_7d_av}{7-day average signal of number of new confirmed -COVID-19 cases, daily} -} - -} - -\examples{ -# Since this is a re-exported dataset, it cannot be loaded using -# the `data()` function. `data()` looks for a file of the same name -# in the `data/` directory, which doesn't exist in this package. -# works -epiprocess::cases_deaths_subset - -# works -library(epiprocess) -cases_deaths_subset - -# fails -\dontrun{ -data(cases_deaths_subset, package = "epiprocess") -} -} -\keyword{datasets} diff --git a/man/check_ukey_unique.Rd b/man/check_ukey_unique.Rd new file mode 100644 index 000000000..c6306f07f --- /dev/null +++ b/man/check_ukey_unique.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{check_ukey_unique} +\alias{check_ukey_unique} +\title{Check that a unique key is indeed unique in a tibble (TRUE/str)} +\usage{ +check_ukey_unique(x, ukey_names, end_cli_message = character()) +} +\arguments{ +\item{x}{a tibble, with no particular row or column order (if you have a +guaranteed row order based on the ukey you can probably do something more +efficient)} + +\item{ukey_names}{character vector; subset of column names of \code{x} denoting a +unique key.} + +\item{end_cli_message}{optional character vector, a cli message format +string/vector; information/advice to tack onto any error messages.} +} +\value{ +\code{TRUE} if no ukey is duplicated (i.e., \code{x[ukey_names]} has no +duplicated rows); string with an error message if there are errors. +} +\description{ +A \code{checkmate}-style check function. +} +\keyword{internal} diff --git a/man/complete.epi_df.Rd b/man/complete.epi_df.Rd index 71dbcb385..38c9a6fe2 100644 --- a/man/complete.epi_df.Rd +++ b/man/complete.epi_df.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/methods-epi_df.R \name{complete.epi_df} \alias{complete.epi_df} -\title{Complete epi_df} +\title{"Complete" an \code{epi_df}, adding missing rows and/or replacing \code{NA}s} \usage{ \method{complete}{epi_df}(data, ..., fill = list(), explicit = TRUE) } @@ -16,7 +16,9 @@ \item{explicit}{see \code{\link[tidyr:complete]{tidyr::complete}}} } \description{ -A \code{tidyr::complete()} analogue for \verb{epi_df`` objects. This function can be used, for example, to add rows for missing combinations of }geo_value\code{and}time_value\verb{, filling other columns with }NA`s. +A \code{tidyr::complete()} analogue for \code{epi_df} objects. This function +can be used, for example, to add rows for missing combinations +of \code{geo_value} and \code{time_value}, filling other columns with \code{NA}s. See the examples for usage details. } \examples{ diff --git a/man/covid_case_death_rates_extended.Rd b/man/covid_case_death_rates_extended.Rd deleted file mode 100644 index 72482edde..000000000 --- a/man/covid_case_death_rates_extended.Rd +++ /dev/null @@ -1,74 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/reexports.R -\docType{data} -\name{covid_case_death_rates_extended} -\alias{covid_case_death_rates_extended} -\title{JHU daily COVID-19 cases and deaths rates from all states} -\format{ -An object of class \code{epi_df} (inherits from \code{tbl_df}, \code{tbl}, \code{data.frame}) with 37576 rows and 4 columns. -} -\source{ -This object contains a modified part of the -\href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} -as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. -This data set is licensed under the terms of the -\href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license} -by the Johns Hopkins University on behalf of its Center for Systems Science -in Engineering. Copyright Johns Hopkins University 2020. - -Modifications: -\itemize{ -\item \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: -These signals are taken directly from the JHU CSSE -\href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository} -without changes. The 7-day average signals are computed by Delphi by -calculating moving averages of the preceding 7 days, so the signal for -June 7 is the average of the underlying data for June 1 through 7, -inclusive. -} -} -\usage{ -covid_case_death_rates_extended -} -\description{ -This data source of confirmed COVID-19 cases and deaths is based on reports -made available by the Center for Systems Science and Engineering at Johns -Hopkins University, as downloaded from the CMU Delphi COVIDcast Epidata -API. This example data is a snapshot as of May 31, 2022, and -ranges from March 1, 2020 to December 31, 2021. It -includes all states. -} -\section{Data dictionary}{ - - -The data has columns: -\describe{ -\item{geo_value}{the geographic value associated with each row -of measurements.} -\item{time_value}{the time value associated with each row of measurements.} -\item{case_rate}{7-day average signal of number of new -confirmed COVID-19 cases per 100,000 population, daily} -\item{death_rate}{7-day average signal of number of new confirmed -deaths due to COVID-19 per 100,000 population, daily} -} - -} - -\examples{ -# Since this is a re-exported dataset, it cannot be loaded using -# the `data()` function. `data()` looks for a file of the same name -# in the `data/` directory, which doesn't exist in this package. -# works -epiprocess::covid_case_death_rates_extended - -# works -library(epiprocess) -covid_case_death_rates_extended - -# fails -\dontrun{ -data(covid_case_death_rates_extended, package = "epiprocess") -} - -} -\keyword{datasets} diff --git a/man/covid_incidence_county_subset.Rd b/man/covid_incidence_county_subset.Rd deleted file mode 100644 index edc881d9d..000000000 --- a/man/covid_incidence_county_subset.Rd +++ /dev/null @@ -1,75 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/reexports.R -\docType{data} -\name{covid_incidence_county_subset} -\alias{covid_incidence_county_subset} -\title{Subset of JHU daily COVID-19 cases from counties in Massachusetts and Vermont} -\format{ -An object of class \code{epi_df} (inherits from \code{tbl_df}, \code{tbl}, \code{data.frame}) with 16212 rows and 5 columns. -} -\source{ -This object contains a modified part of the -\href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} as -\href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. -This data set is licensed under the terms of the -\href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license} -by the Johns Hopkins University on behalf of its Center for Systems -Science in Engineering. Copyright Johns Hopkins University 2020. - -Modifications: -\itemize{ -\item \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: -These signals are taken directly from the JHU CSSE -\href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository} -without changes. The 7-day average signals are computed by Delphi by -as moving averages of the preceding 7 days, so the signal for -June 7 is the average of the underlying data for June 1 through 7, -inclusive. -\item Furthermore, the data has been limited to a very small number of rows, -formatted into an \code{epi_df}, and the signal names slightly altered. -} -} -\usage{ -covid_incidence_county_subset -} -\description{ -This data source of confirmed COVID-19 cases and deaths -is based on reports made available by the Center for -Systems Science and Engineering at Johns Hopkins University. -This example data is a snapshot as of March 20, 2024, and -ranges from March 1, 2020 to December 31, 2021. -It is limited to counties from Massachusetts and Vermont. - -It is used in the {epiprocess} aggregation vignette. -} -\section{Data dictionary}{ - - -The data has columns: -\describe{ -\item{geo_value}{the geographic value associated with each row of measurements.} -\item{time_value}{the time value associated with each row of measurements.} -\item{cases}{Number of new confirmed COVID-19 cases, daily} -\item{county_name}{the name of the county} -\item{state_name}{the full name of the state} -} - -} - -\examples{ -# Since this is a re-exported dataset, it cannot be loaded using -# the `data()` function. `data()` looks for a file of the same name -# in the `data/` directory, which doesn't exist in this package. -# works -epiprocess::covid_incidence_county_subset - -# works -library(epiprocess) -covid_incidence_county_subset - -# fails -\dontrun{ -data(covid_incidence_county_subset, package = "epiprocess") -} -} -\keyword{datasets} diff --git a/man/covid_incidence_outliers.Rd b/man/covid_incidence_outliers.Rd deleted file mode 100644 index 52b49fd31..000000000 --- a/man/covid_incidence_outliers.Rd +++ /dev/null @@ -1,68 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/reexports.R -\docType{data} -\name{covid_incidence_outliers} -\alias{covid_incidence_outliers} -\title{Subset of JHU daily COVID-19 cases from New Jersey and Florida} -\format{ -An object of class \code{epi_df} (inherits from \code{tbl_df}, \code{tbl}, \code{data.frame}) with 730 rows and 3 columns. -} -\source{ -This object contains a modified part of the -\href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} -as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. -This data set is licensed under the terms of the -\href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license} -by the Johns Hopkins University on behalf of its Center for Systems -Science in Engineering. Copyright Johns Hopkins University 2020. - -Modifications: -\itemize{ -\item \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: -These signals are taken directly from the JHU CSSE \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository} without changes. -\item Furthermore, the data has been limited to a very small number of rows, -formatted into an \code{epi_df}, and the signal names slightly altered. -} -} -\usage{ -covid_incidence_outliers -} -\description{ -This data source of confirmed COVID-19 cases is based on reports made -available by the Center for Systems Science and Engineering at Johns -Hopkins University. This example data is downloaded from the CMU Delphi -COVIDcast Epidata API. It is a snapshot as of October 28, 2021, and captures the -cases from June 1, 2020 to May 31, 2021. It is limited to New Jersey and -Florida. - -This data set is used in the {epiprocess} vignette on outliers. -} -\section{Data dictionary}{ - - -The data has columns: -\describe{ -\item{geo_value}{the geographic value associated with each row of measurements.} -\item{time_value}{the time value associated with each row of measurements.} -\item{cases}{Number of new confirmed COVID-19 cases, daily} -} - -} - -\examples{ -# Since this is a re-exported dataset, it cannot be loaded using -# the `data()` function. `data()` looks for a file of the same name -# in the `data/` directory, which doesn't exist in this package. -# works -epiprocess::covid_incidence_outliers - -# works -library(epiprocess) -covid_incidence_outliers - -# fails -\dontrun{ -data(covid_incidence_outliers, package = "epiprocess") -} -} -\keyword{datasets} diff --git a/man/epi_df.Rd b/man/epi_df.Rd index 4c592ab75..a67827187 100644 --- a/man/epi_df.Rd +++ b/man/epi_df.Rd @@ -4,12 +4,13 @@ \alias{as_epi_df} \alias{as_epi_df.epi_df} \alias{as_epi_df.tbl_df} +\alias{as_epi_df.grouped_df} \alias{as_epi_df.data.frame} \alias{as_epi_df.tbl_ts} +\alias{is_epi_df} \alias{new_epi_df} \alias{epi_df} -\alias{is_epi_df} -\title{\code{epi_df} object} +\title{Test for \code{epi_df} format} \usage{ as_epi_df(x, ...) @@ -24,10 +25,14 @@ as_epi_df(x, ...) ... ) +\method{as_epi_df}{grouped_df}(x, ...) + \method{as_epi_df}{data.frame}(x, as_of, other_keys = character(), ...) \method{as_epi_df}{tbl_ts}(x, as_of, other_keys = character(), ...) +is_epi_df(x) + new_epi_df( x = tibble::tibble(geo_value = character(), time_value = as.Date(integer())), geo_type, @@ -36,8 +41,6 @@ new_epi_df( other_keys = character(), ... ) - -is_epi_df(x) } \arguments{ \item{x}{An object.} @@ -65,9 +68,18 @@ then the current day-time will be used.} as a character vector here (typical examples are "age" or sub-geographies).} } \value{ -An \code{epi_df} object. +\itemize{ +\item Of \code{as_epi_df()}: an (ungrouped) \code{epi_df} +} -\code{TRUE} if the object inherits from \code{epi_df}. +\itemize{ +\item Of \code{is_epi_df}: \code{TRUE} if the object inherits from \code{epi_df}, +otherwise \code{FALSE}. +} + +\itemize{ +\item Of \code{new_epi_df()}: an \code{epi_df} +} } \description{ One of the two main data structures for storing time series in \code{epiprocess}. diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index 10d389578..f053909db 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -16,8 +16,8 @@ epi_slide( ) } \arguments{ -\item{.x}{An \code{epi_df} object. If ungrouped, we group by \code{geo_value} and any -columns in \code{other_keys}. If grouped, we make sure the grouping is by +\item{.x}{An \code{epi_df} object. If ungrouped, we temporarily group by \code{geo_value} +and any columns in \code{other_keys}. If grouped, we make sure the grouping is by \code{geo_value} and \code{other_keys}.} \item{.f}{Function, formula, or missing; together with \code{...} specifies the @@ -91,7 +91,8 @@ a missing value marker (typically NA, but more technically the result of } \value{ An \code{epi_df} object with one or more new slide computation columns -added. +added. It will be ungrouped if \code{.x} was ungrouped, and have the same groups +as \code{.x} if \code{.x} was grouped. } \description{ Slides a given function over variables in an \code{epi_df} object. @@ -99,8 +100,8 @@ This is useful for computations like rolling averages. The function supports many ways to specify the computation, but by far the most common use case is as follows: -\if{html}{\out{
}}\preformatted{# Create new column `cases_7dm` that contains a 7-day trailing median of cases -epi_slide(edf, cases_7dav = median(cases), .window_size = 7) +\if{html}{\out{
}}\preformatted{# Create new column `cases_7dmed` that contains a 7-day trailing median of cases +epi_slide(edf, cases_7dmed = median(cases), .window_size = 7) }\if{html}{\out{
}} For two very common use cases, we provide optimized functions that are much @@ -128,6 +129,8 @@ determined the time window for the current computation. } } \examples{ +library(dplyr) + # Get the 7-day trailing standard deviation of cases and the 7-day trailing mean of cases cases_deaths_subset \%>\% epi_slide( @@ -135,44 +138,72 @@ cases_deaths_subset \%>\% cases_7dav = mean(cases, na.rm = TRUE), .window_size = 7 ) \%>\% - dplyr::select(geo_value, time_value, cases, cases_7sd, cases_7dav) + select(geo_value, time_value, cases, cases_7sd, cases_7dav) +# Note that epi_slide_mean could be used to more quickly calculate cases_7dav. + +# In addition to the [`dplyr::mutate`]-like syntax, you can feed in a function or +# formula in a way similar to [`dplyr::group_modify`]: +my_summarizer <- function(window_data) { + window_data \%>\% + summarize( + cases_7sd = sd(cases, na.rm = TRUE), + cases_7dav = mean(cases, na.rm = TRUE) + ) +} +cases_deaths_subset \%>\% + epi_slide( + ~ my_summarizer(.x), + .window_size = 7 + ) \%>\% + select(geo_value, time_value, cases, cases_7sd, cases_7dav) + -# The same as above, but unpacking using an unnamed data.frame with a formula + + + +#### Advanced: #### + +# The tidyverse supports ["packing"][tidyr::pack] multiple columns into a +# single tibble-type column contained within some larger tibble. Like dplyr, +# we normally don't pack output columns together. However, packing behavior can be turned on +# by providing a name for a tibble-type output: cases_deaths_subset \%>\% epi_slide( - ~ data.frame( + slide_packed = tibble( cases_7sd = sd(.x$cases, na.rm = TRUE), cases_7dav = mean(.x$cases, na.rm = TRUE) ), .window_size = 7 ) \%>\% - dplyr::select(geo_value, time_value, cases, cases_7sd, cases_7dav) - -# The same as above, but packing using a named data.frame with a tidy evaluation -# expression + select(geo_value, time_value, cases, slide_packed) cases_deaths_subset \%>\% epi_slide( - slide_packed = data.frame( + ~ tibble( cases_7sd = sd(.x$cases, na.rm = TRUE), cases_7dav = mean(.x$cases, na.rm = TRUE) ), + .new_col_name = "slide_packed", .window_size = 7 ) \%>\% - dplyr::select(geo_value, time_value, cases, slide_packed) + select(geo_value, time_value, cases, slide_packed) -# nested new columns +# You can also get ["nested"][tidyr::nest] format by wrapping your results in +# a list: cases_deaths_subset \%>\% group_by(geo_value) \%>\% epi_slide( function(x, g, t) { - data.frame( + list(tibble( cases_7sd = sd(x$cases, na.rm = TRUE), cases_7dav = mean(x$cases, na.rm = TRUE) - ) + )) }, .window_size = 7 ) \%>\% - dplyr::select(geo_value, time_value, cases, cases_7sd, cases_7dav) + ungroup() \%>\% + select(geo_value, time_value, slide_value) + + # Use the geo_value or the ref_time_value in the slide computation cases_deaths_subset \%>\% diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index cd293ee10..68244410b 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -13,6 +13,9 @@ epi_slide_opt( ..., .window_size = NULL, .align = c("right", "center", "left"), + .prefix = NULL, + .suffix = NULL, + .new_col_names = NULL, .ref_time_values = NULL, .all_rows = FALSE ) @@ -23,6 +26,9 @@ epi_slide_mean( ..., .window_size = NULL, .align = c("right", "center", "left"), + .prefix = NULL, + .suffix = NULL, + .new_col_names = NULL, .ref_time_values = NULL, .all_rows = FALSE ) @@ -33,17 +39,20 @@ epi_slide_sum( ..., .window_size = NULL, .align = c("right", "center", "left"), + .prefix = NULL, + .suffix = NULL, + .new_col_names = NULL, .ref_time_values = NULL, .all_rows = FALSE ) } \arguments{ -\item{.x}{An \code{epi_df} object. If ungrouped, we group by \code{geo_value} and any -columns in \code{other_keys}. If grouped, we make sure the grouping is by +\item{.x}{An \code{epi_df} object. If ungrouped, we temporarily group by \code{geo_value} +and any columns in \code{other_keys}. If grouped, we make sure the grouping is by \code{geo_value} and \code{other_keys}.} \item{.col_names}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> An unquoted column -name(e.g., \code{cases}), multiple column names (e.g., \code{c(cases, deaths)}), +name (e.g., \code{cases}), multiple column names (e.g., \code{c(cases, deaths)}), \link[tidyselect:language]{other tidy-select expression}, or a vector of characters (e.g. \code{c("cases", "deaths")}). Variable names can be used as if they were positions in the data frame, so expressions like \code{x:y} can be @@ -96,6 +105,20 @@ window will be asymmetric and have one more value before the reference time than after. }} +\item{.prefix}{Optional \code{\link[glue:glue]{glue::glue}} format string; name the slide result +column(s) by attaching this prefix to the corresponding input column(s). +Some shorthand is supported for basing the output names on \code{.window_size} +or other arguments; see "Prefix and suffix shorthand" below.} + +\item{.suffix}{Optional \code{\link[glue:glue]{glue::glue}} format string; like \code{.prefix}. The +default naming behavior is equivalent to \code{.suffix = "_{.n}{.time_unit_abbr}{.align_abbr}{.f_abbr}"}. Can be used in combination +with \code{.prefix}.} + +\item{.new_col_names}{Optional character vector with length matching the +number of input columns from \code{.col_names}; name the slide result column(s) +with these names. Cannot be used in combination with \code{.prefix} and/or +\code{.suffix}.} + \item{.ref_time_values}{The time values at which to compute the slides values. By default, this is all the unique time values in \code{.x}.} @@ -107,7 +130,8 @@ a missing value marker (typically NA, but more technically the result of } \value{ An \code{epi_df} object with one or more new slide computation columns -added. +added. It will be ungrouped if \code{.x} was ungrouped, and have the same groups +as \code{.x} if \code{.x} was grouped. } \description{ \code{epi_slide_opt} allows sliding an n-timestep \link[data.table:froll]{data.table::froll} @@ -119,44 +143,74 @@ These functions tend to be much faster than \code{epi_slide()}. See \code{epi_slide_sum} is a wrapper around \code{epi_slide_opt} with \code{.f = datatable::frollsum}. } +\section{Prefix and suffix shorthand}{ + + +\code{\link[glue:glue]{glue::glue}} format strings specially interpret content within curly +braces. E.g., \code{glue::glue("ABC{2 + 2}")} evaluates to \code{"ABC4"}. For \code{.prefix} +and \code{.suffix}, we provide \code{glue} with some additional variable bindings: +\itemize{ +\item \code{{.n}} will be the number of time steps in the computation +corresponding to the \code{.window_size}. +\item \code{{.time_unit_abbr}} will be a lower-case letter corresponding to the +\code{time_type} of \code{.x} +\item \code{{.align_abbr}} will be \code{""} if \code{.align} is the default of \code{"right"}; +otherwise, it will be the first letter of \code{.align} +\item \code{{.f_abbr}} will be a character vector containing a short abbreviation +for \code{.f} factoring in the input column type(s) for \code{.col_names} +} +} + \examples{ -# Compute a 7-day trailing average on cases. +library(dplyr) + +# Add a column (`cases_7dsum`) containing a 7-day trailing sum on `cases`: cases_deaths_subset \%>\% - group_by(geo_value) \%>\% - epi_slide_opt(cases, .f = data.table::frollmean, .window_size = 7) \%>\% - dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) + select(geo_value, time_value, cases) \%>\% + epi_slide_sum(cases, .window_size = 7) -# Same as above, but adjust `frollmean` settings for speed, accuracy, and -# to allow partially-missing windows. +# Add a column (`cases_rate_7dav`) containing a 7-day trailing average on `case_rate`: +covid_case_death_rates_extended \%>\% + epi_slide_mean(case_rate, .window_size = 7) + +# Use a less common specialized slide function: cases_deaths_subset \%>\% - group_by(geo_value) \%>\% - epi_slide_opt( - cases, - .f = data.table::frollmean, .window_size = 7, - algo = "exact", hasNA = TRUE, na.rm = TRUE - ) \%>\% - dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) -# Compute a 7-day trailing average on cases. + epi_slide_opt(cases, slider::slide_min, .window_size = 7) + +# Specify output column names and/or a naming scheme: cases_deaths_subset \%>\% + select(geo_value, time_value, cases) \%>\% group_by(geo_value) \%>\% - epi_slide_mean(cases, .window_size = 7) \%>\% - dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) - -# Same as above, but adjust `frollmean` settings for speed, accuracy, and -# to allow partially-missing windows. + epi_slide_sum(cases, .window_size = 7, .new_col_names = "case_sum") \%>\% + ungroup() cases_deaths_subset \%>\% + select(geo_value, time_value, cases) \%>\% group_by(geo_value) \%>\% + epi_slide_sum(cases, .window_size = 7, .prefix = "sum_") \%>\% + ungroup() + +# Additional settings can be sent to the {data.table} and {slider} functions +# via `...`. This example passes some arguments to `frollmean` settings for +# speed, accuracy, and to allow partially-missing windows: +covid_case_death_rates_extended \%>\% epi_slide_mean( - cases, + case_rate, .window_size = 7, na.rm = TRUE, algo = "exact", hasNA = TRUE - ) \%>\% - dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) -# Compute a 7-day trailing sum on cases. + ) + +# If the more specialized possibilities for `.f` don't cover your needs, you +# can use `epi_slide_opt` with `.f = data.table::frollapply` to apply a +# custom function at the cost of more computation time. See also `epi_slide` +# if you need something even more general. cases_deaths_subset \%>\% - group_by(geo_value) \%>\% - epi_slide_sum(cases, .window_size = 7) \%>\% - dplyr::select(geo_value, time_value, cases, cases_7dsum = slide_value_cases) + select(geo_value, time_value, case_rate_7d_av, death_rate_7d_av) \%>\% + epi_slide_opt(c(case_rate_7d_av, death_rate_7d_av), + data.table::frollapply, + FUN = median, .window_size = 28, + .suffix = "_{.n}{.time_unit_abbr}_median" + ) \%>\% + print(n = 40) } \seealso{ \code{\link{epi_slide}} for the more general slide function diff --git a/man/epidatasets_reexports.Rd b/man/epidatasets_reexports.Rd new file mode 100644 index 000000000..3dc809e45 --- /dev/null +++ b/man/epidatasets_reexports.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/reexports.R +\docType{data} +\name{cases_deaths_subset} +\alias{cases_deaths_subset} +\alias{covid_incidence_county_subset} +\alias{covid_incidence_outliers} +\alias{archive_cases_dv_subset} +\alias{covid_case_death_rates_extended} +\title{Selected example data sets from \code{epidatasets}} +\format{ +An object of class \code{epi_df} (inherits from \code{tbl_df}, \code{tbl}, \code{data.frame}) with 4026 rows and 6 columns. + +An object of class \code{epi_df} (inherits from \code{tbl_df}, \code{tbl}, \code{data.frame}) with 16212 rows and 5 columns. + +An object of class \code{epi_df} (inherits from \code{tbl_df}, \code{tbl}, \code{data.frame}) with 730 rows and 3 columns. + +An object of class \code{epi_archive} of length 6. + +An object of class \code{epi_df} (inherits from \code{tbl_df}, \code{tbl}, \code{data.frame}) with 37576 rows and 4 columns. +} +\usage{ +cases_deaths_subset + +covid_incidence_county_subset + +covid_incidence_outliers + +archive_cases_dv_subset + +covid_case_death_rates_extended +} +\description{ +Data sets re-exported from \code{epidatasets}; please see +documentation for each of these objects in \code{epidatasets}. + +A brief description of the format of each of the objects above are described +in matching order below. +} +\keyword{internal} diff --git a/man/epiprocess-package.Rd b/man/epiprocess-package.Rd index b4f3e1740..774d5f8ac 100644 --- a/man/epiprocess-package.Rd +++ b/man/epiprocess-package.Rd @@ -16,12 +16,13 @@ Useful links: } \author{ -\strong{Maintainer}: Logan Brooks \email{lcbrooks@andrew.cmu.edu} +\strong{Maintainer}: Logan Brooks \email{lcbrooks+github@andrew.cmu.edu} Authors: \itemize{ \item Daniel McDonald \item Evan Ray + \item Dmitry Shemetov \item Ryan Tibshirani } @@ -34,7 +35,7 @@ Other contributors: \item Ken Mawer [contributor] \item Chloe You [contributor] \item Quang Nguyen [contributor] - \item Dmitry Shemetov [contributor] + \item David Weber \email{davidweb@andrew.cmu.edu} [contributor] \item Lionel Henry (Author of included rlang fragments) [contributor] \item Hadley Wickham (Author of included rlang fragments) [contributor] \item Posit (Copyright holder of included rlang fragments) [copyright holder] diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd index 6fe2cfa5c..cadb19833 100644 --- a/man/epix_slide.Rd +++ b/man/epix_slide.Rd @@ -104,10 +104,10 @@ requested \code{.versions}) for rows having a \code{time_value} of at least `.ve }} } \value{ -A tibble whose columns are: the grouping variables, \code{time_value}, -containing the reference time values for the slide computation, and a -column named according to the \code{.new_col_name} argument, containing the slide -values. +A tibble whose columns are: the grouping variables (if any), +\code{time_value}, containing the reference time values for the slide +computation, and a column named according to the \code{.new_col_name} argument, +containing the slide values. It will be grouped by the grouping variables. } \description{ Slides a given function over variables in an \code{epi_archive} object. This diff --git a/man/time_delta_to_n_steps.Rd b/man/time_delta_to_n_steps.Rd new file mode 100644 index 000000000..937159195 --- /dev/null +++ b/man/time_delta_to_n_steps.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{time_delta_to_n_steps} +\alias{time_delta_to_n_steps} +\title{Convert a time delta to a integerish number of "unit" steps between time values} +\usage{ +time_delta_to_n_steps(time_delta, time_type) +} +\arguments{ +\item{time_delta}{a vector that can be added to time values of time type +\code{time_type} to arrive at other time values of that time type, or +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} such a vector with Inf/-Inf entries mixed +in, if supported by the class of \code{time_delta}, even if \code{time_type} doesn't +necessarily support Inf/-Inf entries. Basically a slide window arg but +without sign and length restrictions.} + +\item{time_type}{as in \code{validate_slide_window_arg}} +} +\value{ +\link[rlang:is_integerish]{bare integerish} vector (with possible +infinite values) that produces the same result as \code{time_delta} when +multiplied by the natural \code{\link{unit_time_delta}} for +that time type and added to time values of time type \code{time_type}. If the +given time type does not support infinite values, then it should produce ++Inf or -Inf for analogous entries of \code{time_delta}, and match the addition +result match the addition result for non-infinite entries. +} +\description{ +Convert a time delta to a integerish number of "unit" steps between time values +} +\keyword{internal} diff --git a/man/unit_time_delta.Rd b/man/unit_time_delta.Rd new file mode 100644 index 000000000..46b3c48d6 --- /dev/null +++ b/man/unit_time_delta.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{unit_time_delta} +\alias{unit_time_delta} +\title{Object that, added to time_values of time_type, advances by one time step/interval} +\usage{ +unit_time_delta(time_type) +} +\arguments{ +\item{time_type}{string; \code{epi_df}'s or \code{epi_archive}'s \code{time_type}} +} +\value{ +an object \code{u} such that \code{time_values + u} represents advancing by one +time step / moving to the subsequent time interval for any \code{time_values} +object of time type \code{time_type}, and such that \code{time_values + k * u} for +integerish vector \code{k} advances by \code{k} steps (with vectorization, +recycling). +} +\description{ +Object that, added to time_values of time_type, advances by one time step/interval +} +\keyword{internal} diff --git a/man/unwrap.Rd b/man/unwrap.Rd new file mode 100644 index 000000000..dad0b4417 --- /dev/null +++ b/man/unwrap.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{unwrap} +\alias{unwrap} +\title{Extract singular element of a length-1 unnamed list (validated)} +\usage{ +unwrap(x) +} +\arguments{ +\item{x}{a length-1 list} +} +\value{ +x[\link{1L}], if x actually was a length-1 list; error otherwise +} +\description{ +Inverse of \code{list(elt)}. +} +\keyword{internal} diff --git a/tests/testthat/_snaps/epi_df.md b/tests/testthat/_snaps/epi_df.md new file mode 100644 index 000000000..29280bf82 --- /dev/null +++ b/tests/testthat/_snaps/epi_df.md @@ -0,0 +1,15 @@ +# as_epi_df errors on nonunique epikeytime + + Code + as_epi_df(tibble::tibble(geo_value = 1, time_value = 1, value = 1:2), as_of = 5) + Condition + Error: + ! Assertion on 'x' failed: There cannot be more than one row with the same combination of geo_value and time_value. Problematic rows: + # A tibble: 2 x 3 + geo_value time_value value + + 1 1 1 1 + 2 1 1 2 + > If this is line list data, convert it to counts/rates first. + > If this contains a demographic breakdown, check that you have specified appropriate `other_keys`. + diff --git a/tests/testthat/test-epi_df.R b/tests/testthat/test-epi_df.R index 297d68dfc..c3e51aa2b 100644 --- a/tests/testthat/test-epi_df.R +++ b/tests/testthat/test-epi_df.R @@ -40,6 +40,20 @@ test_that("as_epi_df errors for non-character other_keys", { expect_silent(as_epi_df(ex_input, other_keys = c("state", "pol"))) }) +test_that("as_epi_df errors on nonunique epikeytime", { + expect_snapshot( + as_epi_df(tibble::tibble( + geo_value = 1, time_value = 1, value = 1:2 + ), as_of = 5), + error = TRUE + ) + expect_no_error( + as_epi_df(tibble::tibble( + geo_value = 1, age_group = 1:2, time_value = 1, value = 1:2 + ), other_keys = "age_group", as_of = 5) + ) +}) + test_that("as_epi_df works for nonstandard input", { tib <- tibble::tibble( x = 1:10, y = 1:10, @@ -75,6 +89,22 @@ test_that("as_epi_df works for nonstandard input", { ) }) +test_that("as_epi_df ungroups", { + expect_false( + tibble::tibble(geo_value = 1, time_value = 1) %>% + dplyr::group_by(geo_value) %>% + as_epi_df(as_of = 2) %>% + dplyr::is_grouped_df() + ) + expect_false( + tibble::tibble(geo_value = 1, time_value = 1) %>% + as_epi_df(as_of = 2) %>% + dplyr::group_by(geo_value) %>% + as_epi_df(as_of = 2) %>% + dplyr::is_grouped_df() + ) +}) + # select fixes tib <- tibble::tibble( x = 1:10, y = 1:10, diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index f658bcf4e..0aa4aca7f 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -300,34 +300,48 @@ for (p in (param_combinations %>% transpose())) { concatenate_list_params(p) ), { - out_sum <- rlang::inject(epi_slide(test_data, ~ sum(.x$value), !!!slide_args)) %>% - rename(slide_value_value = slide_value) - out_mean <- rlang::inject(epi_slide(test_data, ~ mean(.x$value), !!!slide_args)) %>% - rename(slide_value_value = slide_value) + out_sum <- rlang::inject(epi_slide(test_data, ~ sum(.x$value), !!!slide_args)) + out_mean <- rlang::inject(epi_slide(test_data, ~ mean(.x$value), !!!slide_args)) expect_equal( out_sum, - rlang::inject(epi_slide_opt(test_data, value, .f = data.table::frollsum, !!!slide_args)) + rlang::inject(epi_slide_opt(test_data, value, + .f = data.table::frollsum, !!!slide_args, + .new_col_names = "slide_value" + )) ) expect_equal( out_sum, - rlang::inject(epi_slide_opt(test_data, value, .f = slider::slide_sum, !!!slide_args)) + rlang::inject(epi_slide_opt(test_data, value, + .f = slider::slide_sum, !!!slide_args, + .new_col_names = "slide_value" + )) ) expect_equal( out_sum, - rlang::inject(epi_slide_sum(test_data, value, !!!slide_args)) + rlang::inject(epi_slide_sum(test_data, value, !!!slide_args, + .new_col_names = "slide_value" + )) ) expect_equal( out_mean, - rlang::inject(epi_slide_opt(test_data, value, .f = data.table::frollmean, !!!slide_args)) + rlang::inject(epi_slide_opt(test_data, value, + .f = data.table::frollmean, !!!slide_args, + .new_col_names = "slide_value" + )) ) expect_equal( out_mean, - rlang::inject(epi_slide_opt(test_data, value, .f = slider::slide_mean, !!!slide_args)) + rlang::inject(epi_slide_opt(test_data, value, + .f = slider::slide_mean, !!!slide_args, + .new_col_names = "slide_value" + )) ) expect_equal( out_mean, - rlang::inject(epi_slide_mean(test_data, value, !!!slide_args)) + rlang::inject(epi_slide_mean(test_data, value, !!!slide_args, + .new_col_names = "slide_value" + )) ) } ) @@ -730,7 +744,7 @@ test_that("no dplyr warnings from selecting multiple columns", { ) expect_equal( names(multi_slid), - c("geo_value", "time_value", "value", "value2", "slide_value_value", "slide_value_value2") + c("geo_value", "time_value", "value", "value2", "value_7dav", "value2_7dav") ) expect_no_warning( multi_slid_select <- epi_slide_mean(multi_columns, c(value, value2), .window_size = 7) @@ -741,3 +755,185 @@ test_that("no dplyr warnings from selecting multiple columns", { ) expect_equal(multi_slid_select, multi_slid) }) + +test_that("epi_slide_opt output naming features", { + multi_columns <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:200, value = 1:200, value2 = -1:-200), + dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5), value2 = 1:5) + ) %>% + as_epi_df() %>% + group_by(geo_value) + multi_columns_weekly <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = test_date + 7 * (1:200), value = 1:200, value2 = -1:-200), + dplyr::tibble(geo_value = "al", time_value = test_date + 7 * (1:5), value = -(1:5), value2 = 1:5) + ) %>% + as_epi_df() %>% + group_by(geo_value) + yearmonthly <- + tibble::tibble( + geo_value = 1, + time_value = tsibble::make_yearmonth(2000, 1) + 1:30 - 1, + value = 1:30 %% 2 == 0 + ) %>% + as_epi_df() %>% + group_by(geo_value) + + # Auto-naming: + # * Changing .f and .window_size: + expect_equal( + multi_columns %>% epi_slide_opt(value2, frollmean, .window_size = 14) %>% names(), + c(names(multi_columns), "value2_14dav") + ) + expect_equal( + multi_columns %>% epi_slide_opt(value2, slide_mean, .window_size = as.difftime(14, units = "days")) %>% names(), + c(names(multi_columns), "value2_14dav") + ) + expect_equal( + multi_columns %>% epi_slide_opt(value2, slide_sum, .window_size = Inf) %>% names(), + c(names(multi_columns), "value2_running_sum") + ) + # * Changing .f and .align: + expect_equal( + multi_columns %>% epi_slide_opt(value2, slide_min, .window_size = 14, .align = "center") %>% names(), + c(names(multi_columns), "value2_14dcmin") + ) + expect_equal( + multi_columns %>% epi_slide_opt(value2, slide_max, .window_size = 14, .align = "left") %>% names(), + c(names(multi_columns), "value2_14dlmax") + ) + # * Changing .f, time_type(, .window_size): + expect_equal( + multi_columns_weekly %>% + epi_slide_opt(value2, slide_prod, .window_size = as.difftime(2, units = "weeks")) %>% + names(), + c(names(multi_columns_weekly), "value2_2wprod") + ) + expect_equal( + yearmonthly %>% epi_slide_opt(value, slide_any, .window_size = 3) %>% names(), + c(names(yearmonthly), "value_3many") # not the best name, but super unlikely anyway + ) + # * Through forwarding functions: + expect_equal( + # XXX perhaps this should be an auto-naming feature? + yearmonthly %>% + epi_slide_mean(value, .window_size = Inf) %>% + names(), + c(names(yearmonthly), "value_running_prop") + ) + expect_equal( + # XXX perhaps this should be an auto-naming feature? + yearmonthly %>% + epi_slide_sum(value, .window_size = Inf) %>% + names(), + c(names(yearmonthly), "value_running_count") + ) + + # Manual naming: + # * Various combinations of args: + expect_equal( + multi_columns %>% + epi_slide_opt(starts_with("value"), slide_sum, .window_size = 7, .suffix = "_s{.n}") %>% + names(), + c(names(multi_columns), "value_s7", "value2_s7") + ) + expect_equal( + multi_columns %>% + epi_slide_opt(starts_with("value"), slide_sum, .window_size = 7, .prefix = "{.f_abbr}_", .suffix = "_{.n}") %>% + names(), + c(names(multi_columns), "sum_value_7", "sum_value2_7") + ) + expect_equal( + multi_columns %>% + epi_slide_opt(starts_with("value"), slide_sum, .window_size = 7, .prefix = "slide_value_") %>% + names(), + c(names(multi_columns), "slide_value_value", "slide_value_value2") + ) + expect_equal( + multi_columns %>% + epi_slide_opt(starts_with("value"), slide_sum, .window_size = 7, .new_col_names = c("slide_value", "sv2")) %>% + names(), + c(names(multi_columns), "slide_value", "sv2") + ) + # * Through forwarding functions: + expect_equal( + yearmonthly %>% epi_slide_mean(value, .window_size = Inf, .suffix = "_{.f_abbr}") %>% names(), + c(names(yearmonthly), "value_prop") + ) + expect_equal( + yearmonthly %>% epi_slide_sum(value, .window_size = Inf, .suffix = "_{.f_abbr}") %>% names(), + c(names(yearmonthly), "value_count") + ) + + # Validation errors: + # * Wrong sizes: + expect_error( + multi_columns %>% epi_slide_opt(starts_with("value"), slide_sum, + .window_size = 7, + .suffix = c("a", "b") + ) + ) + expect_error( + multi_columns %>% epi_slide_opt(starts_with("value"), slide_sum, + .window_size = 7, + .new_col_names = "slide_value" + ) + ) + expect_error( + multi_columns %>% epi_slide_mean(starts_with("value"), .window_size = 7, .new_col_names = "output") + ) + # * Incompatible args: + expect_error( + multi_columns %>% epi_slide_opt(value, slide_sum, + .window_size = 7, + .prefix = "a", .suffix = "b", .new_col_names = "slide_value" + ), + class = "epiprocess__epi_slide_opt_incompatible_naming_args" + ) + # * Bad resulting output names: + expect_error( + multi_columns %>% epi_slide_mean(value, .window_size = 7, .new_col_names = "value"), + class = "epiprocess__epi_slide_opt_old_new_name_conflict" + ) + expect_error( + multi_columns %>% epi_slide_mean(value:value2, .window_size = 7, .new_col_names = c("output", "output")), + class = "epiprocess__epi_slide_opt_new_name_duplicated" + ) +}) + +test_that("epi_slide* output grouping matches input grouping", { + toy_edf <- as_epi_df(bind_rows(list( + tibble(geo_value = 1, age_group = 1, time_value = as.Date("2020-01-01") + 1:10 - 1, value = 1:10), + tibble(geo_value = 1, age_group = 2, time_value = as.Date("2020-01-01") + 1:10 - 1, value = 20:11), + tibble(geo_value = 2, age_group = 2, time_value = as.Date("2020-01-01") + 1:10 - 1, value = 31:40) + )), other_keys = "age_group", as_of = as.Date("2020-01-01") + 20) + + # Preserving existing grouping: + expect_equal( + toy_edf %>% + group_by(age_group, geo_value) %>% + epi_slide(value_7dsum = sum(value), .window_size = 7) %>% + group_vars(), + c("age_group", "geo_value") + ) + expect_equal( + toy_edf %>% + group_by(age_group, geo_value) %>% + epi_slide_sum(value, .window_size = 7) %>% + group_vars(), + c("age_group", "geo_value") + ) + + # Removing automatic grouping: + expect_equal( + toy_edf %>% + epi_slide(value_7dsum = sum(value), .window_size = 7) %>% + group_vars(), + character(0) + ) + expect_equal( + toy_edf %>% + epi_slide_sum(value, .window_size = 7) %>% + group_vars(), + character(0) + ) +}) diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index c5e6c5aa5..37125d533 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -362,3 +362,59 @@ test_that("validate_slide_window_arg works", { class = "epiprocess__validate_slide_window_arg" ) }) + +test_that("unit_time_delta works", { + expect_equal( + as.Date("2020-01-01") + 5 * unit_time_delta("day"), + as.Date("2020-01-06") + ) + expect_equal( + as.Date("2020-01-01") + 2 * unit_time_delta("week"), + as.Date("2020-01-15") + ) + expect_equal( + tsibble::make_yearmonth(2000, 1) + 5 * unit_time_delta("yearmonth"), + tsibble::make_yearmonth(2000, 6) + ) + expect_equal( + 1L + 5L * unit_time_delta("integer"), + 6L + ) + # + expect_equal( + as.Date("2020-01-01") + + time_delta_to_n_steps(as.Date("2020-01-06") - as.Date("2020-01-01"), "day") * + unit_time_delta("day"), + as.Date("2020-01-06") + ) + expect_equal( + as.Date("2020-01-01") + + time_delta_to_n_steps(as.integer(as.Date("2020-01-06") - as.Date("2020-01-01")), "day") * + unit_time_delta("day"), + as.Date("2020-01-06") + ) + expect_equal( + as.Date("2020-01-01") + + time_delta_to_n_steps(as.Date("2020-01-15") - as.Date("2020-01-01"), "week") * + unit_time_delta("week"), + as.Date("2020-01-15") + ) + expect_equal( + as.Date("2020-01-01") + + time_delta_to_n_steps(as.difftime(2, units = "weeks"), "week") * + unit_time_delta("week"), + as.Date("2020-01-15") + ) + expect_equal( + tsibble::make_yearmonth(2000, 1) + + time_delta_to_n_steps(5, "yearmonth") * + unit_time_delta("yearmonth"), + tsibble::make_yearmonth(2000, 6) + ) + expect_equal( + 1L + + time_delta_to_n_steps(5, "integer") * + unit_time_delta("integer"), + 6L + ) +}) diff --git a/vignettes/epiprocess.Rmd b/vignettes/epiprocess.Rmd index 613f91465..b9648e00b 100644 --- a/vignettes/epiprocess.Rmd +++ b/vignettes/epiprocess.Rmd @@ -86,7 +86,7 @@ edf %>% ``` We can compute the 7 day moving average of the confirmed daily cases for each -geo_value by using the `epi_slide_mean()` function. For a more in-depth guide to +`geo_value` by using the `epi_slide_mean()` function. For a more in-depth guide to sliding, see `vignette("epi_df")`. ```{r} @@ -96,7 +96,7 @@ edf %>% ``` We can compute the growth rate of the confirmed cumulative cases for each -geo_value. For a more in-depth guide to growth rates, see `vignette("growth_rate")`. +`geo_value`. For a more in-depth guide to growth rates, see `vignette("growth_rate")`. ```{r} edf %>% @@ -104,7 +104,7 @@ edf %>% mutate(cases_growth = growth_rate(x = time_value, y = cases_cumulative, method = "rel_change", h = 7)) ``` -Detect outliers in daily reported cases for each geo_value. For a more in-depth +Detect outliers in daily reported cases for each `geo_value`. For a more in-depth guide to outlier detection, see `vignette("outliers")`. ```{r message=FALSE} @@ -114,8 +114,8 @@ edf %>% ungroup() ``` -Add a column to the epi_df object with the daily deaths for each geo_value and -compute the correlations between cases and deaths for each geo_value. For a more +Add a column to the epi_df object with the daily deaths for each `geo_value` and +compute the correlations between cases and deaths for each `geo_value`. For a more in-depth guide to correlations, see `vignette("correlation")`. ```{r}