Skip to content

epi_df argument refactoring #460

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Jul 19, 2024
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ export(epix_merge)
export(epix_slide)
export(epix_truncate_versions_after)
export(filter)
export(geo_column_names)
export(group_by)
export(group_modify)
export(growth_rate)
Expand All @@ -75,9 +76,11 @@ export(next_after)
export(relocate)
export(rename)
export(slice)
export(time_column_names)
export(ungroup)
export(unnest)
export(validate_epi_archive)
export(version_column_names)
importFrom(checkmate,anyInfinite)
importFrom(checkmate,anyMissing)
importFrom(checkmate,assert)
Expand All @@ -100,6 +103,7 @@ importFrom(checkmate,test_subset)
importFrom(checkmate,vname)
importFrom(cli,cat_line)
importFrom(cli,cli_abort)
importFrom(cli,cli_inform)
importFrom(cli,cli_vec)
importFrom(cli,cli_warn)
importFrom(cli,format_message)
Expand Down Expand Up @@ -186,6 +190,7 @@ importFrom(tibble,as_tibble)
importFrom(tibble,new_tibble)
importFrom(tibble,validate_tibble)
importFrom(tidyr,unnest)
importFrom(tidyselect,any_of)
importFrom(tidyselect,eval_select)
importFrom(tidyselect,starts_with)
importFrom(tsibble,as_tsibble)
Expand Down
16 changes: 14 additions & 2 deletions R/archive.R
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,11 @@ validate_epi_archive <- function(

#' `as_epi_archive` converts a data frame, data table, or tibble into an
#' `epi_archive` object.
#' @param ... used for specifying column names, as in [`dplyr::rename`]. For
#' example `version = release_date`
#' @param .versions_end location based versions_end, used to avoid prefix
#' `version = issue` from being assigned to `versions_end` instead of being
#' used to rename columns.
#'
#' @rdname epi_archive
#'
Expand All @@ -454,11 +459,18 @@ as_epi_archive <- function(
additional_metadata = NULL,
compactify = NULL,
clobberable_versions_start = NULL,
versions_end = NULL) {
.versions_end = NULL, ...,
versions_end = .versions_end) {
assert_data_frame(x)
x <- rename(x, ...)
x <- guess_column_name(x, "time_value", time_column_names())
x <- guess_column_name(x, "geo_value", geo_column_names())
x <- guess_column_name(x, "version", version_column_names())
if (!test_subset(c("geo_value", "time_value", "version"), names(x))) {
cli_abort(
"Columns `geo_value`, `time_value`, and `version` must be present in `x`."
"Either columns `geo_value`, `time_value`, and `version` must be present in `x`,
or related columns (see the internal functions `guess_time_column_name()`,
`guess_geo_column_name()` and/or `guess_geo_version_name()` for complete list)."
)
}
if (anyMissing(x$version)) {
Expand Down
33 changes: 23 additions & 10 deletions R/epi_df.R
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ NULL
#'
#' @export
new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of,
additional_metadata = list(), ...) {
additional_metadata = list()) {
assert_data_frame(x)
assert_list(additional_metadata)

Expand Down Expand Up @@ -162,6 +162,7 @@ new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of,
#' guide](https://cmu-delphi.github.io/epiprocess/articles/epiprocess.html) for
#' examples.
#'
#' @param ... Additional arguments passed to methods.
#' @template epi_df-params
#'
#' @export
Expand Down Expand Up @@ -249,25 +250,37 @@ as_epi_df.epi_df <- function(x, ...) {

#' @method as_epi_df tbl_df
#' @describeIn as_epi_df The input tibble `x` must contain the columns
#' `geo_value` and `time_value`. All other columns will be preserved as is,
#' and treated as measured variables. If `as_of` is missing, then the function
#' will try to guess it from an `as_of`, `issue`, or `version` column of `x`
#' (if any of these are present), or from as an `as_of` field in its metadata
#' (stored in its attributes); if this fails, then the current day-time will
#' be used.
#' `geo_value` and `time_value`, or column names that uniquely map onto these
#' (e.g. `date` or `province`). Alternatively, you can specify the conversion
#' explicitly (`time_value = someWeirdColumnName`). All other columns not
#' specified as `other_keys` will be preserved as is, and treated as measured
#' variables.
#'
#' If `as_of` is missing, then the function will try to guess it from an
#' `as_of`, `issue`, or `version` column of `x` (if any of these are present),
#' or from as an `as_of` field in its metadata (stored in its attributes); if
#' this fails, then the current day-time will be used.
#' @importFrom rlang .data
#' @importFrom tidyselect any_of
#' @importFrom cli cli_inform
#' @export
as_epi_df.tbl_df <- function(x, geo_type, time_type, as_of,
additional_metadata = list(), ...) {
additional_metadata = list(),
...) {
# possible standard substitutions for time_value
x <- rename(x, ...)
x <- guess_column_name(x, "time_value", time_column_names())
x <- guess_column_name(x, "geo_value", geo_column_names())
if (!test_subset(c("geo_value", "time_value"), names(x))) {
cli_abort(
"Columns `geo_value` and `time_value` must be present in `x`."
"Either columns `geo_value` and `time_value` must be present in `x`, or related columns (see the internal
functions `guess_time_column_name()` and/or `guess_geo_column_name()` for a complete list)."
)
}

new_epi_df(
x, geo_type, time_type, as_of,
additional_metadata, ...
additional_metadata
)
}

Expand Down
83 changes: 83 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,89 @@ guess_time_type <- function(time_value) {
return("custom")
}

#' given a vector of characters, add the same values, but upcased, e.g.
#' "date" -> c("date", "Date")
#' "target_date" -> c("target_date", "Target_Date")
#' @keywords internal
upcase_snake_case <- function(vec) {
upper_vec <- strsplit(vec, "_") %>%
map(function(name) paste0(toupper(substr(name, 1, 1)), substr(name, 2, nchar(name)), collapse = "_")) %>%
unlist()
c(vec, upper_vec)
}

#' potential time_value columns
#' @description
#' the full list of potential substitutions for the `time_value` column name:
#' `r time_column_names()`
#' @export
time_column_names <- function() {
substitutions <- c(
"time_value", "date", "time", "datetime", "dateTime", "date_time", "target_date",
"week", "epiweek", "month", "mon", "year", "yearmon", "yearmonth",
"yearMon", "yearMonth", "dates", "time_values", "target_dates", "time_Value"
)
substitutions <- upcase_snake_case(substitutions)
names(substitutions) <- rep("time_value", length(substitutions))
return(substitutions)
}
#
#' potential geo_value columns
#' @description
#' the full list of potential substitutions for the `geo_value` column name:
#' `r geo_column_names()`
#' @export
geo_column_names <- function() {
substitutions <- c(
"geo_value", "geo_values", "geo_id", "geos", "location", "jurisdiction", "fips", "zip",
"county", "hrr", "msa", "state", "province", "nation", "states",
"provinces", "counties", "geo_Value"
)
substitutions <- upcase_snake_case(substitutions)
names(substitutions) <- rep("geo_value", length(substitutions))
return(substitutions)
}

#' potential version columns
#' @description
#' the full list of potential substitutions for the `version` column name:
#' `r version_column_names()`
#' @export
version_column_names <- function() {
substitutions <- c(
"version", "issue", "release"
)
substitutions <- upcase_snake_case(substitutions)
names(substitutions) <- rep("version", length(substitutions))
return(substitutions)
}

#' rename potential time_value columns
#'
#' @description
#' potentially renames
#' @param x the tibble to potentially rename
#' @param substitutions a named vector. the potential substitions, with every name `time_value`
#' @keywords internal
guess_column_name <- function(x, column_name, substitutions) {
if (!(column_name %in% names(x))) {
x <- tryCatch(x %>% rename(any_of(substitutions)),
error = function(cond) {
cli_abort("{names(x)[names(x) %in% substitutions]} are both/all valid substitutions.
Either `rename` some yourself or drop some.")
}
)
# if none of the names are in substitutions, and `column_name` isn't a column, we're missing a relevant column
if (!any(names(x) %in% substitutions)) {
cli_abort("There is no {column_name} column or similar name. See e.g. [`time_column_name()`] for a complete list")
}
if (any(substitutions != "")) {
cli_inform("inferring {column_name} column.")
Copy link
Contributor

@nmdefries nmdefries Jul 19, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion: I find the logic in this chunk confusing. Here's an alternative flow that reads more clearly to me.

Move the "if none of the names are in substitutions..." chunk before the tryCatch. Having the condition !any(names(x) %in% substitutions) after the tryCatch makes it read as if this block will always trigger. Downside to moving this before the tryCatch is that it is no longer checking if the rename was successful (although we shouldn't really need that).

If the rename has an error, the rest of the tryCatch expression won't be run, so I moved the cli_inform there.

Suggested change
if (!(column_name %in% names(x))) {
x <- tryCatch(x %>% rename(any_of(substitutions)),
error = function(cond) {
cli_abort("{names(x)[names(x) %in% substitutions]} are both/all valid substitutions.
Either `rename` some yourself or drop some.")
}
)
# if none of the names are in substitutions, and `column_name` isn't a column, we're missing a relevant column
if (!any(names(x) %in% substitutions)) {
cli_abort("There is no {column_name} column or similar name. See e.g. [`time_column_name()`] for a complete list")
}
if (any(substitutions != "")) {
cli_inform("inferring {column_name} column.")
if (!(column_name %in% names(x))) {
# if none of the names are in substitutions, and `column_name` isn't a column, we're missing a relevant column
if (!any(names(x) %in% substitutions)) {
cli_abort("There is no {column_name} column or similar name. See e.g. [`time_column_name()`] for a complete list")
}
x <- tryCatch({
tmp <- x %>% rename(any_of(substitutions))
cli_inform("inferring {column_name} column.")
tmp
},
error = function(cond) {
cli_abort("{names(x)[names(x) %in% substitutions]} are both/all valid substitutions.
Either `rename` some yourself or drop some.")
}
)

Copy link
Contributor Author

@dsweber2 dsweber2 Jul 19, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why the tmp? actually read it XD

}
}
return(x)
}

##########


Expand Down
1 change: 1 addition & 0 deletions _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ reference:
desc: Details on `epi_df` format, and basic functionality.
- contents:
- matches("epi_df")
- matches("column_names")
- title: "`epi_*()` functions"
desc: Functions that act on `epi_df` objects.
- contents:
Expand Down
1 change: 0 additions & 1 deletion man-roxygen/epi_df-params.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,4 @@
#' `as_of` fields; named entries from the passed list will be included as
#' well. If your tibble has additional keys, be sure to specify them as a
#' character vector in the `other_keys` component of `additional_metadata`.
#' @param ... Additional arguments passed to methods.
#' @return An `epi_df` object.
16 changes: 10 additions & 6 deletions man/as_epi_df.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 10 additions & 1 deletion man/epi_archive.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions man/geo_column_names.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions man/guess_column_name.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 1 addition & 4 deletions man/new_epi_df.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions man/time_column_names.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 16 additions & 0 deletions man/upcase_snake_case.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions man/version_column_names.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading