Skip to content

Commit 69ea5e4

Browse files
authored
Merge pull request #460 from cmu-delphi/autoName
`epi_df` automatic argument
2 parents 8f25ec9 + 243c45e commit 69ea5e4

18 files changed

+309
-28
lines changed

DESCRIPTION

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Type: Package
22
Package: epiprocess
33
Title: Tools for basic signal processing in epidemiology
4-
Version: 0.7.12
4+
Version: 0.7.13
55
Authors@R: c(
66
person("Jacob", "Bien", role = "ctb"),
77
person("Logan", "Brooks", email = "[email protected]", role = c("aut", "cre")),

NAMESPACE

+5
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ export(epix_merge)
6161
export(epix_slide)
6262
export(epix_truncate_versions_after)
6363
export(filter)
64+
export(geo_column_names)
6465
export(group_by)
6566
export(group_modify)
6667
export(growth_rate)
@@ -75,9 +76,11 @@ export(next_after)
7576
export(relocate)
7677
export(rename)
7778
export(slice)
79+
export(time_column_names)
7880
export(ungroup)
7981
export(unnest)
8082
export(validate_epi_archive)
83+
export(version_column_names)
8184
importFrom(checkmate,anyInfinite)
8285
importFrom(checkmate,anyMissing)
8386
importFrom(checkmate,assert)
@@ -100,6 +103,7 @@ importFrom(checkmate,test_subset)
100103
importFrom(checkmate,vname)
101104
importFrom(cli,cat_line)
102105
importFrom(cli,cli_abort)
106+
importFrom(cli,cli_inform)
103107
importFrom(cli,cli_vec)
104108
importFrom(cli,cli_warn)
105109
importFrom(cli,format_message)
@@ -186,6 +190,7 @@ importFrom(tibble,as_tibble)
186190
importFrom(tibble,new_tibble)
187191
importFrom(tibble,validate_tibble)
188192
importFrom(tidyr,unnest)
193+
importFrom(tidyselect,any_of)
189194
importFrom(tidyselect,eval_select)
190195
importFrom(tidyselect,starts_with)
191196
importFrom(tsibble,as_tsibble)

NEWS.md

+5
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,11 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
4040
- Improved documentation web site landing page's introduction.
4141
- Fixed documentation referring to old `epi_slide()` interface (#466, thanks
4242
@XuedaShen!).
43+
- `as_epi_df` and `as_epi_archive` now support arguments to specify column names
44+
e.g. `as_epi_df(some_tibble, geo_value = state)`. In addition, there is a list
45+
of default conversions; see `time_column_names()` for the list of columns that
46+
will automatically be recognized and renamed to the `time_value` column (there
47+
are similar functions for `geo_value` and `version`).
4348

4449
## Cleanup
4550
- Resolved some linting messages in package checks (#468).

R/archive.R

+15-2
Original file line numberDiff line numberDiff line change
@@ -442,6 +442,11 @@ validate_epi_archive <- function(
442442

443443
#' `as_epi_archive` converts a data frame, data table, or tibble into an
444444
#' `epi_archive` object.
445+
#' @param ... used for specifying column names, as in [`dplyr::rename`]. For
446+
#' example `version = release_date`
447+
#' @param .versions_end positional stand-in for `versions_end`; it exists so that
448+
#' a renaming argument such as `version = issue` is not partially matched to
449+
#' `versions_end` instead of being used to rename columns.
445450
#'
446451
#' @rdname epi_archive
447452
#'
@@ -454,11 +459,19 @@ as_epi_archive <- function(
454459
additional_metadata = NULL,
455460
compactify = NULL,
456461
clobberable_versions_start = NULL,
457-
versions_end = NULL) {
462+
.versions_end = NULL, ...,
463+
versions_end = .versions_end) {
458464
assert_data_frame(x)
465+
x <- rename(x, ...)
466+
x <- guess_column_name(x, "time_value", time_column_names())
467+
x <- guess_column_name(x, "geo_value", geo_column_names())
468+
x <- guess_column_name(x, "version", version_column_names())
459469
if (!test_subset(c("geo_value", "time_value", "version"), names(x))) {
460470
cli_abort(
461-
"Columns `geo_value`, `time_value`, and `version` must be present in `x`."
471+
"Either columns `geo_value`, `time_value`, and `version`, or related columns
472+
(see the internal functions `guess_time_column_name()`,
473+
`guess_geo_column_name()` and/or `guess_geo_version_name()` for complete
474+
list) must be present in `x`."
462475
)
463476
}
464477
if (anyMissing(x$version)) {

R/epi_df.R

+25-10
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ NULL
9595
#'
9696
#' @export
9797
new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of,
98-
additional_metadata = list(), ...) {
98+
additional_metadata = list()) {
9999
assert_data_frame(x)
100100
assert_list(additional_metadata)
101101

@@ -162,6 +162,7 @@ new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of,
162162
#' guide](https://cmu-delphi.github.io/epiprocess/articles/epiprocess.html) for
163163
#' examples.
164164
#'
165+
#' @param ... Additional arguments passed to methods.
165166
#' @template epi_df-params
166167
#'
167168
#' @export
@@ -249,25 +250,39 @@ as_epi_df.epi_df <- function(x, ...) {
249250

250251
#' @method as_epi_df tbl_df
251252
#' @describeIn as_epi_df The input tibble `x` must contain the columns
252-
#' `geo_value` and `time_value`. All other columns will be preserved as is,
253-
#' and treated as measured variables. If `as_of` is missing, then the function
254-
#' will try to guess it from an `as_of`, `issue`, or `version` column of `x`
255-
#' (if any of these are present), or from as an `as_of` field in its metadata
256-
#' (stored in its attributes); if this fails, then the current day-time will
257-
#' be used.
253+
#' `geo_value` and `time_value`, or column names that uniquely map onto these
254+
#' (e.g. `date` or `province`). Alternatively, you can specify the conversion
255+
#' explicitly (`time_value = someWeirdColumnName`). All other columns not
256+
#' specified as `other_keys` will be preserved as is, and treated as measured
257+
#' variables.
258+
#'
259+
#' If `as_of` is missing, then the function will try to guess it from an
260+
#' `as_of`, `issue`, or `version` column of `x` (if any of these are present),
261+
#' or from an `as_of` field in its metadata (stored in its attributes); if
262+
#' this fails, then the current day-time will be used.
258263
#' @importFrom rlang .data
264+
#' @importFrom tidyselect any_of
265+
#' @importFrom cli cli_inform
259266
#' @export
260267
as_epi_df.tbl_df <- function(x, geo_type, time_type, as_of,
261-
additional_metadata = list(), ...) {
268+
additional_metadata = list(),
269+
...) {
270+
# possible standard substitutions for time_value
271+
x <- rename(x, ...)
272+
x <- guess_column_name(x, "time_value", time_column_names())
273+
x <- guess_column_name(x, "geo_value", geo_column_names())
262274
if (!test_subset(c("geo_value", "time_value"), names(x))) {
263275
cli_abort(
264-
"Columns `geo_value` and `time_value` must be present in `x`."
276+
"Either columns `geo_value` and `time_value` or related columns
277+
(see the internal functions `guess_time_column_name()` and/or
278+
`guess_geo_column_name()` for a complete list)
279+
must be present in `x`."
265280
)
266281
}
267282

268283
new_epi_df(
269284
x, geo_type, time_type, as_of,
270-
additional_metadata, ...
285+
additional_metadata
271286
)
272287
}
273288

R/utils.R

+99
Original file line numberDiff line numberDiff line change
@@ -448,6 +448,105 @@ guess_time_type <- function(time_value) {
448448
return("custom")
449449
}
450450

451+
#' Add capitalized variants of snake_case names
#'
#' Given a character vector, returns the original values followed by the same
#' values with the first letter of every underscore-separated word upper-cased,
#' e.g.
#' "date" -> c("date", "Date")
#' "target_date" -> c("target_date", "Target_Date")
#'
#' @param vec a character vector of (column) names
#' @return a character vector of length `2 * length(vec)`: `vec` itself,
#'   followed by the capitalized counterpart of each element.
#' @keywords internal
upcase_snake_case <- function(vec) {
  # Upper-case the letter that starts the string or follows an underscore.
  # With `perl = TRUE`, "\\U" upper-cases the remainder of the replacement
  # (here only the captured letter). Substituting in place, rather than
  # splitting on "_" and re-joining, keeps leading, trailing and repeated
  # underscores exactly as they were.
  upper_vec <- gsub("(^|_)([[:alpha:]])", "\\1\\U\\2", vec, perl = TRUE)
  c(vec, upper_vec)
}
461+
462+
#' Potential `time_value` column names
#'
#' @description
#' The full list of column names that are recognized as substitutes for the
#' `time_value` column name:
#' `r time_column_names()`
#'
#' @return A character vector of candidate column names, without duplicates.
#'   Every element is named `"time_value"`, i.e. the vector is in the
#'   `c(new_name = "old_name")` form.
#' @export
time_column_names <- function() {
  substitutions <- c(
    "time_value", "date", "time", "datetime", "dateTime", "date_time",
    "target_date", "week", "epiweek", "month", "mon", "year", "yearmon",
    "yearmonth", "yearMon", "yearMonth", "dates", "time_values",
    "target_dates", "time_Value"
  )
  # "time_value" and "time_Value" both capitalize to "Time_Value"; `unique()`
  # keeps the first occurrence of each name so no candidate is listed twice.
  substitutions <- unique(upcase_snake_case(substitutions))
  names(substitutions) <- rep("time_value", length(substitutions))
  substitutions
}
477+
#
478+
#' Potential `geo_value` column names
#'
#' @description
#' The full list of column names that are recognized as substitutes for the
#' `geo_value` column name:
#' `r geo_column_names()`
#'
#' @return A character vector of candidate column names, without duplicates.
#'   Every element is named `"geo_value"`, i.e. the vector is in the
#'   `c(new_name = "old_name")` form.
#' @export
geo_column_names <- function() {
  substitutions <- c(
    "geo_value", "geo_values", "geo_id", "geos", "location", "jurisdiction",
    "fips", "zip", "county", "hrr", "msa", "state", "province", "nation",
    "states", "provinces", "counties", "geo_Value"
  )
  # "geo_value" and "geo_Value" both capitalize to "Geo_Value"; `unique()`
  # keeps the first occurrence of each name so no candidate is listed twice.
  substitutions <- unique(upcase_snake_case(substitutions))
  names(substitutions) <- rep("geo_value", length(substitutions))
  substitutions
}
493+
494+
#' Potential `version` column names
#'
#' @description
#' The full list of column names that are recognized as substitutes for the
#' `version` column name:
#' `r version_column_names()`
#' @export
version_column_names <- function() {
  # Lower-case candidates first, then their capitalized variants.
  candidates <- upcase_snake_case(c("version", "issue", "release"))
  # Label every candidate with the standard name it stands in for.
  structure(candidates, names = rep("version", length(candidates)))
}
507+
508+
#' Rename a recognized alternative column to a standard column name
#'
#' @description
#' If `x` has no column called `column_name`, looks for a column whose name is
#' one of `substitutions` and renames it to `column_name`, emitting a message
#' that the column was inferred. If `column_name` is already a column of `x`,
#' `x` is returned unchanged.
#'
#' @param x the tibble to potentially rename
#' @param column_name a string; the standard column name that should be
#'   present in the result, e.g. `"time_value"`
#' @param substitutions a character vector of column names that may be renamed
#'   to `column_name`, e.g. the output of `time_column_names()`
#' @return `x`, with at most one column renamed to `column_name`. Aborts if
#'   `column_name` is absent and either none or more than one of the columns
#'   of `x` is in `substitutions`.
#' @keywords internal
#' @importFrom cli cli_inform cli_abort
#' @importFrom dplyr rename
guess_column_name <- function(x, column_name, substitutions) {
  # Nothing to infer when the standard name is already present.
  if (column_name %in% names(x)) {
    return(x)
  }

  candidates <- intersect(names(x), substitutions)

  # Neither `column_name` nor any recognized alternative is a column of `x`.
  # NOTE(review): the class below says "multiple_substitution" although this
  # is the "no match" case; it is kept as is so that existing handlers and
  # tests that match on it continue to work.
  if (length(candidates) == 0L) {
    cli_abort(
      "There is no {column_name} column or similar name.
      See e.g. [`time_column_names()`] for a complete list",
      class = "epiprocess__guess_column__multiple_substitution_error"
    )
  }

  # More than one column could be renamed to `column_name`; refuse to guess.
  # Checking this explicitly (instead of catching whatever `rename()` throws)
  # keeps unrelated errors from being reported as an ambiguity.
  if (length(candidates) > 1L) {
    cli_abort(
      "{candidates}
      are both/all valid substitutions for {column_name}.
      Either `rename` some yourself or drop some.",
      class = "epiprocess__guess_column__multiple_substitution_error"
    )
  }

  # Exactly one candidate: rename it using the `c(new = "old")` form.
  renaming <- candidates
  names(renaming) <- column_name
  x <- rename(x, any_of(renaming))
  cli_inform(
    "inferring {column_name} column.",
    class = "epiprocess__guess_column_inferring_inform"
  )
  x
}
549+
451550
##########
452551

453552

_pkgdown.yml

+1
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ reference:
6363
desc: Details on `epi_df` format, and basic functionality.
6464
- contents:
6565
- matches("epi_df")
66+
- matches("column_names")
6667
- title: "`epi_*()` functions"
6768
desc: Functions that act on `epi_df` objects.
6869
- contents:

man-roxygen/epi_df-params.R

-1
Original file line numberDiff line numberDiff line change
@@ -15,5 +15,4 @@
1515
#' `as_of` fields; named entries from the passed list will be included as
1616
#' well. If your tibble has additional keys, be sure to specify them as a
1717
#' character vector in the `other_keys` component of `additional_metadata`.
18-
#' @param ... Additional arguments passed to methods.
1918
#' @return An `epi_df` object.

man/as_epi_df.Rd

+10-6
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/epi_archive.Rd

+10-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/geo_column_names.Rd

+12
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/guess_column_name.Rd

+17
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/new_epi_df.Rd

+1-4
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/time_column_names.Rd

+12
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)