From 1ecce6747c42c357bcf0a6ee44a23fb2bdee24f7 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Mon, 12 Aug 2024 13:06:03 -0700 Subject: [PATCH 1/8] add template --- vignettes/articles/scorecaster.Rmd | 86 ++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 vignettes/articles/scorecaster.Rmd diff --git a/vignettes/articles/scorecaster.Rmd b/vignettes/articles/scorecaster.Rmd new file mode 100644 index 000000000..748fcbe5c --- /dev/null +++ b/vignettes/articles/scorecaster.Rmd @@ -0,0 +1,86 @@ +--- +title: "Implementing a scorecaster for quantile calibration" +--- + +```{r setup, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + warning = FALSE, + message = FALSE, + cache = TRUE +) +``` + + +```{r packages} +library(tidyverse) +library(epipredict) +``` + +```{r forecast-output} +jhu <- case_death_rate_subset +fc_time_values <- seq(as.Date("2021-03-09"), as.Date("2021-12-01"), by = "4 weeks") +q_levels <- c(1, 2, 5, 8, 9) / 10 +forecaster <- function(x, aheads = 7) { + map(aheads, ~ arx_forecaster( + x, "death_rate", c("case_rate", "death_rate"), + quantile_reg(quantile_levels = q_levels), + arx_args_list(ahead = .x, quantile_levels = q_levels) + )$predictions |> + mutate(ahead = .x) + ) |> list_rbind() +} + +out <- map( + .x = fc_time_values, + .f = ~forecaster(jhu %>% filter(time_value <= .x), c(7, 14, 21, 28)), + .progress = TRUE +) + +out <- out %>% list_rbind() +out <- left_join( + out, + jhu, + by = c("target_date" = "time_value", "geo_value") +) +``` + + +```{r necessary-funs} +quantile_conformal_score <- function(x, actual) { + UseMethod("quantile_conformal_score") +} +quantile_conformal_score.distribution <- function(x, actual) { + l <- vctrs::vec_recycle_common(x = x, actual = actual) + map2( + .x = vctrs::vec_data(l$x), + .y = l$actual, + .f = quantile_conformal_score + ) +} +quantile_conformal_score.dist_quantiles <- function(x, actual) { + values <- vctrs::field(x, "values") + quantile_levels <- vctrs::field(x, "quantile_levels") + errs <- (actual - values) * (quantile_levels > 0.5) + + (values - actual) * (quantile_levels < 0.5) + + abs(actual - values) * (quantile_levels == 0.5) + errs +} + +tangent_integrator <- function(x, t, KI = 1000, Csat = 2) { + # defaults from https://github.com/aangelopoulos/conformal-time-series/blob/b729c3f5ff633bfc43f0f7ca08199b549c2573ac/tests/configs/ca-COVID-deaths-4wk.yaml#L41 + x <- x * log(t + 1) / (Csat * (t + 1)) + up <- x >= pi / 2 + down <- x <= -pi / 2 + x[up] <- Inf + x[down] <- -Inf + mid <- !up & !down + x[mid] <- KI * tan(x[mid]) +} +``` + +```{r score-fcasts} +out <- out |> + mutate(qc_scores = quantile_conformal_score(.pred_distn, death_rate)) +``` From bbd4b49a768b6705fb11e5c458a1a754f8512c83 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Wed, 21 Aug 2024 10:49:56 -0700 Subject: [PATCH 2/8] brief comments on the scorecaster code --- vignettes/articles/scorecaster.Rmd | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/vignettes/articles/scorecaster.Rmd b/vignettes/articles/scorecaster.Rmd index 748fcbe5c..960b83f87 100644 --- a/vignettes/articles/scorecaster.Rmd +++ b/vignettes/articles/scorecaster.Rmd @@ -18,6 +18,8 @@ library(tidyverse) library(epipredict) ``` +First we get some forecasts. + ```{r forecast-output} jhu <- case_death_rate_subset fc_time_values <- seq(as.Date("2021-03-09"), as.Date("2021-12-01"), by = "4 weeks") @@ -46,6 +48,7 @@ out <- left_join( ) ``` +Now we set up the "quantile conformal score" and the tangent integrator. ```{r necessary-funs} quantile_conformal_score <- function(x, actual) { @@ -80,7 +83,15 @@ tangent_integrator <- function(x, t, KI = 1000, Csat = 2) { } ``` +Score the forecasts. + ```{r score-fcasts} out <- out |> mutate(qc_scores = quantile_conformal_score(.pred_distn, death_rate)) ``` + +Now we would need a "scorecaster". The paper has code here: +https://github.com/aangelopoulos/conformal-time-series/blob/b729c3f5ff633bfc43f0f7ca08199b549c2573ac/tests/datasets/covid-ts-proc/statewide/death-forecasting-perstate-lasso-qr.ipynb + +Not quite sure what the model is. Note that `epipredict::quantile_reg()` may work +(without the $\ell_1$ penalty). From 98609a003353c8827cb42be788cb7e42d05a8ae3 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Mon, 16 Sep 2024 17:10:22 -0700 Subject: [PATCH 3/8] add draft --- R/step_pivot_wider.R | 78 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 R/step_pivot_wider.R diff --git a/R/step_pivot_wider.R b/R/step_pivot_wider.R new file mode 100644 index 000000000..af5294b73 --- /dev/null +++ b/R/step_pivot_wider.R @@ -0,0 +1,78 @@ +step_pivot_wider <- function( + recipe, + ..., + role = "predictor", + id_cols = NULL, + id_expand = FALSE, + names_from = NULL, + values_fill = NA, + values_fn = NULL, + unused_fn = NULL, + skip = FALSE, + id = rand_id("pivot_wider") +) { + + arg_is_chr_scalar(role, id) + add_step( + recipe, + step_pivot_wider_new( + terms = enquos(...), + role = role, + trained = FALSE, + id_cols = id_cols, + id_expand = id_expand, + names_from = names_from %||% kill_time_value(key_colnames(recipe)), + values_fill = values_fill, + values_fn = values_fn, + columns = NULL, + skip = skip, + id = id + ) + ) +} + +step_pivot_wider_new <- function( + terms, role, trained, id_cols, id_expand, names_from, values_fill, + values_fn, columns, skip, id) { + step( + subclass = "pivot_wider", + terms = terms, + role = role, + trained = trained, + id_cols = id_cols, + id_expand = id_expand, + names_from = names_from, + values_fill = values_fill, + values_fn = values_fn, + columns = columns, + skip = skip, + id = id + ) +} + +prep.step_pivot_wider <- function(x, training, info = NULL, ...) { + step_pivot_wider_new( + terms = x$terms, + role = x$role, + trained = TRUE, + id_cols = x$id_cols, + id_expand = x$id_expand, + names_from = x$names_from, + values_fill = x$values_fill, + values_fn = x$values_fn, + columns = recipes_eval_select(x$terms, training, info), + skip = x$skip, + id = x$id + ) +} + +bake.step_pivot_wider <- function(object, new_data, ...) { + pivotted <- tidyr::pivot_wider( + new_data, id_cols = object$id_cols, id_expand = object$id_expand, + names_from = object$names_from, values_from = all_of(object$columns), + values_fill = object$values_fill, values_fn = object$values_fn + ) + joinby <- union(key_colnames(new_data), object$id_cols)) + new_data <- left_join(new_data, pivotted, by) + +} From 28006fe1da5cd1627bb5e07a90edab8f4b49a9a8 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Tue, 17 Sep 2024 11:44:45 -0700 Subject: [PATCH 4/8] draft the step --- NAMESPACE | 3 + R/step_pivot_wider.R | 93 ++++++++++++++++++++------ R/utils-arg.R | 23 +++++++ man/step_pivot_wider.Rd | 80 ++++++++++++++++++++++ tests/testthat/test-step_pivot_wider.R | 20 ++++++ 5 files changed, 199 insertions(+), 20 deletions(-) create mode 100644 man/step_pivot_wider.Rd create mode 100644 tests/testthat/test-step_pivot_wider.R diff --git a/NAMESPACE b/NAMESPACE index 23c5adeaf..1e728a071 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -24,6 +24,7 @@ S3method(bake,step_epi_lag) S3method(bake,step_epi_slide) S3method(bake,step_growth_rate) S3method(bake,step_lag_difference) +S3method(bake,step_pivot_wider) S3method(bake,step_population_scaling) S3method(bake,step_training_window) S3method(detect_layer,frosting) @@ -63,6 +64,7 @@ S3method(prep,step_epi_lag) S3method(prep,step_epi_slide) S3method(prep,step_growth_rate) S3method(prep,step_lag_difference) +S3method(prep,step_pivot_wider) S3method(prep,step_population_scaling) S3method(prep,step_training_window) S3method(print,alist) @@ -201,6 +203,7 @@ export(step_epi_naomit) export(step_epi_slide) export(step_growth_rate) export(step_lag_difference) +export(step_pivot_wider) export(step_population_scaling) export(step_training_window) export(tibble) diff --git a/R/step_pivot_wider.R b/R/step_pivot_wider.R index af5294b73..2140f7a5a 100644 --- a/R/step_pivot_wider.R +++ b/R/step_pivot_wider.R @@ -1,30 +1,46 @@ + + +#' Create new variables by pivotting data +#' +#' @inheritParams step_growth_rate +#' @param ... One or more selector functions to choose variables +#' values to pivot. These are the `values_from` argument for [tidyr::pivot_wider()]. +#' See [recipes::selections()] for more details. +#' @inheritParams tidyr::pivot_wider +#' +#' @template step-return +#' @export +#' +#' @examples +#' 1+1 step_pivot_wider <- function( recipe, ..., role = "predictor", + names_from, id_cols = NULL, id_expand = FALSE, - names_from = NULL, values_fill = NA, values_fn = NULL, - unused_fn = NULL, skip = FALSE, id = rand_id("pivot_wider") ) { arg_is_chr_scalar(role, id) + add_step( recipe, step_pivot_wider_new( terms = enquos(...), role = role, trained = FALSE, - id_cols = id_cols, + user_id_cols = if (is.null(id_cols)) NULL else enquos(id_cols), + edf_id_cols = key_colnames(recipe), id_expand = id_expand, - names_from = names_from %||% kill_time_value(key_colnames(recipe)), + names_from = enquos(names_from), values_fill = values_fill, values_fn = values_fn, - columns = NULL, + values_from = NULL, skip = skip, id = id ) @@ -32,47 +48,84 @@ step_pivot_wider <- function( } step_pivot_wider_new <- function( - terms, role, trained, id_cols, id_expand, names_from, values_fill, - values_fn, columns, skip, id) { + terms, role, trained, user_id_cols, edf_id_cols, + id_expand, names_from, values_fill, + values_fn, values_from, skip, id) { step( subclass = "pivot_wider", terms = terms, role = role, trained = trained, - id_cols = id_cols, + user_id_cols = user_id_cols, + edf_id_cols = edf_id_cols, id_expand = id_expand, names_from = names_from, values_fill = values_fill, values_fn = values_fn, - columns = columns, + values_from = values_from, skip = skip, id = id ) } +#' @export prep.step_pivot_wider <- function(x, training, info = NULL, ...) { + checkmate::assert_subset(x$edf_id_cols, key_colnames(training)) step_pivot_wider_new( terms = x$terms, role = x$role, trained = TRUE, - id_cols = x$id_cols, + user_id_cols = recipes_eval_select(x$user_id_cols, training, info), + edf_id_cols = key_colnames(training), id_expand = x$id_expand, - names_from = x$names_from, + names_from = recipes_eval_select(x$names_from, training, info), values_fill = x$values_fill, values_fn = x$values_fn, - columns = recipes_eval_select(x$terms, training, info), + values_from = recipes_eval_select(x$terms, training, info), skip = x$skip, id = x$id ) } +#' @export bake.step_pivot_wider <- function(object, new_data, ...) { - pivotted <- tidyr::pivot_wider( - new_data, id_cols = object$id_cols, id_expand = object$id_expand, - names_from = object$names_from, values_from = all_of(object$columns), - values_fill = object$values_fill, values_fn = object$values_fn - ) - joinby <- union(key_colnames(new_data), object$id_cols)) - new_data <- left_join(new_data, pivotted, by) - + hardhat::validate_column_names(new_data, object$edf_id_cols) + id_cols <- union(object$user_id_cols, object$edf_id_cols) + id_cols <- union(id_cols, key_colnames(new_data)) + browser() + if (length(id_cols) == 0L) { + pivotted <- tidyr::pivot_wider( + new_data, + id_expand = object$id_expand, + names_from = unname(object$names_from), + values_from = unname(object$values_from), + values_fill = object$values_fill, + values_fn = object$values_fn, + names_repair = "unique" + ) + joinby <- intersect(names(pivotted), names(new_data)) + } else { + pivotted <- tidyr::pivot_wider( + new_data, + id_cols = id_cols, + id_expand = object$id_expand, + names_from = unname(object$names_from), + values_from = unname(object$values_from), + values_fill = object$values_fill, + values_fn = object$values_fn, + names_repair = "unique" + ) + joinby <- id_cols + } + if (length(joinby) > 0L) { + new_data <- left_join(new_data, pivotted, by = joinby) + } else if (length(joinby) == 0L && nrow(pivotted) == nrow(new_data)) { + new_data <- bind_cols(new_data, pivotted, .name_repair = "unique") + } else { + cli_abort(c( + "Unable to join variables created by `step_pivot_wider()`.", + i = "You may want to pass in `id_cols` on step creation." + )) + } + new_data } diff --git a/R/utils-arg.R b/R/utils-arg.R index b4242eaf9..ff76f88ee 100644 --- a/R/utils-arg.R +++ b/R/utils-arg.R @@ -102,3 +102,26 @@ arg_to_date <- function(x, allow_null = FALSE) { arg_is_date(x, allow_null = allow_null) x } + +check_tidyselect_cols_exist <- function(selection, data, call = caller_env()) { + name_pos <- tidyselect::eval_select(selection, data, error_call = call) + if (length(name_pos) == 0L) { + return(list(ok = TRUE, missing_names = selection)) + } + hardhat::check_column_names(data, names(name_pos)) +} + +validate_tidyselect_cols_exist <- function(selection, data, call = caller_env()) { + check <- check_tidyselect_cols_exist(selection, data, call) + if (!check$ok) { + missing_names <- glue::glue_collapse( + glue::single_quote(check$missing_names), + sep = ", " + ) + message <- glue::glue( + "The {selection} results in missing columns: {missing_names}." + ) + cli_abort(message, call = call) + } + invisible(data) +} diff --git a/man/step_pivot_wider.Rd b/man/step_pivot_wider.Rd new file mode 100644 index 000000000..1d2358e61 --- /dev/null +++ b/man/step_pivot_wider.Rd @@ -0,0 +1,80 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/step_pivot_wider.R +\name{step_pivot_wider} +\alias{step_pivot_wider} +\title{Create new variables by pivotting data} +\usage{ +step_pivot_wider( + recipe, + ..., + role = "predictor", + names_from, + id_cols = NULL, + id_expand = FALSE, + values_fill = NA, + values_fn = NULL, + skip = FALSE, + id = rand_id("pivot_wider") +) +} +\arguments{ +\item{recipe}{A recipe object. The step will be added to the +sequence of operations for this recipe.} + +\item{...}{One or more selector functions to choose variables +values to pivot. These are the \code{values_from} argument for \code{\link[tidyr:pivot_wider]{tidyr::pivot_wider()}}. +See \code{\link[recipes:selections]{recipes::selections()}} for more details.} + +\item{role}{For model terms created by this step, what analysis role should +they be assigned? \code{lag} is default a predictor while \code{ahead} is an outcome.} + +\item{id_cols}{<\code{\link[tidyr:tidyr_tidy_select]{tidy-select}}> A set of columns that +uniquely identify each observation. Typically used when you have +redundant variables, i.e. variables whose values are perfectly correlated +with existing variables. + +Defaults to all columns in \code{data} except for the columns specified through +\code{names_from} and \code{values_from}. If a tidyselect expression is supplied, it +will be evaluated on \code{data} after removing the columns specified through +\code{names_from} and \code{values_from}.} + +\item{id_expand}{Should the values in the \code{id_cols} columns be expanded by +\code{\link[tidyr:expand]{expand()}} before pivoting? This results in more rows, the output will +contain a complete expansion of all possible values in \code{id_cols}. Implicit +factor levels that aren't represented in the data will become explicit. +Additionally, the row values corresponding to the expanded \code{id_cols} will +be sorted.} + +\item{values_fill}{Optionally, a (scalar) value that specifies what each +\code{value} should be filled in with when missing. + +This can be a named list if you want to apply different fill values to +different value columns.} + +\item{values_fn}{Optionally, a function applied to the value in each cell +in the output. You will typically use this when the combination of +\code{id_cols} and \code{names_from} columns does not uniquely identify an +observation. + +This can be a named list if you want to apply different aggregations +to different \code{values_from} columns.} + +\item{skip}{A logical. Should the step be skipped when the +recipe is baked by \code{\link[=bake]{bake()}}? While all operations are baked +when \code{\link[=prep]{prep()}} is run, some operations may not be able to be +conducted on new data (e.g. processing the outcome variable(s)). +Care should be taken when using \code{skip = TRUE} as it may affect +the computations for subsequent operations.} + +\item{id}{A unique identifier for the step} +} +\value{ +An updated version of \code{recipe} with the new step added to the +sequence of any existing operations. +} +\description{ +Create new variables by pivotting data +} +\examples{ +1+1 +} diff --git a/tests/testthat/test-step_pivot_wider.R b/tests/testthat/test-step_pivot_wider.R new file mode 100644 index 000000000..a6829c1f5 --- /dev/null +++ b/tests/testthat/test-step_pivot_wider.R @@ -0,0 +1,20 @@ +library(tidyr) +tib <- expand_grid( + tv = 1:4, + gv = letters[1:2], + cl = letters[2:4] +) +tib$val1 <- rnorm(nrow(tib)) +tib$val2 <- rnorm(nrow(tib)) + +recipe(tib) %>% + step_pivot_wider(starts_with(val), + names_from = cl, + values_fill = list(val1 = 0, val2 = 0) + ) %>% + prep() + bake(new_data = NULL) + +test_that("multiplication works", { + expect_equal(2 * 2, 4) +}) From a3ad946748ea67ecd4f5da985cb307779fb61ce9 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Tue, 17 Sep 2024 14:52:28 -0700 Subject: [PATCH 5/8] start testing --- R/step_pivot_wider.R | 1 - tests/testthat/test-step_pivot_wider.R | 40 +++++++++++++++++++------- 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/R/step_pivot_wider.R b/R/step_pivot_wider.R index 2140f7a5a..e03e3bff0 100644 --- a/R/step_pivot_wider.R +++ b/R/step_pivot_wider.R @@ -92,7 +92,6 @@ bake.step_pivot_wider <- function(object, new_data, ...) { hardhat::validate_column_names(new_data, object$edf_id_cols) id_cols <- union(object$user_id_cols, object$edf_id_cols) id_cols <- union(id_cols, key_colnames(new_data)) - browser() if (length(id_cols) == 0L) { pivotted <- tidyr::pivot_wider( new_data, diff --git a/tests/testthat/test-step_pivot_wider.R b/tests/testthat/test-step_pivot_wider.R index a6829c1f5..ce16cc77b 100644 --- a/tests/testthat/test-step_pivot_wider.R +++ b/tests/testthat/test-step_pivot_wider.R @@ -4,17 +4,35 @@ tib <- expand_grid( gv = letters[1:2], cl = letters[2:4] ) -tib$val1 <- rnorm(nrow(tib)) -tib$val2 <- rnorm(nrow(tib)) +tib$val1 <- 1:nrow(tib) + .1 +tib$val2 <- nrow(tib):1 - .1 -recipe(tib) %>% - step_pivot_wider(starts_with(val), - names_from = cl, - values_fill = list(val1 = 0, val2 = 0) - ) %>% - prep() - bake(new_data = NULL) -test_that("multiplication works", { - expect_equal(2 * 2, 4) +test_that("works with recipe, various possible pivots", { + out <- recipe(tib) %>% + step_pivot_wider(val1, names_from = cl) %>% + prep(training = tib) %>% bake(new_data = NULL) + expect_snapshot(out) + + out <- recipe(tib) %>% + step_pivot_wider(val1, names_from = cl, values_fill = 0) %>% + prep(training = tib) %>% bake(new_data = NULL) + expect_snapshot(out) + + out <- recipe(tib) %>% + step_pivot_wider(starts_with("val"), names_from = cl) %>% + prep(training = tib) %>% bake(new_data = NULL) + expect_snapshot(out) + + out <- recipe(tib) %>% + step_pivot_wider(val1, names_from = gv:cl) %>% + prep(training = tib) %>% bake(new_data = NULL) + expect_snapshot(out) + + #fails + out <- recipe(tib) %>% + step_pivot_wider(val1, id_cols = tv:cl, names_from = cl) %>% + prep(training = tib) %>% bake(new_data = NULL) + + edf <- tib %>% as_epi_df() }) From 292d62d3c536c911e59b6f8bc1243a1be4a73cfc Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Tue, 17 Sep 2024 15:16:55 -0700 Subject: [PATCH 6/8] assign earlier --- R/step_pivot_wider.R | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/R/step_pivot_wider.R b/R/step_pivot_wider.R index e03e3bff0..ac68aa267 100644 --- a/R/step_pivot_wider.R +++ b/R/step_pivot_wider.R @@ -28,16 +28,19 @@ step_pivot_wider <- function( arg_is_chr_scalar(role, id) + id_cols <- enquos(id_cols) + names_from <- enquos(names_from) + add_step( recipe, step_pivot_wider_new( terms = enquos(...), role = role, trained = FALSE, - user_id_cols = if (is.null(id_cols)) NULL else enquos(id_cols), + user_id_cols = id_cols, edf_id_cols = key_colnames(recipe), id_expand = id_expand, - names_from = enquos(names_from), + names_from = names_from, values_fill = values_fill, values_fn = values_fn, values_from = NULL, @@ -106,7 +109,7 @@ bake.step_pivot_wider <- function(object, new_data, ...) { } else { pivotted <- tidyr::pivot_wider( new_data, - id_cols = id_cols, + id_cols = unname(id_cols), id_expand = object$id_expand, names_from = unname(object$names_from), values_from = unname(object$values_from), From 11551b257a560ab3b6b702c55911d945a68030c8 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Fri, 4 Oct 2024 11:09:47 -0700 Subject: [PATCH 7/8] refactor --- R/step_pivot_wider.R | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/R/step_pivot_wider.R b/R/step_pivot_wider.R index ac68aa267..03c288440 100644 --- a/R/step_pivot_wider.R +++ b/R/step_pivot_wider.R @@ -73,13 +73,15 @@ step_pivot_wider_new <- function( #' @export prep.step_pivot_wider <- function(x, training, info = NULL, ...) { - checkmate::assert_subset(x$edf_id_cols, key_colnames(training)) + user_id_cols <- recipes_eval_select(x$user_id_cols, training, info) + all_id_cols <- union(user_id_cols, key_colnames(training)) + hardhat::validate_column_names(training, all_id_cols) step_pivot_wider_new( terms = x$terms, role = x$role, trained = TRUE, - user_id_cols = recipes_eval_select(x$user_id_cols, training, info), - edf_id_cols = key_colnames(training), + user_id_cols = user_id_cols, + edf_id_cols = all_id_cols, id_expand = x$id_expand, names_from = recipes_eval_select(x$names_from, training, info), values_fill = x$values_fill, @@ -92,9 +94,9 @@ prep.step_pivot_wider <- function(x, training, info = NULL, ...) { #' @export bake.step_pivot_wider <- function(object, new_data, ...) { - hardhat::validate_column_names(new_data, object$edf_id_cols) - id_cols <- union(object$user_id_cols, object$edf_id_cols) - id_cols <- union(id_cols, key_colnames(new_data)) + hardhat::validate_column_names(new_data, object$all_id_cols) + id_cols <- union(object$all_id_cols, key_colnames(new_data)) + object$edf_id_cols <- id_cols if (length(id_cols) == 0L) { pivotted <- tidyr::pivot_wider( new_data, @@ -131,3 +133,14 @@ bake.step_pivot_wider <- function(object, new_data, ...) { } new_data } + +#' @export +print.step_pivot_wider <- function(x, width = max(20, options()$width - 30), ...) { + print_epi_step(x$values_from, x$terms, x$trained, + title = "Pivotting variables", + conjunction = "by", + extra_text = x$names_from + ) + invisible(x) +} + From 8b1e48297648592746924dcb092c300f60e39f47 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Fri, 4 Oct 2024 12:14:14 -0700 Subject: [PATCH 8/8] examples run, mostly untested --- NAMESPACE | 1 + R/step_pivot_wider.R | 61 ++++++++++++++++++++++++++++---------- man/step_adjust_latency.Rd | 4 +-- man/step_pivot_wider.Rd | 34 +++++++++++++-------- 4 files changed, 69 insertions(+), 31 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 85127359b..2dbdf3178 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -98,6 +98,7 @@ S3method(print,step_epi_slide) S3method(print,step_growth_rate) S3method(print,step_lag_difference) S3method(print,step_naomit) +S3method(print,step_pivot_wider) S3method(print,step_population_scaling) S3method(print,step_training_window) S3method(quantile,dist_quantiles) diff --git a/R/step_pivot_wider.R b/R/step_pivot_wider.R index 03c288440..edadd29db 100644 --- a/R/step_pivot_wider.R +++ b/R/step_pivot_wider.R @@ -2,23 +2,43 @@ #' Create new variables by pivotting data #' +#' This function typically creates new predictors by sharing values across keys. +#' So in the most basic case (see examples below), the values of a signal in +#' one `geo_value` would be used as predictors in all the other locations. +#' #' @inheritParams step_growth_rate -#' @param ... One or more selector functions to choose variables +#' @param ... <[`tidy-select`][tidyr_tidy_select]> One or more selector +#' functions to choose variables #' values to pivot. These are the `values_from` argument for [tidyr::pivot_wider()]. #' See [recipes::selections()] for more details. +#' @param names_from A selector function to choose which column (or columns) to +#' get the name of the output columns from. This is typically `geo_value` +#' (the default), and possibly any additional keys in the training data. +#' @param id_cols <[`tidy-select`][tidyr_tidy_select]> A selector function +#' providing a set of columns that uniquely identifies each observation. +#' The typical use is for this to be `time_value` and any additional keys +#' not selected by `names_from` (this is the default behaviour). #' @inheritParams tidyr::pivot_wider #' #' @template step-return #' @export #' #' @examples -#' 1+1 +#' jhu <- case_death_rate_subset %>% +#' filter(geo_value %in% c("ca", "ny", "pa"), time_value > "2021-12-01") +#' r <- epi_recipe(jhu) +#' +#' r1 <- r %>% step_pivot_wider("death_rate") +#' bake(prep(r1, jhu), new_data = NULL) +#' +#' r2 <- r %>% step_pivot_wider(dplyr::ends_with("rate")) +#' bake(prep(r2, jhu), new_data = NULL) step_pivot_wider <- function( recipe, ..., + names_from = "geo_value", role = "predictor", - names_from, - id_cols = NULL, + id_cols = "time_value", id_expand = FALSE, values_fill = NA, values_fn = NULL, @@ -74,8 +94,13 @@ step_pivot_wider_new <- function( #' @export prep.step_pivot_wider <- function(x, training, info = NULL, ...) { user_id_cols <- recipes_eval_select(x$user_id_cols, training, info) - all_id_cols <- union(user_id_cols, key_colnames(training)) - hardhat::validate_column_names(training, all_id_cols) + hardhat::validate_column_names(training, user_id_cols) + names_from <- recipes_eval_select(x$names_from, training, info) + remaining_ids <- setdiff( + union(user_id_cols, names_from), # keys from user + key_colnames(training) # all edf keys + ) + all_id_cols <- union(user_id_cols, remaining_ids) step_pivot_wider_new( terms = x$terms, role = x$role, @@ -83,7 +108,7 @@ prep.step_pivot_wider <- function(x, training, info = NULL, ...) { user_id_cols = user_id_cols, edf_id_cols = all_id_cols, id_expand = x$id_expand, - names_from = recipes_eval_select(x$names_from, training, info), + names_from = names_from, values_fill = x$values_fill, values_fn = x$values_fn, values_from = recipes_eval_select(x$terms, training, info), @@ -94,15 +119,19 @@ prep.step_pivot_wider <- function(x, training, info = NULL, ...) { #' @export bake.step_pivot_wider <- function(object, new_data, ...) { - hardhat::validate_column_names(new_data, object$all_id_cols) - id_cols <- union(object$all_id_cols, key_colnames(new_data)) - object$edf_id_cols <- id_cols + id_cols <- object$edf_id_cols + names_from <- object$names_from + values_from <- object$values_from + browser() + hardhat::validate_column_names(new_data, id_cols) + hardhat::validate_column_names(new_data, names_from) + hardhat::validate_column_names(new_data, values_from) if (length(id_cols) == 0L) { pivotted <- tidyr::pivot_wider( - new_data, + new_data[, c(names_from, values_from)], id_expand = object$id_expand, - names_from = unname(object$names_from), - values_from = unname(object$values_from), + names_from = unname(names_from), + values_from = unname(values_from), values_fill = object$values_fill, values_fn = object$values_fn, names_repair = "unique" @@ -110,11 +139,11 @@ bake.step_pivot_wider <- function(object, new_data, ...) { joinby <- intersect(names(pivotted), names(new_data)) } else { pivotted <- tidyr::pivot_wider( - new_data, + new_data[, c(id_cols, names_from, values_from)], id_cols = unname(id_cols), id_expand = object$id_expand, - names_from = unname(object$names_from), - values_from = unname(object$values_from), + names_from = unname(names_from), + values_from = unname(values_from), values_fill = object$values_fill, values_fn = object$values_fn, names_repair = "unique" diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd index f0ee41390..678c2d38f 100644 --- a/man/step_adjust_latency.Rd +++ b/man/step_adjust_latency.Rd @@ -260,8 +260,8 @@ while this will not: \if{html}{\out{
}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\% step_epi_lag(a, lag=0) \%>\% step_adjust_latency(a, method = "extend_lags") -#> Warning: If `method` is "extend_lags" or "locf", then the previous `step_epi_lag`s won't work with -#> modified data. +#> Warning: If `method` is "extend_lags" or "locf", then the previous `step_epi_lag`s won't +#> work with modified data. }\if{html}{\out{
}} If you create columns that you then apply lags to (such as diff --git a/man/step_pivot_wider.Rd b/man/step_pivot_wider.Rd index 1d2358e61..668c4f4dd 100644 --- a/man/step_pivot_wider.Rd +++ b/man/step_pivot_wider.Rd @@ -7,9 +7,9 @@ step_pivot_wider( recipe, ..., + names_from = "geo_value", role = "predictor", - names_from, - id_cols = NULL, + id_cols = "time_value", id_expand = FALSE, values_fill = NA, values_fn = NULL, @@ -21,22 +21,22 @@ step_pivot_wider( \item{recipe}{A recipe object. The step will be added to the sequence of operations for this recipe.} -\item{...}{One or more selector functions to choose variables +\item{...}{<\code{\link[=tidyr_tidy_select]{tidy-select}}> One or more selector +functions to choose variables values to pivot. These are the \code{values_from} argument for \code{\link[tidyr:pivot_wider]{tidyr::pivot_wider()}}. See \code{\link[recipes:selections]{recipes::selections()}} for more details.} +\item{names_from}{A selector function to choose which column (or columns) to +get the name of the output columns from. This is typically \code{geo_value} +(the default), and possibly any additional keys in the training data.} + \item{role}{For model terms created by this step, what analysis role should they be assigned? \code{lag} is default a predictor while \code{ahead} is an outcome.} -\item{id_cols}{<\code{\link[tidyr:tidyr_tidy_select]{tidy-select}}> A set of columns that -uniquely identify each observation. Typically used when you have -redundant variables, i.e. variables whose values are perfectly correlated -with existing variables. - -Defaults to all columns in \code{data} except for the columns specified through -\code{names_from} and \code{values_from}. If a tidyselect expression is supplied, it -will be evaluated on \code{data} after removing the columns specified through -\code{names_from} and \code{values_from}.} +\item{id_cols}{<\code{\link[=tidyr_tidy_select]{tidy-select}}> A selector function +providing a set of columns that uniquely identifies each observation. +The typical use is for this to be \code{time_value} and any additional keys +not selected by \code{names_from} (this is the default behaviour).} \item{id_expand}{Should the values in the \code{id_cols} columns be expanded by \code{\link[tidyr:expand]{expand()}} before pivoting? This results in more rows, the output will @@ -76,5 +76,13 @@ sequence of any existing operations. Create new variables by pivotting data } \examples{ -1+1 +jhu <- case_death_rate_subset \%>\% + filter(geo_value \%in\% c("ca", "ny", "pa"), time_value > "2021-12-01") +r <- epi_recipe(jhu) + +r1 <- r \%>\% step_pivot_wider("death_rate") +bake(prep(r1, jhu), new_data = NULL) + +r2 <- r \%>\% step_pivot_wider(dplyr::ends_with("rate")) +bake(prep(r2, jhu), new_data = NULL) }