cmu-delphi
diff --git a/‎.github/workflows/R-CMD-check.yaml
Lines changed: 1 addition & 0 deletions b/‎.github/workflows/R-CMD-check.yaml
Lines changed: 1 addition & 0 deletions
diff --git a/‎.gitignore
Lines changed: 5 additions & 1 deletion b/‎.gitignore
Lines changed: 5 additions & 1 deletion
diff --git a/‎DESCRIPTION
Lines changed: 1 addition & 1 deletion b/‎DESCRIPTION
Lines changed: 1 addition & 1 deletion
diff --git a/‎NAMESPACE
Lines changed: 12 additions & 2 deletions b/‎NAMESPACE
Lines changed: 12 additions & 2 deletions
diff --git a/‎R/ensemble_average.R
Lines changed: 40 additions & 0 deletions b/‎R/ensemble_average.R
Lines changed: 40 additions & 0 deletions
diff --git a/‎R/epieval-package.R
Lines changed: 1 addition & 1 deletion b/‎R/epieval-package.R
Lines changed: 1 addition & 1 deletion
diff --git a/‎R/targets_utils.R
Lines changed: 80 additions & 6 deletions b/‎R/targets_utils.R
Lines changed: 80 additions & 6 deletions
diff --git a/‎R/utils.R
Lines changed: 76 additions & 8 deletions b/‎R/utils.R
Lines changed: 76 additions & 8 deletions
diff --git a/‎app.R
Lines changed: 5 additions & 1 deletion b/‎app.R
Lines changed: 5 additions & 1 deletion
@@ -1,6 +1,7 @@
 # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 on:
+  workflow_dispatch:
   push:
     branches: [main]
   pull_request:
 
@@ -4,4 +4,8 @@
 tmp/
 extras/**.html
 *.pdf
-.Renviron
+.Renviron
+.renvignore
+nohup.out
+run.Rout
+tmp.R
@@ -1,6 +1,6 @@
 Package: epieval
 Title: Evaluating Timeseries Forecasting on Archival Data
-Version: 0.1.0
+Version: 0.2.0
 Date: 2023-09-28
 Authors@R:
     c(
 
@@ -8,20 +8,25 @@ export(clear_lastminute_nas)
 export(collapse_cards)
 export(confirm_sufficient_data)
 export(covidhub_probs)
+export(ensemble_average)
+export(ensemble_missing_forecasters)
+export(ensemble_missing_forecasters_details)
 export(evaluate_predictions)
 export(extend_ahead)
 export(flatline_fc)
+export(forecaster_lookup)
 export(forecaster_pred)
 export(format_storage)
 export(id_ahead_ensemble_grid)
 export(interval_coverage)
-export(lookup_ids)
 export(make_data_targets)
-export(make_ensemble_targets)
+export(make_ensemble_targets_and_scores)
 export(make_external_names_and_scores)
 export(make_forecasts_and_scores)
 export(make_forecasts_and_scores_by_ahead)
+export(make_shared_ensembles)
 export(make_shared_grids)
+export(make_target_ensemble_grid)
 export(make_target_param_grid)
 export(manage_S3_forecast_cache)
 export(overprediction)
@@ -83,16 +88,21 @@ importFrom(epiprocess,epix_slide)
 importFrom(here,here)
 importFrom(magrittr,"%<>%")
 importFrom(magrittr,"%>%")
+importFrom(purrr,imap)
 importFrom(purrr,map)
 importFrom(purrr,map2_vec)
+importFrom(purrr,map_vec)
 importFrom(purrr,transpose)
 importFrom(recipes,all_numeric)
 importFrom(rlang,"!!")
+importFrom(rlang,"%||%")
 importFrom(rlang,.data)
 importFrom(rlang,quo)
 importFrom(rlang,sym)
 importFrom(rlang,syms)
+importFrom(targets,tar_config_get)
 importFrom(targets,tar_group)
+importFrom(targets,tar_read)
 importFrom(targets,tar_target)
 importFrom(tibble,tibble)
 importFrom(tidyr,drop_na)
 
@@ -0,0 +1,40 @@
+#' an ensemble model that averages each quantile separately
+#' @description
+#' The simplest class of ensembling models, it takes in a list of quantile
+#'   forecasts and averages them on a per-quantile basis. By default the average
+#'   used is the median, but it can accept any function that summarizes a vector
+#'   of values.
+#' @param epi_data unused for this forecaster, but potentially an ensemble may
+#'   want the underlying data.
+#' @param forecasts a list of quantile forecasts to aggregate. They should
+#'   be tibbles with columns `(geo_value, forecast_date, target_end_date,
+#'   quantile, value)`, preferably in that order.
+#' @param outcome The name of the target variable. Unused by this ensemble.
+#' @param extra_sources The name of any extra columns to use. This list could be
+#'   empty. Unused by this ensemble.
+#' @param ensemble_args any arguments unique to this particular ensembler should
+#'   be included in a list like this (unfortunate targets issues). The arguments
+#'   for `ensemble_average` in particular are `average_type` (default `median`)
+#'   and `join_columns` (default `geo_value`, `forecast_date`,
+#'   `target_end_date` and `quantile`).
+#' @param ensemble_args_names an argument purely for use in targets. You
+#'   probably shouldn't worry about it. In a target, it should probably be
+#'   `ensemble_args_names = names(ensemble_args)`
+#' @return an ungrouped tibble with one row per combination of `join_columns`
+#'   and a `value` column holding `average_type` applied to that group's values.
+#' @importFrom rlang %||%
+#' @export
+ensemble_average <- function(epi_data,
+                             forecasts,
+                             outcome,
+                             extra_sources = "",
+                             ensemble_args = list(),
+                             ensemble_args_names = NULL) {
+  # unique parameters must be buried in ensemble_args so that the generic function signature is stable
+  # their names are separated for obscure target related reasons
+  if (!is.null(ensemble_args_names)) {
+    names(ensemble_args) <- ensemble_args_names
+  }
+  average_type <- ensemble_args$average_type %||% median
+  join_columns <- ensemble_args$join_columns %||% c("geo_value", "forecast_date", "target_end_date", "quantile")
+  # begin actual analysis
+  # stack every forecast, then collapse each join_columns group to one value;
+  # `.groups = "drop"` states the regrouping explicitly and returns an
+  # ungrouped tibble without needing a trailing ungroup()
+  bind_rows(!!!forecasts, .id = "forecaster") %>%
+    group_by(across(all_of(join_columns))) %>%
+    summarize(value = average_type(value), .groups = "drop")
+}
@@ -1,7 +1,7 @@
 #' @importFrom magrittr %>% %<>%
 #' @importFrom dplyr select rename inner_join join_by mutate relocate any_of
 #'    group_by reframe summarize left_join across filter rowwise everything ungroup
-#' @importFrom purrr transpose map map2_vec
+#' @importFrom purrr transpose map map2_vec map_vec imap
 #' @keywords internal
 "_PACKAGE"
 globalVariables(c("ahead", "id", "parent_id", "all_of", "last_col", "time_value", "geo_value", "target_end_date", "forecast_date", "quantile", ".pred_distn", "quantiles", "quantile_levels", "signal", ".dstn", "values", ".", "forecasters", "forecaster", "trainer", "forecast_date", ".pred", "n_distinct", "target_date", "value"))
@@ -6,10 +6,11 @@
 #' @export
 #' @importFrom rlang syms
 make_target_param_grid <- function(param_grid) {
+  not_na <- !is.na(param_grid$trainer)
+  param_grid$trainer[not_na] <- syms(param_grid$trainer[not_na])
   param_grid %<>%
     select(-any_of("parent_id")) %>%
-    mutate(forecaster = syms(forecaster)) %>%
-    mutate(trainer = syms(trainer))
+    mutate(forecaster = syms(forecaster))
   list_of_params <- lists_of_real_values(param_grid)
   list_names <- map(list_of_params, names)
   tibble(
@@ -19,6 +20,30 @@ make_target_param_grid <- function(param_grid) {
     param_names = list_names
   )
 }
+#' convert a tibble of ensembles into the format targets needs
+#' @description
+#' the required format for targets is a little jank; this takes a human legible tibble of ensembles and makes it targets legible.
+#' The `ensemble` name and the prefixed `forecaster_ids` become symbols, as do the `ensemble_params` entries that `sym_subset()` converts.
+#' @param param_grid the tibble of ensembles. The columns read here are `ensemble`, `ensemble_params`, `forecasters`, `forecaster_ids` and `id`.
+#' @param ONE_AHEAD_FORECASTER_NAME the prefix pasted (with a `_`) in front of every entry of `forecaster_ids`
+#' @return `param_grid` with `forecasters` dropped, `id` moved to the first column and `ensemble_params_names` added
+#' @export
+#' @importFrom rlang syms
+make_target_ensemble_grid <- function(param_grid, ONE_AHEAD_FORECASTER_NAME = "forecast_by_ahead") {
+  # swap the string-valued parameters that name functions for symbols
+  param_grid$ensemble_params <- map(param_grid$ensemble_params, sym_subset)
+  # NOTE(review): the list(...) wrappers below are only per-row when
+  # `param_grid` is rowwise -- presumably it is the rowwise output of
+  # `id_ahead_ensemble_grid()`; confirm against the caller.
+  targets_grid <- param_grid %>%
+    mutate(
+      ensemble = syms(ensemble),
+      ensemble_params_names = list(names(ensemble_params))
+    ) %>%
+    select(-forecasters) %>%
+    relocate(id, .before = everything()) %>%
+    mutate(
+      forecaster_ids = list(syms(paste(ONE_AHEAD_FORECASTER_NAME, forecaster_ids, sep = "_")))
+    )
+  targets_grid
+}
+#' turn selected entries of a parameter list into symbols
+#' @keywords internal
+#' @param param_list a list of parameters; entries are matched by name
+#' @param sym_names a list of the parameter names that should be turned into symbols
+sym_subset <- function(param_list, sym_names = list("average_type")) {
+  # entries whose name is listed in `sym_names` become symbols; all others pass through untouched
+  symbolize_if_listed <- function(value, name) {
+    if (name %in% sym_names) {
+      return(sym(value))
+    }
+    value
+  }
+  imap(param_list, symbolize_if_listed)
+}
 
 #' helper function for `make_target_param_grid`
 #' @keywords internal
@@ -150,7 +175,7 @@ make_data_targets <- function() {
   )
 }
 
-#' Make common targets for forecasting experiments
+#' Make list of common forecasters for forecasting experiments across projects
 #' @export
 make_shared_grids <- function() {
   list(
@@ -163,12 +188,44 @@ make_shared_grids <- function() {
     tidyr::expand_grid(
       forecaster = "scaled_pop",
       trainer = c("linreg", "quantreg"),
-      ahead = 5:7,
+      ahead = 1:7,
       lags = list(c(0, 3, 5, 7, 14), c(0, 7, 14)),
       pop_scaling = c(FALSE)
+    ),
+    tidyr::expand_grid(
+      forecaster = "flatline_fc",
+      ahead = 1:7
     )
   )
 }
+#' Make list of common ensembles for forecasting experiments across projects
+#'
+#' @return a tibble with one row per ensemble and the columns `ensemble`,
+#'   `ensemble_params` and `forecasters`
+#' @export
+make_shared_ensembles <- function() {
+  scaled_pop_linreg <- list(
+    forecaster = "scaled_pop",
+    trainer = "linreg",
+    pop_scaling = FALSE,
+    lags = c(0, 3, 5, 7, 14)
+  )
+  # both ensembles below combine the same two sub-forecasters
+  component_forecasters <- list(
+    scaled_pop_linreg,
+    list(forecaster = "flatline_fc")
+  )
+  # ensembles don't lend themselves to expand grid (inherently needs a list for sub-forecasters)
+  tribble(
+    ~ensemble, ~ensemble_params, ~forecasters,
+    # mean forecaster
+    "ensemble_average", list(average_type = "mean"), component_forecasters,
+    # median forecaster
+    "ensemble_average", list(average_type = "median"), component_forecasters
+  )
+}
 
 #' Make forecasts and scores by ahead targets
 #' @description
@@ -238,8 +295,25 @@ make_forecasts_and_scores <- function() {
 
 #' Make ensemble targets
 #' @export
-make_ensemble_targets <- function() {
-  list()
+make_ensemble_targets_and_scores <- function() {
+  # Maps two target definitions over the rows of `ensemble_parent_id_map`:
+  # `ensemble` row-binds the component forecasts and `ensemble_score` row-binds
+  # the component scores, each tagged with a `parent_ensemble` column.
+  # NOTE(review): `ensemble_parent_id_map`, `ensemble_component_ids` and
+  # `score_component_ids` are not defined in this function; presumably the
+  # first comes from the calling targets script and the other two are columns
+  # of it substituted by `tar_map()` -- confirm.
+  # NOTE(review): `names = parent_id` presumably selects the column used to
+  # name the generated targets -- confirm against the `tar_map()` docs.
+  # The assignment is the last expression, so the `tar_map()` result is also
+  # this function's (invisible) return value.
+  ensembles_and_scores <- tar_map(
+    values = ensemble_parent_id_map,
+    names = parent_id,
+    tar_target(
+      name = ensemble,
+      command = {
+        bind_rows(ensemble_component_ids) %>%
+          mutate(parent_ensemble = parent_id)
+      }
+    ),
+    tar_target(
+      name = ensemble_score,
+      command = {
+        bind_rows(score_component_ids) %>%
+          mutate(parent_ensemble = parent_id)
+      }
+    )
+  )
+}
 
 
 
@@ -36,6 +36,75 @@ add_id <- function(df, n_adj = 2) {
   return(df)
 }
 
+#' look up forecasters by name
+#' @description
+#' given a (partial) forecaster name, look up all forecasters in the given project whose id contains that name as a literal substring.
+#' @param forecaster_name a part of the adj.adj.1 name used to identify the forecaster. Anything up to and including the last underscore is dropped before matching.
+#' @param param_grid the tibble of forecaster parameters to search; it must have an `id` column. By default the `forecaster_params_grid` target is read from `project`.
+#' @param project the targets store to be used; by default, the one reported by `tar_config_get("store")` is used
+#' @return the rows of `param_grid` whose `id` contains `forecaster_name`
+#' @importFrom targets tar_read tar_config_get
+#' @export
+forecaster_lookup <- function(forecaster_name, param_grid = NULL, project = NULL) {
+  forecaster_name <- strip_underscored(forecaster_name)
+  if (is.null(project)) {
+    project <- tar_config_get("store")
+  }
+  if (is.null(param_grid)) {
+    param_grid <- tar_read(forecaster_params_grid, store = project)
+  }
+  # the adj.adj.1 ids are "."-separated and "." is a regex wildcard, so match
+  # the name literally rather than as a regular expression
+  param_grid %>% filter(grepl(forecaster_name, id, fixed = TRUE))
+}
+
+# Keep only the part of a name that follows its last underscore.
+# `gregexpr()` reports -1 when there is no underscore, which makes the start
+# position 0 and so leaves the name unchanged.
+strip_underscored <- function(x) {
+  underscore_positions <- gregexpr("_", x, fixed = TRUE)[[1]]
+  final_position <- underscore_positions[[length(underscore_positions)]]
+  first_name <- x[[1]]
+  substr(first_name, start = final_position + 1, stop = nchar(x))
+}
+
+#' list forecasters used in the given ensemble table not found in the given forecaster grid
+#' @description
+#' list forecasters used in the given ensemble table not found in the given forecaster grid
+#'
+#' @param ensemble_grid the grid of ensembles used; by default the `ensemble_forecasters` target is read from `project`
+#' @param param_grid the grid of forecasters used that we're checking for presence; by default the `forecaster_params_grid` target is read from `project`
+#' @param project the targets store to be used; by default, the one reported by `tar_config_get("store")` is used
+#' @return the unique entries of `ensemble_grid$forecaster_ids` for which `forecaster_lookup()` finds no row
+#' @export
+ensemble_missing_forecasters <- function(ensemble_grid = NULL, param_grid = NULL, project = NULL) {
+  if (is.null(project)) {
+    project <- tar_config_get("store")
+  }
+  if (is.null(ensemble_grid)) {
+    ensemble_grid <- tar_read(ensemble_forecasters, store = project)
+  }
+  used_forecasters <- unlist(ensemble_grid$forecaster_ids) %>% unique()
+  # read the forecaster grid once here instead of once per lookup below; skip
+  # the read entirely when there is nothing to look up
+  if (is.null(param_grid) && length(used_forecasters) > 0) {
+    param_grid <- tar_read(forecaster_params_grid, store = project)
+  }
+  is_present <- map_vec(used_forecasters, \(given_forecaster) nrow(forecaster_lookup(given_forecaster, param_grid, project)) > 0)
+  used_forecasters[!is_present]
+}
+
+#' list the parameters of the forecasters that ensembles use but the forecaster grid lacks
+#' @description
+#' for every forecaster reported by `ensemble_missing_forecasters()`, collect
+#'   its parameter list from the `forecasters` column of the ensemble grid
+#'   together with its id, de-duplicated across ensembles.
+#' @inheritParams ensemble_missing_forecasters
+#' @return a list with one entry per distinct missing forecaster, each holding
+#'   that forecaster's parameters and its `id`
+#' @export
+ensemble_missing_forecasters_details <- function(ensemble_grid = NULL, param_grid = NULL, project = NULL) {
+  # the grid is piped into rowwise() below, so a NULL default has to be
+  # resolved here too and not only inside ensemble_missing_forecasters()
+  if (is.null(project)) {
+    project <- tar_config_get("store")
+  }
+  if (is.null(ensemble_grid)) {
+    ensemble_grid <- tar_read(ensemble_forecasters, store = project)
+  }
+  absent_forecasters <- ensemble_missing_forecasters(ensemble_grid, param_grid, project)
+  grid_with_missing <- ensemble_grid %>%
+    rowwise() %>%
+    mutate(
+      missing_forecasters = list(map(
+        absent_forecasters,
+        # extract a list of the subforecasters with associated id, with only the missing ones having non-empty lists
+        function(absent_fc) {
+          # ids are "."-separated, so match literally rather than as a regex
+          is_missing <- grepl(absent_fc, forecaster_ids, fixed = TRUE)
+          params_only <- forecasters[is_missing]
+          # NOTE(review): mapply() simplifies its result by default, so a
+          # match may come back as a list-matrix rather than a plain list --
+          # confirm that is the intended shape.
+          mapply(c, params_only, id = forecaster_ids[is_missing])
+        }
+      ))
+    )
+  # one level of unlisting gives one entry per (ensemble, absent forecaster) pair
+  flat_missing <- unlist(grid_with_missing$missing_forecasters, recursive = FALSE)
+  unique_missing <- flat_missing[map_vec(flat_missing, \(x) length(x) > 0)] %>% unique()
+  return(unique_missing)
+}
+
 
 #' generate an id from a simple list of parameters
 #' @param param_list the list of parameters. must include `ahead` if `ahead = NULL`
@@ -45,6 +114,7 @@ add_id <- function(df, n_adj = 2) {
 single_id <- function(param_list, ahead = NULL, n_adj = 2) {
   full_hash <- param_list[names(param_list) != "ahead"] %>%
     .[order(names(.))] %>% # put in alphabetical order
+    lapply(function(x) if (length(x) > 1) list(x) else x) %>% # the tibble version needs vectors to actually be lists, so this is a conversion to make sure the strings are identical
     paste(collapse = "") %>%
     hash_animal(n_adj = n_adj)
   single_string <- full_hash$words[[1]][1:n_adj] %>% paste(sep = ".", collapse = ".")
@@ -56,16 +126,11 @@ single_id <- function(param_list, ahead = NULL, n_adj = 2) {
   return(full_name)
 }
 
-
-#' given target name(s), lookup the corresponding parameters
-#' @export
-lookup_ids <- function() {
-}
-
-
 #' add aheads, forecaster_ids, and ids to a list of ensemble models
 #' @description
-#' minor utility
+#' First, do an expand grid to do a full combination of ensemble_grid x aheads.
+#'   Then add a column containing lists of ids of the dependent forecasters
+#'   based on their parameters.
 #' @param ensemble_grid the list of ensembles,
 #' @param aheads the aheads to add
 #' @inheritParams add_id
@@ -82,6 +147,9 @@ id_ahead_ensemble_grid <- function(ensemble_grid, aheads, n_adj = 2) {
     add_id(., n_adj = 2) %>%
     rowwise() %>%
     mutate(forecaster_ids = list(map2_vec(forecasters, ahead, single_id, n_adj = 2)))
+  if (length(ensemble_grid$id %>% unique()) < length(ensemble_grid$id)) {
+    abort("ensemble grid has non-unique forecasters")
+  }
   return(ensemble_grid)
 }
 
 
@@ -136,7 +136,11 @@ shinyApp(
   server = function(input, output, session) {
     filtered_scorecards_reactive <- reactive({
       agg_forecasters <- unique(c(input$selected_forecasters, input$baseline))
-      if (length(agg_forecasters) == 0) { return(data.frame()) }
+      if (length(agg_forecasters) == 0 ||
+          all(agg_forecasters == "" | is.null(agg_forecasters) | is.na(agg_forecasters))
+      ) {
+        return(data.frame())
+      }
 
       processed_evaluations_internal <- lapply(agg_forecasters, function(forecaster) {
           load_forecast_data(forecaster) %>>%