Ds/season summary #197


Merged
merged 64 commits on May 10, 2025

Commits
55ddeee
initial decreasing forecasters rmd
dsweber2 Apr 10, 2025
3e57aec
geo_pooled
dsweber2 Apr 10, 2025
f3bb95d
the problem is quantile regression
dsweber2 Apr 11, 2025
9e126b9
double population scaling, fixing made it worse
dsweber2 Apr 11, 2025
8afc79f
linear parenthetical, boost trees are fine
dsweber2 Apr 11, 2025
2e52758
moving things around, coefficient inspection
dsweber2 Apr 11, 2025
9cc9f39
format and add as_of plots to fanplots
dshemetov Apr 11, 2025
78387df
exploring the fit data in detail
dsweber2 Apr 11, 2025
05e4dc5
also doing this for covid
dsweber2 Apr 11, 2025
19960bc
reorg
dshemetov Apr 12, 2025
4c85eff
tweaks
dshemetov Apr 12, 2025
e6f2b07
yet more tweaks
dsweber2 Apr 14, 2025
adc5bc3
clearer read-through for others
dsweber2 Apr 14, 2025
2057e79
tests borked b/c curl?
dsweber2 Apr 14, 2025
6d767b7
filter pre 2022, test dependencies
dsweber2 Apr 14, 2025
a2dbebf
enh: add forecasting on diffs (ARI rather than AR)
dshemetov Apr 14, 2025
8b04c04
doc: add some comments
dshemetov Apr 14, 2025
54850b7
growth_rate filtering
dsweber2 Apr 14, 2025
24d03b1
enh: try seasonal windowing on diffs forecaster
dshemetov Apr 14, 2025
7307734
enh: do diffs forecast on flusion data
dshemetov Apr 15, 2025
7688a20
minor fixes
dsweber2 Apr 15, 2025
5c40e73
fix+enh: minor fixes and add 0 intercept to slope calculation
dshemetov Apr 16, 2025
572939e
wip: start season summary
dshemetov Apr 16, 2025
e542fc8
Getting backtest_mode working
dsweber2 Apr 16, 2025
0f4a119
revision summary notebook
dsweber2 Apr 16, 2025
ddab61c
Basic revision summary complete
dsweber2 Apr 18, 2025
3c6de3b
scores mix all forecasters, first covid day problems
dsweber2 Apr 22, 2025
b06f23a
phase definitions and scores
dsweber2 Apr 22, 2025
4967de7
fix: covid generation dates
dshemetov Apr 23, 2025
ddd1fdb
external scores updating, score only shared dates
dsweber2 Apr 25, 2025
80ab82e
hotfix: april 9 data tweaks
dshemetov Apr 15, 2025
f79c22b
doc: make run.R more correct about env vars
dshemetov Apr 15, 2025
69fd4cc
toc, various minor notes
dsweber2 Apr 28, 2025
0c11070
Merge branch 'main' into ds/season-summary
dshemetov Apr 28, 2025
cd70e80
including forecasts, more text
dsweber2 Apr 28, 2025
110c2f5
include first_day_wrong, covid forecasts
dsweber2 Apr 28, 2025
49f8921
order via factor, covid fcsts, ggplotly
dsweber2 Apr 29, 2025
c2ef046
doc: season summary lint and covid updates
dshemetov Apr 29, 2025
48c3b69
doc: first day wrong lints
dshemetov Apr 29, 2025
def1448
doc: big update to template.md, describe our forecaster families
dshemetov Apr 29, 2025
5b8da7c
doc: add some styling to template.md
dshemetov Apr 29, 2025
2455449
doc: minor template lint
dshemetov Apr 29, 2025
bbd73dd
doc: more styling
dshemetov Apr 29, 2025
6eef6a1
doc: even more
dshemetov Apr 29, 2025
71beaf0
doc: lint revision summary
dshemetov Apr 29, 2025
02668e1
latest forecast
dsweber2 Apr 29, 2025
f6822d0
latest fcst needs -1 ahead not present
dsweber2 Apr 30, 2025
ffd1c9a
adding latest to flu
dsweber2 Apr 30, 2025
93d9188
`latest` results, takeaways
dsweber2 Apr 30, 2025
89ffe08
doc+lint: add future work section, caveat diffed tests, improve forec…
dshemetov May 8, 2025
dc917a2
wip: exploration summary
dshemetov May 8, 2025
7d7ab7b
Merge branch 'main' into ds/season-summary
dshemetov May 8, 2025
3799554
doc: template
dshemetov May 8, 2025
0d9b6bd
fix: remove debug from overall notebook
dshemetov May 8, 2025
dae68be
doc: exploration summary
dshemetov May 8, 2025
c961f8d
repo: renv
dshemetov May 8, 2025
fa6bbab
lint: remove priority target args, as they're deprecated
dshemetov May 8, 2025
e065f8d
repo: renv
dshemetov May 8, 2025
3386797
fix: a very complicated way to fix a bug
dshemetov May 9, 2025
2c0768b
lint: minor idiom tweak
dshemetov May 9, 2025
bf55134
feat: simplify daily_to_weekly_archive
dshemetov May 10, 2025
8ba796e
enh: add summary reports to makefile
dshemetov May 10, 2025
e9c5f3b
fix: css
dshemetov May 10, 2025
a8b70b0
f
dshemetov May 10, 2025
15 changes: 9 additions & 6 deletions Makefile
@@ -11,6 +11,12 @@ test:
run:
Rscript scripts/run.R

run-nohup:
nohup Rscript scripts/run.R &

run-nohup-restarting:
scripts/hardRestarting.sh &

prod-covid:
export TAR_RUN_PROJECT=covid_hosp_prod; Rscript scripts/run.R

@@ -65,12 +71,6 @@ get-nwss:
python nwss_covid_export.py; \
python nwss_influenza_export.py

run-nohup:
nohup Rscript scripts/run.R &

run-nohup-restarting:
scripts/hardRestarting.sh &

sync:
Rscript -e "source('R/sync_aws.R'); sync_aws()"

@@ -98,3 +98,6 @@ get-flu-prod-errors:

get-covid-prod-errors:
Rscript -e "suppressPackageStartupMessages(source(here::here('R', 'load_all.R'))); get_targets_errors(project = 'covid_hosp_prod')"

summary_reports:
Rscript scripts/summary_reports.R
71 changes: 15 additions & 56 deletions R/aux_data_utils.R
@@ -213,13 +213,15 @@ daily_to_weekly <- function(epi_df, agg_method = c("sum", "mean"), keys = "geo_v
#' @param epi_arch the archive to aggregate.
#' @param agg_columns the columns to aggregate.
#' @param agg_method the method to use to aggregate the data, one of "sum" or "mean".
#' @param day_of_week the day of the week to use as the reference day.
#' @param day_of_week_end the day of the week to use as the end of the week.
#' @param week_reference the day of the week to use as the reference day (Wednesday is default).
#'   This is 1-indexed in the base R `wday()` convention, so 1 = Sunday, 2 = Monday, ..., 7 = Saturday.
#' @param week_start the day of the week to use as the start of the week (Sunday is default).
#'   This is passed straight to lubridate, whose convention is 1 = Monday, 2 = Tuesday, ..., 7 = Sunday.
daily_to_weekly_archive <- function(epi_arch,
agg_columns,
agg_method = c("sum", "mean"),
day_of_week = 4L,
day_of_week_end = 7L) {
week_reference = 4L,
week_start = 7L) {
# How to aggregate the windowed data.
agg_method <- arg_match(agg_method)
# The columns we will later group by when aggregating.
@@ -230,67 +232,24 @@ daily_to_weekly_archive <- function(epi_arch,
sort()
# Choose a fast function to use to slide and aggregate.
if (agg_method == "sum") {
slide_fun <- epi_slide_sum
# If the week is complete, this is equivalent to the sum. If the week is not
# complete, this is equivalent to 7/(number of days in the week) * the sum,
# which should be a decent approximation.
agg_fun <- \(x) 7 * mean(x, na.rm = TRUE)
} else if (agg_method == "mean") {
slide_fun <- epi_slide_mean
agg_fun <- \(x) mean(x, na.rm = TRUE)
}
# Slide over the versions and aggregate.
epix_slide(
epi_arch,
.versions = ref_time_values,
function(x, group_keys, ref_time) {
# The last day of the week we will slide over.
ref_time_last_week_end <- floor_date(ref_time, "week", day_of_week_end - 1)

# To find the days we will slide over, we need to find the first and last
# complete weeks of data. Get the max and min times, and then find the
# first and last complete weeks of data.
min_time <- min(x$time_value)
max_time <- max(x$time_value)

# Let's determine if the min and max times are in the same week.
ceil_min_time <- ceiling_date(min_time, "week", week_start = day_of_week_end - 1)
ceil_max_time <- ceiling_date(max_time, "week", week_start = day_of_week_end - 1)

# If they're not in the same week, this means we have at least one
# complete week of data to slide over.
if (ceil_min_time < ceil_max_time) {
valid_slide_days <- seq.Date(
from = ceiling_date(min_time, "week", week_start = day_of_week_end - 1),
to = floor_date(max_time, "week", week_start = day_of_week_end - 1),
by = 7L
)
} else {
# This is the degenerate case, where we have about 1 week or less of
# data. In this case, we opt to return nothing for two reasons:
# 1. in most cases here, the data is incomplete for a single week,
# 2. if the data is complete, a single week of data is not enough to
# reasonably perform any kind of aggregation.
return(tibble())
}

# If the last day of the week is not the end of the week, add it to the
# list of valid slide days (this will produce an incomplete slide, but
# that's fine for us, since it should only be 1 day, historically.)
if (wday(max_time) != day_of_week_end) {
valid_slide_days <- c(valid_slide_days, max_time)
}

# Slide over the days and aggregate.
x %>%
group_by(across(all_of(keys))) %>%
slide_fun(
agg_columns,
.window_size = 7L,
na.rm = TRUE,
.ref_time_values = valid_slide_days
) %>%
select(-all_of(agg_columns)) %>%
rename_with(~ gsub("slide_value_", "", .x)) %>%
rename_with(~ gsub("_7dsum", "", .x)) %>%
# Round all dates to reference day of the week. These will get
# de-duplicated by compactify in as_epi_archive below.
mutate(time_value = round_date(time_value, "week", day_of_week - 1)) %>%
mutate(week_start = ceiling_date(time_value, "week", week_start = week_start) - 1) %>%
summarize(across(all_of(agg_columns), agg_fun), .by = all_of(c(keys, "week_start"))) %>%
mutate(time_value = round_date(week_start, "week", week_reference - 1)) %>%
select(-week_start) %>%
as_tibble()
}
) %>%
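The new `agg_fun` in `daily_to_weekly_archive` leans on the identity explained in the comment above: `7 * mean(x, na.rm = TRUE)` equals the sum on a complete week and scales a partial week up to a full-week estimate. A minimal sketch with hypothetical counts (not from the PR):

```r
# Toy illustration of the 7 * mean() weekly aggregation used above.
full_week <- c(10, 12, 9, 11, 13, 8, 7)
partial_week <- c(10, 12, 9, NA, NA, NA, NA) # only 3 of 7 days reported

agg_fun <- \(x) 7 * mean(x, na.rm = TRUE)

agg_fun(full_week) # 70, identical to sum(full_week)
agg_fun(partial_week) # 7 * (31 / 3), the partial sum scaled to a full week
```

Note the `\(x)` lambda shorthand requires R >= 4.1, which this repo already uses elsewhere in the diff.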
17 changes: 0 additions & 17 deletions R/forecasters/data_validation.R
@@ -68,9 +68,6 @@ confirm_sufficient_data <- function(epi_data, ahead, args_input, outcome, extra_
# TODO: Buffer should probably be 2 * n(lags) * n(predictors). But honestly,
# this needs to be fixed in epipredict itself, see
# https://github.com/cmu-delphi/epipredict/issues/106.
if (identical(extra_sources, "")) {
extra_sources <- character(0L)
}
has_no_last_nas <- epi_data %>%
drop_na(c(!!outcome, !!!extra_sources)) %>%
group_by(geo_value) %>%
@@ -106,17 +103,3 @@ filter_minus_one_ahead <- function(epi_data, ahead) {
}
epi_data
}

#' Unwrap an argument if it's a list of length 1
#'
#' Many of our arguments to the forecasters come as lists not because we expect
#' them that way, but as a byproduct of tibble and expand_grid.
unwrap_argument <- function(arg, default_trigger = "", default = character(0L)) {
if (is.list(arg) && length(arg) == 1) {
arg <- arg[[1]]
}
if (identical(arg, default_trigger)) {
return(default)
}
return(arg)
}
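For context on why the removed `unwrap_argument` existed at all, and why the forecasters now call `unlist(extra_sources)` instead: parameter grids built with tibble/expand_grid store non-scalar arguments as list-columns, so a character vector reaches the forecaster wrapped in a length-1 list. A hedged sketch with made-up source names:

```r
library(tidyr)

# Non-scalar arguments must be list-columns in a parameter grid.
params <- expand_grid(
  ahead = c(7, 14),
  extra_sources = list(c("nssp", "nwss")) # hypothetical source names
)

params$extra_sources[[1]] # still wrapped per row: c("nssp", "nwss")
unlist(params$extra_sources[1]) # flattened back to a plain character vector
```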
2 changes: 1 addition & 1 deletion R/forecasters/ensemble_average.R
@@ -25,7 +25,7 @@
ensemble_average <- function(epi_data,
forecasts,
outcome,
extra_sources = "",
extra_sources = character(),
ensemble_args = list(),
ensemble_args_names = NULL) {
# unique parameters must be buried in ensemble_args so that the generic function signature is stable
5 changes: 2 additions & 3 deletions R/forecasters/forecaster_climatological.R
@@ -2,7 +2,7 @@
#'
climate_linear_ensembled <- function(epi_data,
outcome,
extra_sources = "",
extra_sources = character(),
ahead = 7,
trainer = parsnip::linear_reg(),
quantile_levels = covidhub_probs(),
@@ -22,8 +22,7 @@ climate_linear_ensembled <- function(epi_data,
nonlin_method <- arg_match(nonlin_method)

epi_data <- validate_epi_data(epi_data)
extra_sources <- unwrap_argument(extra_sources)
trainer <- unwrap_argument(trainer)
extra_sources <- unlist(extra_sources)

args_list <- list(...)
ahead <- as.integer(ahead / 7)
5 changes: 2 additions & 3 deletions R/forecasters/forecaster_flatline.R
@@ -10,16 +10,15 @@
#' @export
flatline_fc <- function(epi_data,
outcome,
extra_sources = "",
extra_sources = character(),
ahead = 1,
trainer = parsnip::linear_reg(),
quantile_levels = covidhub_probs(),
filter_source = "",
filter_agg_level = "",
...) {
epi_data <- validate_epi_data(epi_data)
extra_sources <- unwrap_argument(extra_sources)
trainer <- unwrap_argument(trainer)
extra_sources <- unlist(extra_sources)

# perform any preprocessing not supported by epipredict
epi_data %<>% filter_extraneous(filter_source, filter_agg_level)
5 changes: 2 additions & 3 deletions R/forecasters/forecaster_flusion.R
@@ -1,6 +1,6 @@
flusion <- function(epi_data,
outcome,
extra_sources = "",
extra_sources = character(),
ahead = 7,
pop_scaling = FALSE,
trainer = rand_forest(
@@ -24,8 +24,7 @@ flusion <- function(epi_data,
derivative_estimator <- arg_match(derivative_estimator)

epi_data <- validate_epi_data(epi_data)
extra_sources <- unwrap_argument(extra_sources)
trainer <- unwrap_argument(trainer)
extra_sources <- unlist(extra_sources)

# perform any preprocessing not supported by epipredict
args_input <- list(...)
5 changes: 2 additions & 3 deletions R/forecasters/forecaster_no_recent_outcome.R
@@ -2,7 +2,7 @@
#' it may whiten any old data as the outcome
no_recent_outcome <- function(epi_data,
outcome,
extra_sources = "",
extra_sources = character(),
ahead = 7,
pop_scaling = FALSE,
trainer = epipredict::quantile_reg(),
@@ -24,8 +24,7 @@ no_recent_outcome <- function(epi_data,
week_method <- arg_match(week_method)

epi_data <- validate_epi_data(epi_data)
extra_sources <- unwrap_argument(extra_sources)
trainer <- unwrap_argument(trainer)
extra_sources <- unlist(extra_sources)

# this is for the case where there are multiple sources in the same column
epi_data %<>% filter_extraneous(filter_source, filter_agg_level)
5 changes: 2 additions & 3 deletions R/forecasters/forecaster_scaled_pop.R
@@ -47,7 +47,7 @@
#' @export
scaled_pop <- function(epi_data,
outcome,
extra_sources = "",
extra_sources = character(),
ahead = 1,
pop_scaling = TRUE,
drop_non_seasons = FALSE,
@@ -64,8 +64,7 @@ scaled_pop <- function(epi_data,
nonlin_method <- arg_match(nonlin_method)

epi_data <- validate_epi_data(epi_data)
extra_sources <- unwrap_argument(extra_sources)
trainer <- unwrap_argument(trainer)
extra_sources <- unlist(extra_sources)

# perform any preprocessing not supported by epipredict
#
10 changes: 3 additions & 7 deletions R/forecasters/forecaster_scaled_pop_seasonal.R
@@ -38,7 +38,7 @@
scaled_pop_seasonal <- function(
epi_data,
outcome,
extra_sources = "",
extra_sources = character(),
ahead = 1,
pop_scaling = TRUE,
drop_non_seasons = FALSE,
@@ -61,13 +61,9 @@ scaled_pop_seasonal <- function(
nonlin_method <- arg_match(nonlin_method)

epi_data <- validate_epi_data(epi_data)
extra_sources <- unwrap_argument(extra_sources)
trainer <- unwrap_argument(trainer)
extra_sources <- unlist(extra_sources)

if (typeof(seasonal_method) == "list") {
seasonal_method <- seasonal_method[[1]]
}
if (all(seasonal_method == c("none", "flu", "covid", "indicator", "window", "climatological"))) {
if (identical(seasonal_method, c("none", "flu", "covid", "indicator", "window", "climatological"))) {
seasonal_method <- "none"
}
# perform any preprocessing not supported by epipredict
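The switch in `scaled_pop_seasonal` from `all(seasonal_method == c(...))` to `identical(...)` avoids recycling surprises when detecting an untouched default argument. A small sketch with toy values:

```r
# Default vector as in scaled_pop_seasonal's signature.
default <- c("none", "flu", "covid", "indicator", "window", "climatological")

# identical() is TRUE only for the full, untouched default vector...
identical(default, default) # TRUE
# ...while a user-chosen value is never mistaken for the default.
identical("window", default) # FALSE

# The old all(x == y) comparison recycles the shorter vector, which can
# silently compare elements you did not intend to pair up.
```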
5 changes: 2 additions & 3 deletions R/forecasters/forecaster_smoothed_scaled.R
@@ -51,7 +51,7 @@
#' @export
smoothed_scaled <- function(epi_data,
outcome,
extra_sources = "",
extra_sources = character(),
ahead = 1,
pop_scaling = TRUE,
trainer = parsnip::linear_reg(),
@@ -73,8 +73,7 @@ smoothed_scaled <- function(epi_data,
nonlin_method <- arg_match(nonlin_method)

epi_data <- validate_epi_data(epi_data)
extra_sources <- unwrap_argument(extra_sources)
trainer <- unwrap_argument(trainer)
extra_sources <- unlist(extra_sources)

# perform any preprocessing not supported by epipredict
#
Expand Down
25 changes: 14 additions & 11 deletions R/forecasters/formatters.R
@@ -72,24 +72,27 @@ format_flusight <- function(pred, disease = c("flu", "covid")) {
}

format_scoring_utils <- function(forecasts_and_ensembles, disease = c("flu", "covid")) {
forecasts_and_ensembles %>%
filter(!grepl("region.*", geo_value)) %>%
mutate(
reference_date = get_forecast_reference_date(forecast_date),
target = glue::glue("wk inc {disease} hosp"),
horizon = as.integer(floor((target_end_date - reference_date) / 7)),
output_type = "quantile",
output_type_id = quantile,
value = value
) %>%
# dplyr here was unreasonably slow on 1m+ rows, so replacing with direct access
fc_ens <- forecasts_and_ensembles
fc_ens <- fc_ens[!grepl("region.*", forecasts_and_ensembles$geo_value), ]
fc_ens[, "reference_date"] <- get_forecast_reference_date(fc_ens$forecast_date)
fc_ens[, "target"] <- glue::glue("wk inc {disease} hosp")
fc_ens[, "horizon"] <- as.integer(floor((fc_ens$target_end_date - fc_ens$reference_date) / 7))
fc_ens[, "output_type"] <- "quantile"
fc_ens[, "output_type_id"] <- fc_ens$quantile
fc_ens %>%
left_join(
get_population_data() %>%
select(state_id, state_code),
by = c("geo_value" = "state_id")
) %>%
rename(location = state_code, model_id = forecaster) %>%
select(reference_date, target, horizon, target_end_date, location, output_type, output_type_id, value, model_id) %>%
drop_na()
drop_na() %>%
arrange(location, target_end_date, reference_date, output_type_id) %>%
group_by(model_id, location, target_end_date, reference_date) %>%
mutate(value = sort(value)) %>%
ungroup()
}

#' The quantile levels used by the covidhub repository
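The new `arrange`/`group_by`/`mutate(value = sort(value))` tail in `format_scoring_utils` enforces monotone quantiles within each forecast group. A toy base-R sketch (not from the PR) of the quantile-crossing problem it fixes:

```r
# One model/location/date group with crossed quantiles: the median sits
# below the 25th percentile, which scoring tools reject.
fc <- data.frame(
  output_type_id = c(0.25, 0.5, 0.75),
  value = c(12, 10, 15)
)
fc <- fc[order(fc$output_type_id), ]
fc$value <- sort(fc$value) # sorting within the group restores monotonicity
fc$value # 10 12 15, non-decreasing in the quantile level
```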
3 changes: 3 additions & 0 deletions R/imports.R
@@ -9,13 +9,15 @@ library(crew)
library(data.table)
library(dplyr)
library(DT)
options(DT.options = list(scrollX = TRUE))
library(epidatr)
library(epipredict)
library(epiprocess)
library(ggplot2)
library(glue)
library(grf)
library(here)
library(httpgd)
if (Sys.getenv("COVID_SUBMISSION_DIRECTORY", "cache") != "cache") {
library(hubValidations)
}
@@ -36,6 +38,7 @@ library(recipes)
library(renv)
library(rlang)
library(rspm)
library(scales)
library(scoringutils)
library(slider)
library(stringr)