Commit cee1f2b

Merge pull request #168 from cmu-delphi/evaluate_current_season

covid: evaluate current season

2 parents: 3d17bbf + 9005314


46 files changed: +1750 -793 lines

.gitignore (+4)

@@ -18,3 +18,7 @@ reports/report.md
 cache/
 data/
 .vscode/
+.envrc
+.nhsn_covid_cache.parquet
+.nhsn_flu_cache.parquet
+meta/

.renvignore (+1)

@@ -1,2 +1,3 @@
 tmp.R
 node_modules/*
+local_logs/

Makefile (+8 -1)

@@ -63,9 +63,10 @@ submit: submit-covid submit-flu
 get_nwss:
 	mkdir -p aux_data/nwss_covid_data; \
 	mkdir -p aux_data/nwss_flu_data; \
+	. .venv/bin/activate; \
 	cd scripts/nwss_export_tool/; \
 	python nwss_covid_export.py; \
-	python nwss_covid_export.py
+	python nwss_influenza_export.py
 
 run-nohup:
 	nohup Rscript scripts/run.R &
@@ -94,3 +95,9 @@ update_site:
 
 netlify:
 	netlify deploy --dir=reports --prod
+
+get_flu_prod_errors:
+	Rscript -e "suppressPackageStartupMessages(source(here::here('R', 'load_all.R'))); get_targets_errors(project = 'flu_hosp_prod')"
+
+get_covid_prod_errors:
+	Rscript -e "suppressPackageStartupMessages(source(here::here('R', 'load_all.R'))); get_targets_errors(project = 'covid_hosp_prod')"
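The two new targets wrap the repo's `get_targets_errors()` helper so pipeline failures can be inspected from the command line. That helper's implementation is not part of this diff; purely as a hypothetical illustration of the idea, a minimal version built on the {targets} metadata store (assuming projects are declared in `_targets.yaml` and selected via the `TAR_PROJECT` environment variable) could look like:

# Hypothetical sketch only; the real get_targets_errors() lives in R/ and may differ.
library(dplyr)

get_targets_errors_sketch <- function(project = "flu_hosp_prod") {
  # Select the {targets} project before reading its metadata store.
  Sys.setenv(TAR_PROJECT = project)
  targets::tar_meta(fields = c("name", "error", "warnings")) %>%
    filter(!is.na(error) | !is.na(warnings))
}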

R/aux_data_utils.R (+137 -38)

@@ -188,38 +188,76 @@ daily_to_weekly <- function(epi_df, agg_method = c("sum", "mean"), day_of_week =
     select(-epiweek, -year)
 }
 
+#' Aggregate a daily archive to a weekly archive.
+#'
+#' @param epi_arch the archive to aggregate.
+#' @param agg_columns the columns to aggregate.
+#' @param agg_method the method to use to aggregate the data, one of "sum" or "mean".
+#' @param day_of_week the day of the week to use as the reference day.
+#' @param day_of_week_end the day of the week to use as the end of the week.
 daily_to_weekly_archive <- function(epi_arch,
                                     agg_columns,
                                     agg_method = c("sum", "mean"),
                                     day_of_week = 4L,
                                     day_of_week_end = 7L) {
+  # How to aggregate the windowed data.
   agg_method <- arg_match(agg_method)
+  # The columns we will later group by when aggregating.
   keys <- key_colnames(epi_arch, exclude = c("time_value", "version"))
+  # The versions we will slide over.
   ref_time_values <- epi_arch$DT$version %>%
     unique() %>%
     sort()
+  # Choose a fast function to use to slide and aggregate.
   if (agg_method == "sum") {
     slide_fun <- epi_slide_sum
   } else if (agg_method == "mean") {
     slide_fun <- epi_slide_mean
   }
-  too_many_tibbles <- epix_slide(
+  # Slide over the versions and aggregate.
+  epix_slide(
     epi_arch,
-    .before = 99999999L,
     .versions = ref_time_values,
-    function(x, group, ref_time) {
-      ref_time_last_week_end <-
-        floor_date(ref_time, "week", day_of_week_end - 1) # this is over by 1
+    function(x, group_keys, ref_time) {
+      # The last day of the week we will slide over.
+      ref_time_last_week_end <- floor_date(ref_time, "week", day_of_week_end - 1)
+
+      # To find the days we will slide over, we need to find the first and last
+      # complete weeks of data. Get the max and min times, and then find the
+      # first and last complete weeks of data.
+      min_time <- min(x$time_value)
       max_time <- max(x$time_value)
-      valid_slide_days <- seq.Date(
-        from = ceiling_date(min(x$time_value), "week", week_start = day_of_week_end - 1),
-        to = floor_date(max(x$time_value), "week", week_start = day_of_week_end - 1),
-        by = 7L
-      )
+
+      # Let's determine if the min and max times are in the same week.
+      ceil_min_time <- ceiling_date(min_time, "week", week_start = day_of_week_end - 1)
+      ceil_max_time <- ceiling_date(max_time, "week", week_start = day_of_week_end - 1)
+
+      # If they're not in the same week, this means we have at least one
+      # complete week of data to slide over.
+      if (ceil_min_time < ceil_max_time) {
+        valid_slide_days <- seq.Date(
+          from = ceiling_date(min_time, "week", week_start = day_of_week_end - 1),
+          to = floor_date(max_time, "week", week_start = day_of_week_end - 1),
+          by = 7L
+        )
+      } else {
+        # This is the degenerate case, where we have about 1 week or less of
+        # data. In this case, we opt to return nothing for two reasons:
+        # 1. in most cases here, the data is incomplete for a single week,
+        # 2. if the data is complete, a single week of data is not enough to
+        #    reasonably perform any kind of aggregation.
+        return(tibble())
+      }
+
+      # If the last day of the week is not the end of the week, add it to the
+      # list of valid slide days (this will produce an incomplete slide, but
+      # that's fine for us, since it should only be 1 day, historically.)
      if (wday(max_time) != day_of_week_end) {
        valid_slide_days <- c(valid_slide_days, max_time)
      }
-      slid_result <- x %>%
+
+      # Slide over the days and aggregate.
+      x %>%
        group_by(across(all_of(keys))) %>%
        slide_fun(
          agg_columns,
@@ -229,18 +267,13 @@ daily_to_weekly_archive <- function(epi_arch,
        ) %>%
        select(-all_of(agg_columns)) %>%
        rename_with(~ gsub("slide_value_", "", .x)) %>%
-        # only keep 1/week
-        # group_by week, keep the largest in each week
-        # alternatively
-        # switch time_value to the designated day of the week
+        rename_with(~ gsub("_7dsum", "", .x)) %>%
+        # Round all dates to reference day of the week. These will get
+        # de-duplicated by compactify in as_epi_archive below.
        mutate(time_value = round_date(time_value, "week", day_of_week - 1)) %>%
        as_tibble()
    }
-  )
-  too_many_tibbles %>%
-    pull(time_value) %>%
-    max()
-  too_many_tibbles %>%
+  ) %>%
    as_epi_archive(compactify = TRUE)
 }
 
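For orientation, here is a minimal usage sketch of the rewritten `daily_to_weekly_archive()`. The toy archive, geo, and `hhs` column are invented for the example and assume the epiprocess helpers this repo already loads:

# Illustrative only: one geo, one version, 46 days of daily counts,
# aggregated into weekly sums keyed to the reference day of the week.
library(dplyr)
library(epiprocess)

toy_archive <- tibble(
  geo_value = "ca",
  time_value = seq(as.Date("2025-01-01"), as.Date("2025-02-15"), by = "day"),
  version = as.Date("2025-02-16"),
  hhs = rpois(46, 100)
) %>%
  as_epi_archive(compactify = TRUE)

weekly <- daily_to_weekly_archive(toy_archive, agg_columns = "hhs")
weekly$DT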
@@ -313,9 +346,8 @@ get_health_data <- function(as_of, disease = c("covid", "flu")) {
 
   most_recent_row <- meta_data %>%
     # update_date is actually a time, so we need to filter for the day after.
-    filter(update_date <= as_of + 1) %>%
-    arrange(desc(update_date)) %>%
-    slice(1)
+    filter(update_date <= as.Date(as_of) + 1) %>%
+    slice_max(update_date)
 
   if (nrow(most_recent_row) == 0) {
     cli::cli_abort("No data available for the given date.")
@@ -331,9 +363,7 @@ get_health_data <- function(as_of, disease = c("covid", "flu")) {
   if (disease == "covid") {
     data %<>% mutate(
       hhs = previous_day_admission_adult_covid_confirmed +
-        previous_day_admission_adult_covid_suspected +
-        previous_day_admission_pediatric_covid_confirmed +
-        previous_day_admission_pediatric_covid_suspected
+        previous_day_admission_pediatric_covid_confirmed
     )
   } else if (disease == "flu") {
     data %<>% mutate(hhs = previous_day_admission_influenza_confirmed)
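One behavioral nuance in the `most_recent_row` change: `slice_max(update_date)` keeps ties by default, while the old `arrange(desc(update_date)) %>% slice(1)` always returned exactly one row. A standalone dplyr illustration on toy data:

library(dplyr)

meta <- tibble(update_date = as.Date(c("2025-01-01", "2025-01-02", "2025-01-02")))

meta %>% arrange(desc(update_date)) %>% slice(1)           # always 1 row
meta %>% slice_max(update_date)                            # 2 rows: ties are kept
meta %>% slice_max(update_date, n = 1, with_ties = FALSE)  # exact drop-in: 1 row

If the metadata never repeats an `update_date`, the two forms agree; the difference only matters when it does.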
@@ -594,15 +624,16 @@ gen_ili_data <- function(default_day_of_week = 1) {
     as_epi_archive(compactify = TRUE)
 }
 
+#' Process Raw NHSN Data
+#'
+#' Turns the raw NHSN data into a tidy format with the following columns:
+#' - geo_value: the jurisdiction of the data
+#' - disease: the disease of the data
+#' - time_value: the date of the data
+#' - version: the version of the data
+#' - value: the value of the data
+#'
 process_nhsn_data <- function(raw_nhsn_data) {
-  # These are exception dates when the data was available on a different day
-  # than usual. In these two cases, it was the Thursday after. But to keep
-  # the rest of the pipeline the same, we pretend it was available on Wednesday.
-  remap_exceptions <- list(
-    "2024-12-26" = "2024-12-25",
-    "2025-01-02" = "2025-01-01"
-  )
-  fixed_version <- remap_exceptions[[as.character(Sys.Date())]] %||% Sys.Date()
   raw_nhsn_data %>%
     mutate(
       geo_value = tolower(jurisdiction),
@@ -614,15 +645,58 @@ process_nhsn_data <- function(raw_nhsn_data) {
     select(-weekendingdate, -jurisdiction, -starts_with("totalconf")) %>%
     pivot_longer(cols = starts_with("nhsn"), names_to = "disease") %>%
     filter(!is.na(value)) %>%
-    mutate(version = fixed_version) %>%
+    mutate(version = Sys.Date()) %>%
     relocate(geo_value, disease, time_value, version)
 }
 
 
 # for filenames of the form nhsn_data_2024-11-19_16-29-43.191649.rds
 get_version_timestamp <- function(filename) ymd_hms(str_match(filename, "[0-9]{4}-..-.._..-..-..\\.[^.^_]*"))
 
-#' all in one function to get and cache a nhsn archive from raw files
+#' Remove duplicate files from S3
+#'
+#' Removes duplicate files from S3 by keeping only the earliest timestamp file for each ETag.
+#' You can modify keep_df, if this doesn't suit your needs.
+#'
+#' @param bucket The name of the S3 bucket.
+#' @param prefix The prefix of the files to remove duplicates from.
+#' @param dry_run Whether to actually delete the files.
+#' @param .progress Whether to show a progress bar.
+delete_duplicates_from_s3_by_etag <- function(bucket, prefix, dry_run = TRUE, .progress = TRUE) {
+  # Get a list of all new dataset snapshots from S3
+  files_df <- aws.s3::get_bucket_df(bucket = bucket, prefix = prefix) %>% as_tibble()
+
+  # Create a list of all the files to keep by keeping the earliest timestamp file for each ETag
+  keep_df <- files_df %>%
+    group_by(ETag) %>%
+    slice_min(LastModified) %>%
+    ungroup()
+  delete_df <- files_df %>%
+    anti_join(keep_df, by = "Key")
+  if (nrow(delete_df) > 0) {
+    if (dry_run) {
+      cli::cli_alert_info("Would delete {nrow(delete_df)} files from {bucket} with prefix {prefix}")
+      print(delete_df)
+      return(invisible(delete_df))
+    } else {
+      delete_files_from_s3(bucket = bucket, keys = delete_df$Key, .progress = .progress)
+    }
+  }
+}
+
+#' Delete files from S3
+#'
+#' Faster than aws.s3::delete_object, when there are many files to delete (thousands).
+#'
+#' @param bucket The name of the S3 bucket.
+#' @param keys The keys of the files to delete, as a character vector.
+#' @param batch_size The number of files to delete in each batch.
+#' @param .progress Whether to show a progress bar.
+delete_files_from_s3 <- function(bucket, keys, batch_size = 500, .progress = TRUE) {
+  split(keys, ceiling(seq_along(keys) / batch_size)) %>%
+    purrr::walk(~ aws.s3::delete_object(bucket = bucket, object = .x), .progress = .progress)
+}
+
 #' @description
 #' This takes in all of the raw data files for the nhsn data, creates a
 #' quasi-archive (it keeps one example per version-day, rather than one per
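A dry-run sketch of how the new deduplication helper might be called; the bucket and prefix below are placeholders rather than the project's real ones, and aws.s3 credentials are assumed to already be configured in the environment:

# Placeholders only; dry_run = TRUE just reports what would be deleted.
dupes <- delete_duplicates_from_s3_by_etag(
  bucket = "example-forecast-bucket",
  prefix = "nhsn_data_",
  dry_run = TRUE
)

# After reviewing `dupes`, rerun with dry_run = FALSE to delete in batches
# via delete_files_from_s3().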
@@ -709,10 +783,12 @@ create_nhsn_data_archive <- function(disease_name) {
     as_epi_archive(compactify = TRUE)
 }
 
-
 up_to_date_nssp_state_archive <- function(disease = c("covid", "influenza")) {
   disease <- arg_match(disease)
-  nssp_state <- pub_covidcast(
+  nssp_state <- retry_fn(
+    max_attempts = 10,
+    wait_seconds = 1,
+    fn = pub_covidcast,
     source = "nssp",
     signal = glue::glue("pct_ed_visits_{disease}"),
     time_type = "week",
@@ -728,3 +804,26 @@ up_to_date_nssp_state_archive <- function(disease = c("covid", "influenza")) {
     mutate(time_value = time_value + 3) %>%
     as_epi_archive(compactify = TRUE)
 }
+
+# Get the last time the signal was updated.
+get_covidcast_signal_last_update <- function(source, signal, geo_type) {
+  pub_covidcast_meta() %>%
+    filter(source == !!source, signal == !!signal, geo_type == !!geo_type) %>%
+    pull(last_update) %>%
+    as.POSIXct()
+}
+
+# Get the last time the Socrata dataset was updated.
+get_socrata_updated_at <- function(dataset_url) {
+  httr::GET(dataset_url) %>%
+    httr::content() %>%
+    pluck("rowsUpdatedAt") %>%
+    as.POSIXct()
+}
+
+get_s3_object_last_modified <- function(bucket, key) {
+  # Format looks like "Fri, 31 Jan 2025 22:01:16 GMT"
+  attr(aws.s3::head_object(key, bucket = bucket), "last-modified") %>%
+    str_replace_all(" GMT", "") %>%
+    as.POSIXct(format = "%a, %d %b %Y %H:%M:%S")
+}
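The three freshness helpers all return POSIXct timestamps, so they can be compared directly to decide whether a cached snapshot needs refreshing. A sketch with placeholder bucket, key, and Socrata metadata URL (the real ones live in the pipeline configuration):

# Placeholders throughout; EXAMPLE_ID stands in for a real Socrata dataset id.
upstream_updated <- get_socrata_updated_at("https://data.cdc.gov/api/views/EXAMPLE_ID.json")
cached_modified <- get_s3_object_last_modified(
  bucket = "example-forecast-bucket",
  key = "nhsn_archive.parquet"
)

if (upstream_updated > cached_modified) {
  message("Upstream dataset is newer than the cached copy; refresh it.")
}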

R/forecasters/data_transforms.R (+13 -20)

@@ -26,7 +26,7 @@ get_nonkey_names <- function(epi_data) {
 
 #' Get a rolling average for the named columns
 #'
-#' Add column(s) that are the rolling means of the specified columns, as
+#' Add a column that is the rolling means of the specified column, as
 #' implemented by slider. Defaults to the previous 7 days. Currently only
 #' group_by's on the geo_value. Should probably extend to more keys if you have
 #' them.
@@ -40,15 +40,10 @@ get_nonkey_names <- function(epi_data) {
 #' @export
 rolling_mean <- function(epi_data, width = 7L, cols_to_mean = NULL) {
   cols_to_mean <- get_trainable_names(epi_data, cols_to_mean)
-  epi_data %<>% group_by(across(key_colnames(epi_data, exclude = "time_value")))
-  for (col in cols_to_mean) {
-    mean_name <- paste0("slide_", col, "_m", width)
-    epi_data %<>%
-      epi_slide_mean(all_of(col), .window_size = width) %>%
-      rename(!!mean_name := paste0("slide_value_", col))
-  }
-  epi_data %<>% ungroup()
-  return(epi_data)
+  epi_data %>%
+    group_by(across(key_colnames(epi_data, exclude = "time_value"))) %>%
+    epi_slide_mean(all_of(cols_to_mean), .window_size = width, .new_col_names = paste0("slide_", cols_to_mean, "_m", width)) %>%
+    ungroup()
 }
 
 #' Get a rolling standard deviation for the named columns
@@ -75,20 +70,18 @@ rolling_sd <- function(epi_data, sd_width = 29L, mean_width = NULL, cols_to_sd =
   cols_to_sd <- get_trainable_names(epi_data, cols_to_sd)
   result <- epi_data
   result %<>% group_by(across(key_colnames(epi_data, exclude = "time_value")))
-  for (col in cols_to_sd) {
-    mean_name <- glue::glue("slide_{col}_m{mean_width}")
-    sd_name <- glue::glue("slide_{col}_sd{sd_width}")
+  for (col_name in cols_to_sd) {
+    mean_name <- glue::glue("slide_{col_name}_m{mean_width}")
+    sd_name <- glue::glue("slide_{col_name}_sd{sd_width}")
 
     result %<>%
-      epi_slide_mean(all_of(col), .window_size = mean_width) %>%
-      rename(!!mean_name := paste0("slide_value_", col))
+      epi_slide_mean(all_of(col_name), .window_size = mean_width, .new_col_names = mean_name)
 
     result %<>%
-      mutate(.temp = (.data[[mean_name]] - .data[[col]])^2) %>%
-      epi_slide_mean(all_of(".temp"), .window_size = sd_width) %>%
-      select(-.temp) %>%
-      rename(!!sd_name := "slide_value_.temp") %>%
-      mutate(!!sd_name := sqrt(.data[[sd_name]]))
+      mutate(.temp = (.data[[mean_name]] - .data[[col_name]])^2) %>%
+      epi_slide_mean(all_of(".temp"), .window_size = sd_width, .new_col_names = sd_name) %>%
+      mutate(!!sd_name := sqrt(.data[[sd_name]])) %>%
+      select(-.temp)
 
     if (!keep_mean) {
       result %<>% select(-{{ mean_name }})
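A toy sketch of the refactored transforms on a small epi_df; the geo values and `hhs` column are invented, and the repo's helpers plus epiprocess are assumed to be loaded. The point of the refactor is that `.new_col_names` names the slide output directly, so the old `rename()` steps disappear:

# Toy data only.
library(dplyr)
library(epiprocess)

toy_edf <- tibble(
  geo_value = rep(c("ca", "tx"), each = 40),
  time_value = rep(seq(as.Date("2025-01-01"), by = "day", length.out = 40), times = 2),
  hhs = rpois(80, 50)
) %>%
  as_epi_df(as_of = as.Date("2025-02-10"))

# Expected to add a slide_hhs_m7 column, named via .new_col_names.
rolling_mean(toy_edf, width = 7L)

# Expected to add slide_hhs_sd29 (the intermediate mean is dropped unless keep_mean = TRUE).
rolling_sd(toy_edf, sd_width = 29L, mean_width = 14L)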

R/forecasters/ensemble_average.R (+2 -2)

@@ -32,12 +32,12 @@ ensemble_average <- function(epi_data,
   # their names are separated for obscure target related reasons
   names(ensemble_args) <- ensemble_args_names %||% names(ensemble_args)
   average_type <- ensemble_args$average_type %||% median
-  join_columns <- ensemble_args$join_columns %||% setdiff(names(forecasts[[1]]), "prediction")
+  join_columns <- ensemble_args$join_columns %||% setdiff(names(forecasts[[1]]), "value")
 
   # begin actual analysis
   forecasts %>%
     bind_rows(.id = "id") %>%
     group_by(across(all_of(join_columns))) %>%
-    summarize(prediction = average_type(prediction, na.rm = TRUE), .groups = "drop") %>%
+    summarize(value = average_type(value, na.rm = TRUE), .groups = "drop") %>%
     ungroup()
 }
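The switch from `prediction` to `value` presumably matches the forecast column naming used elsewhere in the pipeline. A standalone sketch of the same grouping logic on toy forecasts (the column layout is assumed for illustration):

# Two toy forecasters; every column except `value` acts as a join key,
# so the median is taken across forecasters for each geo/date/quantile.
library(dplyr)

fc_a <- tibble(
  geo_value = "ca",
  target_end_date = as.Date("2025-02-22"),
  quantile = c(0.25, 0.5, 0.75),
  value = c(90, 100, 110)
)
fc_b <- fc_a %>% mutate(value = c(80, 95, 120))

list(a = fc_a, b = fc_b) %>%
  bind_rows(.id = "id") %>%
  group_by(across(all_of(setdiff(names(fc_a), "value")))) %>%
  summarize(value = median(value, na.rm = TRUE), .groups = "drop")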
