
Commit f3899e4

Commit message: wip
1 parent 80702b3 commit f3899e4

File tree: 3 files changed, +146 -72 lines

R/aux_data_utils.R

Lines changed: 66 additions & 47 deletions
@@ -697,6 +697,8 @@ delete_files_from_s3 <- function(bucket, keys, batch_size = 500, .progress = TRU
     purrr::walk(~aws.s3::delete_object(bucket = bucket, object = .x), .progress = .progress)
 }

+#' All-in-one function to get and cache an nhsn archive from raw files.
+#'
 #' @description
 #' This takes in all of the raw data files for the nhsn data, creates a
 #' quasi-archive (it keeps one example per version-day, rather than one per
@@ -707,72 +709,89 @@ delete_files_from_s3 <- function(bucket, keys, batch_size = 500, .progress = TRU
 #' containing the data for `disease_name`.
 create_nhsn_data_archive <- function(disease_name) {
   if (aws.s3::head_object("archive_timestamped.parquet", bucket = "forecasting-team-data")) {
+    # Load the previous archive from S3
     aws.s3::save_object("archive_timestamped.parquet", bucket = "forecasting-team-data", file = here::here("cache/archive_timestamped.parquet"))
     previous_archive <- qs::qread(here::here("cache/archive_timestamped.parquet"))
     last_timestamp <- max(previous_archive$version_timestamp)
   } else {
-    # there is no remote
-    previous_archive <- NULL
+    # Remote archive does not exist, so start from scratch
+    previous_archive <- tibble()
     last_timestamp <- as.Date("1000-01-01")
   }
-  new_data <- aws.s3::get_bucket_df(bucket = "forecasting-team-data", prefix = "nhsn_data_") %>%
-    filter(get_version_timestamp(Key) > last_timestamp) %>%
-    pull(Key) %>%
-    lapply(
-      function(filename) {
-        version_timestamp <- get_version_timestamp(filename)
-        res <- NULL
-        tryCatch(
-          {
-            s3load(object = filename, bucket = "forecasting-team-data")
-            if (grepl("prelim", filename)) {
-              res <- epi_data_raw_prelim
-              endpoint_val <- "prelim"
-            } else {
-              res <- epi_data_raw
-              endpoint_val <- "basic"
-            }
-            res <- res %>%
-              process_nhsn_data() %>%
-              select(geo_value, disease, time_value, value) %>%
-              mutate(version_timestamp = version_timestamp, endpoint = endpoint_val)
-          },
-          error = function(cond) {}
-        )
-        res
-      }
-    )
-  # drop any duplicates on the same day
-  compactified <-
-    new_data %>%
-    bind_rows()
-  if (nrow(compactified) == 0) {
+
+  # Get a list of all new dataset snapshots from S3
+  new_data_files <- aws.s3::get_bucket_df(bucket = "forecasting-team-data", prefix = "nhsn_data_") %>%
+    mutate(version_timestamp = get_version_timestamp(Key), version = as.Date(version_timestamp)) %>%
+    filter(version_timestamp > last_timestamp) %>%
+    as_tibble()
+  # Filter to just the latest version_timestamp for each version date.
+  new_data_files_latest_per_day <- new_data_files %>%
+    group_by(version) %>%
+    slice_max(version_timestamp) %>%
+    ungroup()
+
+  if (nrow(new_data_files_latest_per_day) == 0) {
+    # No new data, so just use the previous archive
     one_per_day <- previous_archive
   } else {
-    compactified <-
-      compactified %>%
-      arrange(geo_value, time_value, disease, endpoint, version_timestamp) %>%
+    # Process each new dataset snapshot
+    new_data <- new_data_files_latest_per_day$Key %>%
+      map(
+        function(filename) {
+          version_timestamp <- get_version_timestamp(filename)
+          tryCatch(
+            {
+              s3load(object = filename, bucket = "forecasting-team-data")
+              if (grepl("prelim", filename)) {
+                res <- epi_data_raw_prelim
+                endpoint_val <- "prelim"
+              } else {
+                res <- epi_data_raw
+                endpoint_val <- "basic"
+              }
+              res %>%
+                process_nhsn_data() %>%
+                select(geo_value, disease, time_value, value) %>%
+                mutate(version_timestamp = version_timestamp, endpoint = endpoint_val)
+            },
+            error = function(cond) {
+              # Return NULL (dropped by bind_rows()) rather than a half-processed result
+              cli::cli_warn("Error processing {filename}: {cond}")
+              NULL
+            }
+          )
+        },
+        .progress = TRUE
+      )
+
+    compactified <- new_data %>%
+      bind_rows() %>%
       mutate(version = as.Date(version_timestamp)) %>%
-      filter(if_any(
-        c(everything(), -endpoint, -version_timestamp), # all non-version, non-endpoint columns
-        ~ !epiprocess:::is_locf(., .Machine$double.eps^0.5)
-      ))
+      group_by(version, disease, geo_value, time_value) %>%
+      slice_max(version_timestamp) %>%
+      ungroup()

+    # Only keep the values with the latest version_timestamp for a given version date.
+    # We only need to do this for the versions in compactified, as the other versions can't possibly change.
     unchanged <- previous_archive %>% filter(!(version %in% unique(compactified$version)))
-    # only keep the last value for a given version (so across version_timestamps)
-    # we only need to do this for the versions in compactified, as the other versions can't possibly change
-    one_per_day <-
-      previous_archive %>%
+    one_per_day <- previous_archive %>%
       filter(version %in% unique(compactified$version)) %>%
       bind_rows(compactified) %>%
       group_by(geo_value, disease, time_value, version) %>%
-      arrange(version_timestamp) %>%
-      filter(row_number() == n()) %>%
+      slice_max(version_timestamp) %>%
       ungroup() %>%
       bind_rows(unchanged)
     qs::qsave(one_per_day, here::here("cache/archive_timestamped.parquet"))
     aws.s3::put_object(here::here("cache/archive_timestamped.parquet"), "archive_timestamped.parquet", bucket = "forecasting-team-data")
   }
+
+  if (nrow(one_per_day) == 0) {
+    cli::cli_warn("No data found for {disease_name}")
+    return(NULL)
+  }
+
+  # Return the archive for the disease of interest.
   one_per_day %>%
     filter(disease == disease_name) %>%
     select(-version_timestamp, -endpoint, -disease) %>%
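
To make the compaction rule concrete, here is a minimal dplyr sketch (toy data invented for illustration, not from the repo): within each (geo_value, time_value, version) group, slice_max() keeps only the row from the latest pull, so each version day retains exactly one snapshot.

library(dplyr)

snapshots <- tibble::tribble(
  ~geo_value, ~time_value,           ~value, ~version_timestamp,
  "ca",       as.Date("2024-11-02"), 10,     as.POSIXct("2024-11-06 04:31:00"),
  "ca",       as.Date("2024-11-02"), 12,     as.POSIXct("2024-11-06 21:31:00"), # later pull, same version day
  "ca",       as.Date("2024-11-02"), 12,     as.POSIXct("2024-11-07 04:31:00")
) %>%
  mutate(version = as.Date(version_timestamp))

# Keep one row per version day: the one with the latest version_timestamp.
snapshots %>%
  group_by(geo_value, time_value, version) %>%
  slice_max(version_timestamp) %>%
  ungroup()
# Two rows remain: the 21:31 pull for version 2024-11-06 and the single 2024-11-07 pull.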

R/utils.R

Lines changed: 1 addition & 0 deletions
@@ -326,6 +326,7 @@ update_site <- function(sync_to_s3 = TRUE) {
   if (!file_exists(template_path)) {
     stop("Template file does not exist.")
   }
+
   report_md_content <- readLines(template_path)
   # Get the list of files in the reports directory
   report_files <- dir_ls(reports_dir, regexp = ".*_prod_on_.*.html")

scripts/nhsn_download.R

Lines changed: 79 additions & 25 deletions
@@ -1,28 +1,82 @@
-print("########################################")
-print("Starting at")
-print(Sys.time())
-print("########################################")
-# the crontab that is used to run this is:
+# NHSN Archive Builder
+#
+# This script downloads the latest NHSN data from the CDC website and adds it
+# to the archive. The crontab below runs it at minute 31 of every hour on
+# Tuesdays, Wednesdays, and Fridays.
+#
+# It is meant to be general and usable for other data source projects. To adapt
+# it to another data source, you will need to consider:
+# 1. The new data source URL
+# 2. A way to capture the last-updated timestamp on the new data source
+#
+# The crontab:
 # 31 0-23 * * 2,3,5 cd /path/to/root/of/this/project && direnv exec /path/to/root/of/this/project /usr/bin/Rscript scripts/nhsn_download.R >> cache/nhsn_download.log 2>&1
-suppressPackageStartupMessages(source(here::here("R", "load_all.R")))
+
+library(tidyverse)
+library(here)
+library(httr)
 library(readr)
-library(epiprocess)
+library(purrr)
 library(qs)
+library(aws.s3) # s3save(); assumed needed now that the script attaches packages itself
+# create_nhsn_data_archive() is defined in R/aux_data_utils.R, so keep sourcing
+# the project helpers:
+suppressPackageStartupMessages(source(here::here("R", "load_all.R")))
-save_folder <- here::here("cache")
-dir.create(save_folder)
-dir.create(file.path(save_folder, "raw_data"))
-
-# read and immediately save the raw version
-epi_data_raw <- readr::read_csv("https://data.cdc.gov/resource/ua7e-t2fy.csv?$limit=20000&$select=weekendingdate,jurisdiction,totalconfc19newadm,totalconfflunewadm")
-epi_data_raw_prelim <- readr::read_csv("https://data.cdc.gov/resource/mpgq-jmmr.csv?$limit=20000&$select=weekendingdate,jurisdiction,totalconfc19newadm,totalconfflunewadm")
-raw_file <- glue::glue("nhsn_data_{Sys.time()}") %>%
-  gsub(" ", "_", .) %>%
-  gsub(":", "-", .)
-raw_path <- raw_file %>%
-  file.path(save_folder, "raw_data", .) %>%
-  paste0(".parquet")
-qs::qsave(epi_data_raw, raw_path)
-s3save(epi_data_raw, object = paste0(raw_file, ".rds"), bucket = "forecasting-team-data")
-s3save(epi_data_raw_prelim, object = paste0(raw_file, "_prelim", ".rds"), bucket = "forecasting-team-data")
-
-create_nhsn_data_archive()
+
+
+config <- list(
+  raw_url = "https://data.cdc.gov/resource/ua7e-t2fy.csv",
+  prelim_url = "https://data.cdc.gov/resource/mpgq-jmmr.csv",
+  raw_metadata_url = "https://data.cdc.gov/api/views/ua7e-t2fy",
+  prelim_metadata_url = "https://data.cdc.gov/api/views/mpgq-jmmr",
+  save_folder = here::here("cache"),
+  raw_file_name = "nhsn_data",
+  last_updated_at_file_name = "last_updated_at.rds",
+  s3_bucket = "forecasting-team-data",
+  s3_key = "nhsn_data.parquet"
+)
+
+get_socrata_updated_at <- function(dataset_url) {
+  httr::GET(dataset_url) %>%
+    httr::content() %>%
+    pluck("rowsUpdatedAt") %>%
+    as.POSIXct()
+}
+
+main <- function() {
+  # Create the save folders if they don't exist
+  if (!dir.exists(config$save_folder)) {
+    dir.create(config$save_folder)
+  }
+  if (!dir.exists(file.path(config$save_folder, "raw_data"))) {
+    dir.create(file.path(config$save_folder, "raw_data"))
+  }
+  if (file.exists(file.path(config$save_folder, config$last_updated_at_file_name))) {
+    last_updated_at <- read_rds(file.path(config$save_folder, config$last_updated_at_file_name))
+  } else {
+    last_updated_at <- tibble(
+      raw = NA_POSIXct_,
+      prelim = NA_POSIXct_
+    )
+  }
+
+  raw_update_at <- get_socrata_updated_at(config$raw_metadata_url)
+  prelim_update_at <- get_socrata_updated_at(config$prelim_metadata_url)
+
+  raw_file_name <- glue::glue("{config$raw_file_name}_{Sys.time()}") %>%
+    gsub(" ", "_", .) %>%
+    gsub(":", "-", .)
+  raw_path <- file.path(config$save_folder, "raw_data", raw_file_name) %>%
+    paste0(".parquet")
+  prelim_path <- file.path(config$save_folder, "raw_data", paste0(raw_file_name, "_prelim")) %>%
+    paste0(".parquet")
+
+  # is.na() guards the first run, before any timestamp has been recorded
+  if (is.na(last_updated_at$raw) || raw_update_at > last_updated_at$raw) {
+    # read and immediately save the raw version
+    epi_data_raw <- readr::read_csv(paste0(config$raw_url, "?$limit=20000&$select=weekendingdate,jurisdiction,totalconfc19newadm,totalconfflunewadm"))
+    last_updated_at$raw <- raw_update_at
+    qs::qsave(epi_data_raw, raw_path)
+    s3save(epi_data_raw, object = paste0(raw_file_name, ".rds"), bucket = config$s3_bucket)
+  }
+
+  if (is.na(last_updated_at$prelim) || prelim_update_at > last_updated_at$prelim) {
+    epi_data_raw_prelim <- readr::read_csv(paste0(config$prelim_url, "?$limit=20000&$select=weekendingdate,jurisdiction,totalconfc19newadm,totalconfflunewadm"))
+    last_updated_at$prelim <- prelim_update_at
+    qs::qsave(epi_data_raw_prelim, prelim_path)
+    s3save(epi_data_raw_prelim, object = paste0(raw_file_name, "_prelim", ".rds"), bucket = config$s3_bucket)
+  }
+
+  # Persist the timestamps so the next run can skip unchanged data
+  write_rds(last_updated_at, file.path(config$save_folder, config$last_updated_at_file_name))
+
+  create_nhsn_data_archive()
+}
+
+main()
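
The update check above leans on Socrata's views metadata: rowsUpdatedAt is a Unix epoch in seconds, so comparing it with a locally cached timestamp avoids re-downloading an unchanged CSV. A minimal standalone sketch of the pattern (socrata_updated_at is an illustrative variant of the script's get_socrata_updated_at(); the explicit origin is a portability assumption for R versions older than 4.3, where as.POSIXct() on a number requires one):

library(httr)
library(purrr)

# Ask Socrata when the dataset last changed, without downloading any data.
socrata_updated_at <- function(metadata_url) {
  resp <- httr::GET(metadata_url)
  httr::stop_for_status(resp) # fail loudly on HTTP errors
  httr::content(resp) %>%
    purrr::pluck("rowsUpdatedAt") %>%
    as.POSIXct(origin = "1970-01-01", tz = "UTC")
}

# Usage: only fetch when the remote is newer than what we last recorded.
last_seen <- as.POSIXct(NA)
updated_at <- socrata_updated_at("https://data.cdc.gov/api/views/ua7e-t2fy")
if (is.na(last_seen) || updated_at > last_seen) {
  message("New NHSN snapshot available as of ", updated_at)
}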
