@@ -697,6 +697,8 @@ delete_files_from_s3 <- function(bucket, keys, batch_size = 500, .progress = TRU
697
697
purrr :: walk(~ aws.s3 :: delete_object(bucket = bucket , object = .x ), .progress = .progress )
698
698
}
699
699
700
+ # ' All in one function to get and cache a nhsn archive from raw files.
701
+ # '
700
702
# ' @description
701
703
# ' This takes in all of the raw data files for the nhsn data, creates a
702
704
# ' quasi-archive (it keeps one example per version-day, rather than one per
@@ -707,72 +709,89 @@ delete_files_from_s3 <- function(bucket, keys, batch_size = 500, .progress = TRU
707
709
# ' containing the data for `disease_name`.
708
710
create_nhsn_data_archive <- function (disease_name ) {
709
711
if (aws.s3 :: head_object(" archive_timestamped.parquet" , bucket = " forecasting-team-data" )) {
712
+ # Load the previous archive from S3
710
713
aws.s3 :: save_object(" archive_timestamped.parquet" , bucket = " forecasting-team-data" , file = here :: here(" cache/archive_timestamped.parquet" ))
711
714
previous_archive <- qs :: qread(here :: here(" cache/archive_timestamped.parquet" ))
712
715
last_timestamp <- max(previous_archive $ version_timestamp )
713
716
} else {
714
- # there is no remote
715
- previous_archive <- NULL
717
+ # Remote archive does not exist, so start from scratch
718
+ previous_archive <- tibble()
716
719
last_timestamp <- as.Date(" 1000-01-01" )
717
720
}
718
- new_data <- aws.s3 :: get_bucket_df(bucket = " forecasting-team-data" , prefix = " nhsn_data_" ) %> %
719
- filter(get_version_timestamp(Key ) > last_timestamp ) %> %
720
- pull(Key ) %> %
721
- lapply(
722
- function (filename ) {
723
- version_timestamp <- get_version_timestamp(filename )
724
- res <- NULL
725
- tryCatch(
726
- {
727
- s3load(object = filename , bucket = " forecasting-team-data" )
728
- if (grepl(" prelim" , filename )) {
729
- res <- epi_data_raw_prelim
730
- endpoint_val <- " prelim"
731
- } else {
732
- res <- epi_data_raw
733
- endpoint_val <- " basic"
734
- }
735
- res <- res %> %
736
- process_nhsn_data() %> %
737
- select(geo_value , disease , time_value , value ) %> %
738
- mutate(version_timestamp = version_timestamp , endpoint = endpoint_val )
739
- },
740
- error = function (cond ) {}
741
- )
742
- res
743
- }
744
- )
745
- # drop any duplicates on the same day
746
- compactified <-
747
- new_data %> %
748
- bind_rows()
749
- if (nrow(compactified ) == 0 ) {
721
+
722
+ # Get a list of all new dataset snapshots from S3
723
+ new_data_files <- aws.s3 :: get_bucket_df(bucket = " forecasting-team-data" , prefix = " nhsn_data_" ) %> %
724
+ mutate(version_timestamp = get_version_timestamp(Key ), version = as.Date(version_timestamp )) %> %
725
+ filter(version_timestamp > last_timestamp ) %> %
726
+ as_tibble()
727
+ new_data_files_latest_per_day <- new_data_files
728
+ # Filter to just the latest version_timestamp for each version date.
729
+ new_data_files_latest_per_day <- new_data_files %> %
730
+ group_by(version ) %> %
731
+ slice_max(version_timestamp ) %> %
732
+ ungroup()
733
+
734
+ if (length(new_data_files_latest_per_day ) == 0 ) {
735
+ # No new data, so just use the previous archive
750
736
one_per_day <- previous_archive
751
737
} else {
752
- compactified <-
753
- compactified %> %
754
- arrange(geo_value , time_value , disease , endpoint , version_timestamp ) %> %
738
+ # Process each new dataset snapshot
739
+ new_data <- new_data_files_latest_per_day $ Key %> %
740
+ map(
741
+ function (filename ) {
742
+ version_timestamp <- get_version_timestamp(filename )
743
+ res <- NULL
744
+ tryCatch(
745
+ {
746
+ s3load(object = filename , bucket = " forecasting-team-data" )
747
+ if (grepl(" prelim" , filename )) {
748
+ res <- epi_data_raw_prelim
749
+ endpoint_val <- " prelim"
750
+ } else {
751
+ res <- epi_data_raw
752
+ endpoint_val <- " basic"
753
+ }
754
+ res <- res %> %
755
+ process_nhsn_data() %> %
756
+ select(geo_value , disease , time_value , value ) %> %
757
+ mutate(version_timestamp = version_timestamp , endpoint = endpoint_val )
758
+ },
759
+ error = function (cond ) {
760
+ cli :: cli_warn(" Error processing {filename}: {cond}" )
761
+ NULL
762
+ }
763
+ )
764
+ res
765
+ }, .progress = TRUE
766
+ )
767
+
768
+ new_data %> %
769
+ bind_rows() %> %
755
770
mutate(version = as.Date(version_timestamp )) %> %
756
- filter(if_any(
757
- c(everything(), - endpoint , - version_timestamp ), # all non-version, non-endpoint columns
758
- ~ ! epiprocess ::: is_locf(. , .Machine $ double.eps ^ 0.5 )
759
- ))
771
+ group_by(version , disease , geo_value , time_value ) %> %
772
+ slice_max(version_timestamp ) %> %
773
+ ungroup()
760
774
775
+ # Only keep the values with the latest version_timestamp for a given version date.
776
+ # We only need to do this for the versions in compactified, as the other versions can't possibly change
761
777
unchanged <- previous_archive %> % filter(! (version %in% unique(compactified $ version )))
762
- # only keep the last value for a given version (so across version_timestamps)
763
- # we only need to do this for the versions in compactified, as the other versions can't possibly change
764
- one_per_day <-
765
- previous_archive %> %
778
+ one_per_day <- previous_archive %> %
766
779
filter(version %in% unique(compactified $ version )) %> %
767
780
bind_rows(compactified ) %> %
768
781
group_by(geo_value , disease , time_value , version ) %> %
769
- arrange(version_timestamp ) %> %
770
- filter(row_number() == n()) %> %
782
+ slice_max(version_timestamp ) %> %
771
783
ungroup() %> %
772
784
bind_rows(unchanged )
773
785
qs :: qsave(one_per_day , here :: here(" cache/archive_timestamped.parquet" ))
774
786
aws.s3 :: put_object(here :: here(" cache/archive_timestamped.parquet" ), " archive_timestamped.parquet" , bucket = " forecasting-team-data" )
775
787
}
788
+
789
+ if (nrow(one_per_day ) == 0 ) {
790
+ cli :: cli_warn(" No data found for {disease_name}" )
791
+ return (NULL )
792
+ }
793
+
794
+ # Return the archive for the disease of interest.
776
795
one_per_day %> %
777
796
filter(disease == disease_name ) %> %
778
797
select(- version_timestamp , - endpoint , - disease ) %> %
0 commit comments