Skip to content

Commit 3bb5a82

Browse files
authored
Merge pull request #5 from cmu-delphi/ndefries/epipredict-support
Move `epipredict` vignette datasets to `epidatasets`
2 parents d91aaee + 47b3ca0 commit 3bb5a82

37 files changed

+820
-58
lines changed

DESCRIPTION

+3-2
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,10 @@ Authors@R: c(
77
person("Nat", "DeFries", email="[email protected]", role = c("cre", "aut")),
88
person("Johns Hopkins University Center for Systems Science and Engineering", role = "dtc", comment = "Owner of COVID-19 cases and deaths data from the COVID-19 Data Repository"),
99
person("Johns Hopkins University", role = "cph", comment = "Copyright holder of COVID-19 cases and deaths data from the COVID-19 Data Repository"),
10-
person("Carnegie Mellon University Delphi Group", role = "dtc", comment = "Owner of masking and social-distancing data from the COVID-19 Trends and Impacts Survey. Owner of claims-based CLI data from the Delphi Epidata API"),
10+
person("Carnegie Mellon University Delphi Group", role = "dtc", comment = "Owner of masking, social-distancing, and CLI data from the COVID-19 Trends and Impact Survey. Owner of claims-based CLI data from the Delphi Epidata API"),
1111
person("The COVID-19 Canada Open Data Working Group", role = "dtc", comment = "Owner of Canadian COVID-19 cases rates from the Covid19Canada data repository"),
12-
person("Statistics Canada", role = "dtc", comment = "Owner of Canadian graduate employment income data from the Statistics Canada website")
12+
person("Statistics Canada", role = "dtc", comment = "Owner of Canadian graduate employment income data from the Statistics Canada website"),
13+
person("Google", role = "dtc", comment = "Collaborator on CLI data from the Google symptom surveys")
1314
)
1415
Description: This package contains data sets used to compile vignettes and
1516
other documentation in Delphi R Packages. The goal is to avoid calls

R/epipredict-data.R

+148-6
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,11 @@
33
#' This data source of confirmed COVID-19 cases and deaths is based on reports
44
#' made available by the Center for Systems Science and Engineering at Johns
55
#' Hopkins University, as downloaded from the CMU Delphi COVIDcast Epidata
6-
#' API. This example data is a snapshot as of March 20, 2024, and
6+
#' API. This example data is a snapshot as of May 31, 2022, and
77
#' ranges from December 31, 2020 to December 31, 2021. It
8-
#' includes all states. It is used in the {epiprocess} correlation vignette.
8+
#' includes all states.
99
#'
10-
#' @format An [`epiprocess::epi_df`] (object of class `c("epi_df", "tbl_df", "tbl", "data.frame")`) with 37576 rows and 4 columns.
10+
#' @format An [`epiprocess::epi_df`] (object of class `c("epi_df", "tbl_df", "tbl", "data.frame")`) with 20496 rows and 4 columns.
1111
#' @section Data dictionary:
1212
#' The data has columns:
1313
#' \describe{
@@ -76,7 +76,7 @@
7676
#' ranges from June 4, 2021 to December 31, 2021.
7777
#' It is limited to California, Florida, Texas, New Jersey, and New York.
7878
#'
79-
#' @format A [`tibble::tibble`] (object of class `c("tbl_df", "tbl", "data.frame")`) with 1055 rows and 4 columns.
79+
#' @format An [`epiprocess::epi_df`] (object of class `c("epi_df", "tbl_df", "tbl", "data.frame")`) with 1055 rows and 4 columns.
8080
#' @section Data dictionary:
8181
#' The data has columns:
8282
#' \describe{
@@ -195,15 +195,14 @@
195195
#' www.statcan.gc.ca. This example data is a snapshot as of September 18,
196196
#' 2024, and ranges from 2010 to 2017 (yearly).
197197
#'
198-
#' @format An [`epiprocess::epi_df`] (object of class `c("epi_df", "tbl_df", "tbl", "data.frame")`) with 10193 rows and 8 columns.
198+
#' @format An [`epiprocess::epi_df`] (object of class `c("epi_df", "tbl_df", "tbl", "data.frame")`) with 1445 rows and 7 columns.
199199
#' @section Data dictionary:
200200
#' The data has columns:
201201
#' \describe{
202202
#' \item{geo_value}{The province in Canada associated with each
203203
#' row of measurements.}
204204
#' \item{time_value}{The time value, a year integer in YYYY format}
205205
#' \item{edu_qual}{The education qualification}
206-
#' \item{fos}{The field of study}
207206
#' \item{age_group}{The age group; either 15 to 34 or 35 to 64}
208207
#' \item{num_graduates}{The number of graduates for the given row of characteristics}
209208
#' \item{med_income_2y}{The median employment income two years after graduation}
@@ -226,3 +225,146 @@
226225
#' drop the level-specific rows.
227226
#' * No modifications were made to the time range of the data.
228227
"grad_employ_subset"
228+
229+
#' Percent CLI from different surveys, compared to ground truth COVID incidence in a subset of counties
230+
#'
231+
#' @description
232+
#' Data set for more than 400 US counties containing CLI
233+
#' (COVID-19-like-illness) incidence derived from two surveys, and a reference signal as
234+
#' reported by JHU CSSE. This example data is a snapshot as of September 21,
235+
#' 2020, and ranges from April 11, 2020 to September 01, 2020.
236+
#'
237+
#' The reference signal `case` is based on reports made available
238+
#' by the Center for Systems Science and Engineering at Johns Hopkins
239+
#' University.
240+
#'
241+
#' One survey was
242+
#' \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/google-survey.html}{run by Google},
243+
#' in partnership with Delphi.
244+
#'
245+
#' The other survey, the
246+
#' \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/fb-survey.html}{COVID-19 Trends and Impact Survey},
247+
#' was run by Delphi in collaboration with Facebook.
248+
#'
249+
#' Data is reported for counties that had at least 200 cumulative COVID-19 cases
250+
#' on May 14, 2020, according to JHU CSSE.
251+
#'
252+
#' @format An [`epiprocess::epi_df`] (object of class `c("epi_df", "tbl_df", "tbl", "data.frame")`) with 63840 rows and 5 columns.
253+
#' @section Data dictionary:
254+
#' The data has columns:
255+
#' \describe{
256+
#' \item{geo_value}{The 5-digit county FIPS code associated with each
257+
#' row of measurements.}
258+
#' \item{time_value}{The time value, a date in YYYY-MM-DD format}
259+
#' \item{goog}{Seven-day average of CLI (covid-like-illness) cases from the Google survey}
260+
#' \item{fb}{Seven-day average of CLI (covid-like-illness) cases from CTIS}
261+
#' \item{case}{Reference signal. Seven-day average of new confirmed COVID-19 cases per 100,000 population, as reported by JHU CSSE}
262+
#' }
263+
#' @source
264+
#' This object contains a modified part of the \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. This data set is licensed under the terms of the
265+
#' \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license}
266+
#' by Johns Hopkins University on behalf of its Center for Systems Science and Engineering.
267+
#' Copyright Johns Hopkins University 2020.
268+
#'
269+
#' Modifications:
270+
#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: The signal `confirmed_cumulative_num` was used to determine eligibility for inclusion. The signal `confirmed_7dav_incidence_prop` was computed by Delphi from the original JHU-CSSE data by calculating moving averages of the preceding 7 days, so the signal for June 7 is the average of the underlying data for June 1 through 7, inclusive.
271+
#' * Furthermore, the data has been limited to a specific time range, the
272+
#' signal names slightly altered, and formatted into an `epi_df`.
273+
#'
274+
#' This object contains a modified part of the
275+
#' \href{https://cmu-delphi.github.io/delphi-epidata/symptom-survey/#covid-19-trends-and-impact-survey}{data
276+
#' aggregations in the API} that are prepared from the
277+
#' \href{https://www.pnas.org/doi/full/10.1073/pnas.2111454118}{COVID-19
278+
#' Trends and Impact Survey}; see the first link for more information on
279+
#' citing in publications.
280+
#' The data is made available via the
281+
#' \href{https://cmu-delphi.github.io/delphi-epidata/}{Delphi Epidata API}.
282+
#'
283+
#' These aggregations are licensed under the terms of
284+
#' the \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons
285+
#' Attribution license}.
286+
#'
287+
#' Modifications:
288+
#' * The data has been limited to a very small number of rows, the
289+
#' signal names slightly altered, and formatted into an `epi_df`.
290+
#'
291+
#' This object contains a modified part of the
292+
#' \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/google-survey.html}{Google symptom surveys}.
293+
#' Aggregations based on the survey are licensed under the terms of
294+
#' the \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons
295+
#' Attribution license}.
296+
#'
297+
#' Modifications:
298+
#' * The data has been limited to a very small number of rows, the
299+
#' signal names slightly altered, and formatted into an `epi_df`.
300+
"county_smoothed_cli_comparison"
301+
302+
#' Daily COVID-19 case and death rates from all states in archive format
303+
#'
304+
#' @description
305+
#' Data set containing COVID-19 case and death rates (counts per 100000
306+
#' population) as reported by the Delphi API, based on reports made available
307+
#' by the Center for Systems Science and Engineering at Johns Hopkins
308+
#' University. This example data ranges from March 1, 2020 to November 30,
309+
#' 2021, issued monthly on the first day of each month from September 1, 2020
310+
#' to December 1, 2021. It includes all US states, Washington DC, Guam, Puerto
311+
#' Rico, and the Virgin Islands.
312+
#'
313+
#' @format An [`epiprocess::epi_archive`]. The DT attribute contains the data formatted as a [`data.table::data.table`] (object of class `c("data.table", "data.frame")`) with 72086 rows and 7 columns.
314+
#' @section Data dictionary:
315+
#' The data in the `epi_archive$DT` attribute has columns:
316+
#' \describe{
317+
#' \item{geo_value}{the geographic value associated with each row of measurements.}
318+
#' \item{time_value}{the time value associated with each row of measurements.}
319+
#' \item{version}{the time value specifying the version for each row of measurements. }
320+
#' \item{case_rate}{Number of new confirmed cases due to COVID-19 per 100,000 population, daily}
321+
#' \item{case_rate_7d_av}{7-day average signal of number of new confirmed cases due to COVID-19 per 100,000 population, daily}
322+
#' \item{death_rate}{Number of new confirmed deaths due to COVID-19 per 100,000 population, daily}
323+
#' \item{death_rate_7d_av}{7-day average signal of number of new confirmed deaths due to COVID-19 per 100,000 population, daily}
324+
#' }
325+
#' @source
326+
#' This object contains a modified part of the \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. This data set is licensed under the terms of the
327+
#' \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license}
328+
#' by Johns Hopkins University on behalf of its Center for Systems Science and Engineering.
329+
#' Copyright Johns Hopkins University 2020.
330+
#'
331+
#' Modifications:
332+
#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: The signals `case_rate` and `death_rate` are taken directly from the JHU CSSE GitHub repo without changes, served through the Delphi API.
333+
#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: Averaged signals were computed from the original JHU-CSSE data by calculating moving averages of the preceding 7 days, so the signal for June 7 is the average of the underlying data for June 1 through 7, inclusive.
334+
#' * Furthermore, the data has been limited to a specific time range, the
335+
#' signal names slightly altered, and formatted into an `epi_archive`.
336+
"case_death_rate_archive"
337+
338+
#' Daily COVID-19 doctor visits and cases from all states in archive format
339+
#' @description
340+
#' This data source is based on information about outpatient visits, provided
341+
#' to us by health system partners, and also contains confirmed COVID-19
342+
#' cases based on reports made available by the Center for Systems Science
343+
#' and Engineering at Johns Hopkins University. This example data ranges from
344+
#' June 1, 2020 to December 1, 2021, issued on dates from June 1, 2020 to December 1,
345+
#' 2021. It includes all US states.
346+
#'
347+
#' It is used in the {epipredict} `sliding` article.
348+
#'
349+
#' @format An [`epiprocess::epi_archive`]. The DT attribute contains the data formatted as a [`data.table::data.table`] (object of class `c("data.table", "data.frame")`) with 1514489 rows and 5 columns.
350+
#' @section Data dictionary:
351+
#' The data in the `epi_archive$DT` attribute has columns:
352+
#' \describe{
353+
#' \item{geo_value}{the geographic value associated with each row of measurements.}
354+
#' \item{time_value}{the time value associated with each row of measurements.}
355+
#' \item{version}{the time value specifying the version for each row of measurements. }
356+
#' \item{percent_cli}{percentage of doctor’s visits with CLI (COVID-like illness) computed from medical insurance claims}
357+
#' \item{case_rate}{7-day average signal of number of new confirmed cases due to COVID-19 per 100,000 population, daily}
358+
#' }
359+
#' @source
360+
#' This object contains a modified part of the \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. This data set is licensed under the terms of the
361+
#' \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license}
362+
#' by Johns Hopkins University on behalf of its Center for Systems Science and Engineering.
363+
#' Copyright Johns Hopkins University 2020.
364+
#'
365+
#' Modifications:
366+
#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html}{From the COVIDcast Doctor Visits API}: The signal `percent_cli` is taken directly from the API without changes.
367+
#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: `case_rate` signal was computed by Delphi from the original JHU-CSSE data by calculating moving averages of the preceding 7 days, so the signal for June 7 is the average of the underlying data for June 1 through 7, inclusive.
368+
#' * Furthermore, the data has been limited to a very small number of rows, the
369+
#' signal names slightly altered, and formatted into an `epi_archive`.
370+
"archive_cases_dv_subset_all_states"

R/epiprocess-data.R

+42-1
Original file line numberDiff line numberDiff line change
@@ -207,4 +207,45 @@
207207
#' These signals are taken directly from the JHU CSSE \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository} without changes.
208208
#' * Furthermore, the data has been limited to a very small number of rows,
209209
#' formatted into an `epi_df`, and the signal names slightly altered.
210-
"jhu_confirmed_cumulative_num"
210+
"covid_confirmed_cumulative_num"
211+
212+
#' JHU daily COVID-19 case and death rates from all states
213+
#'
214+
#' This data source of confirmed COVID-19 cases and deaths is based on reports
215+
#' made available by the Center for Systems Science and Engineering at Johns
216+
#' Hopkins University, as downloaded from the CMU Delphi COVIDcast Epidata
217+
#' API. This example data is a snapshot as of May 31, 2022, and
218+
#' ranges from March 1, 2020 to December 31, 2021. It
219+
#' includes all states.
220+
#'
221+
#' It is used in the {epiprocess} correlation vignettes.
222+
#'
223+
#' @format An [`epiprocess::epi_df`] (object of class `c("epi_df", "tbl_df", "tbl", "data.frame")`) with 37576 rows and 4 columns.
224+
#' @section Data dictionary:
225+
#' The data has columns:
226+
#' \describe{
227+
#' \item{geo_value}{the geographic value associated with each row
228+
#' of measurements.}
229+
#' \item{time_value}{the time value associated with each row of measurements.}
230+
#' \item{case_rate}{7-day average signal of number of new
231+
#' confirmed COVID-19 cases per 100,000 population, daily}
232+
#' \item{death_rate}{7-day average signal of number of new confirmed
233+
#' deaths due to COVID-19 per 100,000 population, daily}
234+
#' }
235+
#' @source This object contains a modified part of the
236+
#' \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University}
237+
#' as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}.
238+
#' This data set is licensed under the terms of the
239+
#' \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license}
240+
#' by the Johns Hopkins University on behalf of its Center for Systems Science
241+
#' and Engineering. Copyright Johns Hopkins University 2020.
242+
#'
243+
#' Modifications:
244+
#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}:
245+
#' These signals are taken directly from the JHU CSSE
246+
#' \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository}
247+
#' without changes. The 7-day average signals are computed by Delphi by
248+
#' calculating moving averages of the preceding 7 days, so the signal for
249+
#' June 7 is the average of the underlying data for June 1 through 7,
250+
#' inclusive.
251+
"covid_case_death_rates_extended"

R/sysdata.rda

5.74 MB
Binary file not shown.

data-raw/_helper.R

+3-1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ save_to_sysdata <- function(obj, obj_name) {
1919
list = names(sysdata_env),
2020
file = internal_data_path,
2121
envir = sysdata_env,
22-
compress = "xz"
22+
compress = "xz",
23+
# For backwards compatibility with older R versions (<3.5)
24+
version = 2
2325
)
2426
}

data-raw/_run_all.R

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Runs every data-generating script in `data-raw/`, one after another.
# Files whose name starts with "_" are helpers and are skipped.

library(here)

# Directory holding the data-generation scripts, resolved from the project root.
internal_data_path <- here("data-raw")

# Only pick up R scripts. The pattern is anchored with `$` so that files whose
# extension merely starts with ".R" (e.g. ".Rmd", ".RData", ".Rds") are not
# handed to `source()`.
scripts <- list.files(
  internal_data_path,
  pattern = "[.]R$",
  full.names = FALSE
)

for (script in scripts) {
  if (startsWith(script, "_")) {
    # File is a helper script and does not generate data.
    next
  }
  # NOTE(review): the path is rebuilt from the literal directory name rather
  # than from `internal_data_path`; `_helper.R` also uses a variable of that
  # name, so a sourced script presumably may reassign it -- confirm.
  path <- here(file.path("data-raw", script))
  message("running ", path, " ...")
  source(path)
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# Builds `archive_cases_dv_subset_all_states_tbl`: doctor-visits CLI and
# JHU-CSSE case-rate signals at the state level, with all issues in the
# window kept as versions, merged into one table and stored as internal data.

library(dplyr)
library(epidatr)
library(epiprocess)

source(here::here("data-raw/_helper.R"))

# Both signals are requested over the same window, used for the reference
# dates (`time_values`) and for the issue dates (`issues`).
date_window <- epirange(20200601, 20211201)
# Geo codes removed from both signals (DC and territories).
non_state_geos <- c("as", "gu", "dc", "mp", "pr", "vi")

dv_subset <- pub_covidcast(
  source = "doctor-visits",
  signals = "smoothed_adj_cli",
  time_type = "day",
  geo_type = "state",
  time_values = date_window,
  geo_values = "*",
  issues = date_window
) %>%
  select(geo_value, time_value, version = issue, percent_cli = value) %>%
  # Drop DC and territories.
  filter(!(geo_value %in% non_state_geos)) %>%
  # We're using compactify=FALSE here and below to avoid some testthat test
  # failures on tests that were based on a non-compactified version.
  as_epi_archive(compactify = FALSE)

# NOTE(review): the value column is named `case_rate_7d_av` here, while the
# roxygen data dictionary for `archive_cases_dv_subset_all_states` lists it as
# `case_rate` -- confirm whether it is renamed when the archive is rebuilt.
case_rate_subset <- pub_covidcast(
  source = "jhu-csse",
  signals = "confirmed_7dav_incidence_prop",
  time_type = "day",
  geo_type = "state",
  time_values = date_window,
  geo_values = "*",
  issues = date_window
) %>%
  select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>%
  filter(!(geo_value %in% non_state_geos)) %>%
  as_epi_archive(compactify = FALSE)

# Use `epiprocess::epix_merge` to avoid having to reimplement `sync`ing
# behavior. After merging, convert DT component back to tibble.
archive_cases_dv_subset_all_states_tbl <- epix_merge(
  dv_subset, case_rate_subset,
  sync = "locf",
  compactify = TRUE
)$DT %>%
  as_tibble()

# We're trying to do:
# usethis::use_data(archive_cases_dv_subset_all_states_tbl, internal = TRUE, overwrite = TRUE, compress = "xz")
# but `usethis::use_data` can only store multiple objects if they're added in
# the same call. This workaround is from
# https://github.com/r-lib/usethis/issues/1512
save_to_sysdata(archive_cases_dv_subset_all_states_tbl, "archive_cases_dv_subset_all_states_tbl")

0 commit comments

Comments
 (0)