Skip to content

Commit 3e4a1fa

Browse files
authored
Merge pull request #17 from cmu-delphi/ds/epi-slide-updates
fix: update tooling book
2 parents eeebbc8 + 1a14295 commit 3e4a1fa

28 files changed

+45541
-27811
lines changed

_freeze/archive/execute-results/html.json

+2-2
Large diffs are not rendered by default.

_freeze/archive/figure-html/unnamed-chunk-8-1.svg

+1-1
Loading

_freeze/epidf/execute-results/html.json

+2-2
Large diffs are not rendered by default.

_freeze/epidf/figure-html/unnamed-chunk-11-1.svg

+474-467
Loading

_freeze/epidf/figure-html/unnamed-chunk-13-1.svg

+715-708
Loading

_freeze/epidf/figure-html/unnamed-chunk-15-1.svg

+1,998-1,991
Loading

_freeze/outliers/execute-results/html.json

+2-2
Large diffs are not rendered by default.

_freeze/outliers/figure-html/unnamed-chunk-3-1.svg

+532-525
Loading

_freeze/outliers/figure-html/unnamed-chunk-7-1.svg

+1,244-1,038
Loading

_freeze/outliers/figure-html/unnamed-chunk-7-2.svg

+1,222-1,024
Loading

_freeze/outliers/figure-html/unnamed-chunk-9-1.svg

+538-531
Loading

_freeze/slide/execute-results/html.json

+2-2
Large diffs are not rendered by default.

_freeze/slide/figure-html/unnamed-chunk-10-1.svg

+308
Loading

_freeze/slide/figure-html/unnamed-chunk-12-1.svg

+16,583-2,478
Loading

_freeze/slide/figure-html/unnamed-chunk-16-1.svg

+2,860
Loading

_freeze/slide/figure-html/unnamed-chunk-8-1.svg

+12,068-12,085
Loading

_freeze/sliding-forecasters/execute-results/html.json

+2-2
Large diffs are not rendered by default.

_freeze/sliding-forecasters/figure-html/plot-ar-asof-1.svg

+841-843
Loading

_freeze/sliding-forecasters/figure-html/plot-arx-1.svg

+610-600
Loading

_freeze/sliding-forecasters/figure-html/plot-can-fc-boost-1.svg

+2,866-2,790
Loading

_freeze/sliding-forecasters/figure-html/plot-can-fc-lr-1.svg

+1,907-1,919
Loading

archive.qmd

+21-17
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ claims, available through the [COVIDcast
1616
API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html). This
1717
signal is subject to very heavy and regular revision; you can read more about it
1818
on its [API documentation
19-
page](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html). We'll use the offline version stored in `{epidatasets}`.
20-
19+
page](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html).
20+
We'll use the offline version stored in `{epidatasets}`.
2121

2222
```{r, include=FALSE}
2323
source("_common.R")
@@ -36,7 +36,7 @@ tibble, provided that it has (at least) the following columns:
3636
the data for January 14, 2022 that were available one day later.
3737

3838
As we can see from the above, the data frame returned by
39-
`epidatr::covidcast()` has the columns required for the `epi_archive`
39+
`epidatr::pub_covidcast()` has the columns required for the `epi_archive`
4040
format, so we use
4141
`as_epi_archive()` to cast it into `epi_archive` format.[^1]
4242

@@ -47,7 +47,7 @@ to the [compactify vignette](https://cmu-delphi.github.io/epiprocess/articles/co
4747

4848
```{r}
4949
x <- archive_cases_dv_subset_dt %>%
50-
select(geo_value, time_value, version, percent_cli) %>%
50+
select(geo_value, time_value, version, percent_cli) %>%
5151
as_epi_archive(compactify = TRUE)
5252
5353
class(x)
@@ -70,8 +70,8 @@ below). There can only be a single row per unique combination of key variables,
7070
and therefore the key variables are critical for figuring out how to generate a
7171
snapshot of data from the archive, as of a given version (also described below).
7272

73-
```{r, error=TRUE}
74-
key(x$DT)
73+
```{r}
74+
data.table::key(x$DT)
7575
```
7676

7777
In general, the last version of each observation is carried forward (LOCF) to
@@ -100,7 +100,7 @@ the signal variables as of a given version. This can be accessed via
100100
`epix_as_of()`.
101101

102102
```{r}
103-
x_snapshot <- epix_as_of(x, max_version = as.Date("2021-06-01"))
103+
x_snapshot <- epix_as_of(x, version = as.Date("2021-06-01"))
104104
class(x_snapshot)
105105
x_snapshot
106106
max(x_snapshot$time_value)
@@ -120,7 +120,7 @@ this case, since updates to the current version may still come in at a later
120120
point in time, due to various reasons, such as synchronization issues.
121121

122122
```{r}
123-
x_latest <- epix_as_of(x, max_version = max(x$DT$version))
123+
x_latest <- epix_as_of(x, version = max(x$DT$version))
124124
```
125125

126126
Below, we pull several snapshots from the archive, spaced one month apart. We
@@ -134,27 +134,32 @@ versions <- seq(as.Date("2020-06-01"), self_max - 1, by = "1 month")
134134
snapshots <- map(
135135
versions,
136136
function(v) {
137-
epix_as_of(x, max_version = v) %>% mutate(version = v)
138-
}) %>%
137+
epix_as_of(x, version = v) %>% mutate(version = v)
138+
}
139+
) %>%
139140
list_rbind() %>%
140141
bind_rows(x_latest %>% mutate(version = self_max)) %>%
141142
mutate(latest = version == self_max)
142143
```
143144

144145
```{r, fig.height=7}
145146
#| code-fold: true
146-
ggplot(snapshots %>% filter(!latest),
147-
aes(x = time_value, y = percent_cli)) +
147+
ggplot(
148+
snapshots %>% filter(!latest),
149+
aes(x = time_value, y = percent_cli)
150+
) +
148151
geom_line(aes(color = factor(version)), na.rm = TRUE) +
149152
geom_vline(aes(color = factor(version), xintercept = version), lty = 2) +
150-
facet_wrap(~ geo_value, scales = "free_y", ncol = 1) +
153+
facet_wrap(~geo_value, scales = "free_y", ncol = 1) +
151154
scale_x_date(minor_breaks = "month", date_labels = "%b %Y") +
152155
scale_color_viridis_d(option = "A", end = .9) +
153156
labs(x = "Date", y = "% of doctor's visits with CLI") +
154157
theme(legend.position = "none") +
155-
geom_line(data = snapshots %>% filter(latest),
156-
aes(x = time_value, y = percent_cli),
157-
inherit.aes = FALSE, color = "black", na.rm = TRUE)
158+
geom_line(
159+
data = snapshots %>% filter(latest),
160+
aes(x = time_value, y = percent_cli),
161+
inherit.aes = FALSE, color = "black", na.rm = TRUE
162+
)
158163
```
159164

160165
We can see some interesting and highly nontrivial revision behavior: at some
@@ -164,7 +169,6 @@ they overestimate it (both states towards the beginning of 2021), though not
164169
quite as dramatically. Modeling the revision process, which is often called
165170
*backfill modeling*, is an important statistical problem in and of itself.
166171

167-
168172
## Merging `epi_archive` objects
169173

170174
Now we demonstrate how to merge two `epi_archive` objects together, e.g., so

epidf.qmd

+56-51
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
source("_common.R")
66
```
77

8-
We'll start by showing how to get data into
8+
We'll start by showing how to get data into
99
`epi_df`, which is just
1010
a tibble with a bit of special structure, and is the format assumed by all of
1111
the functions in the `epiprocess` package. An `epi_df` object has (at least) the
@@ -43,16 +43,13 @@ cases <- pub_covidcast(
4343
colnames(cases)
4444
```
4545

46-
As we can see, a data frame returned by `epidatr::covidcast()` has the
46+
As we can see, a data frame returned by `epidatr::pub_covidcast()` has the
4747
columns required for an `epi_df` object (along with many others). We can use
4848
`as_epi_df()`, with specification of some relevant metadata, to bring the data
4949
frame into `epi_df` format.
5050

5151
```{r, message = FALSE}
52-
x <- as_epi_df(cases,
53-
geo_type = "state",
54-
time_type = "day",
55-
as_of = max(cases$issue)) %>%
52+
x <- as_epi_df(cases, as_of = max(cases$issue)) %>%
5653
select(geo_value, time_value, total_cases = value)
5754
5855
class(x)
@@ -64,7 +61,7 @@ attributes(x)$metadata
6461
## Some details on metadata
6562

6663
In general, an `epi_df` object has the following fields in its metadata:
67-
64+
6865
* `geo_type`: the type for the geo values.
6966
* `time_type`: the type for the time values.
7067
* `as_of`: the time value at which the given data were available.
@@ -86,10 +83,10 @@ data set. See the [archive
8683
vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for
8784
more.
8885

89-
If any of the `geo_type`, `time_type`, or `as_of` arguments are missing in a
86+
If any of the `geo_type`, `time_type`, or `as_of` arguments are missing in a
9087
call to `as_epi_df()`, then this function will try to infer them from the passed
9188
object. Usually, `geo_type` and `time_type` can be inferred from the `geo_value`
92-
and `time_value` columns, respectively, but inferring the `as_of` field is not
89+
and `time_value` columns, respectively, but inferring the `as_of` field is not
9390
as easy. See the documentation for `as_epi_df()` for more details.
9491

9592
```{r}
@@ -109,25 +106,29 @@ In the following examples we will show how to create an `epi_df` with additional
109106
set.seed(12345)
110107
ex1 <- tibble(
111108
geo_value = rep(c("ca", "fl", "pa"), each = 3),
112-
county_code = c("06059", "06061", "06067", "12111", "12113", "12117",
113-
"42101", "42103", "42105"),
109+
county_code = c(
110+
"06059", "06061", "06067", "12111", "12113", "12117",
111+
"42101", "42103", "42105"
112+
),
114113
time_value = rep(
115-
seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "1 day"),
116-
length.out = 9),
114+
seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "1 day"),
115+
length.out = 9
116+
),
117117
value = rpois(9, 5)
118-
) %>%
118+
) %>%
119119
as_tsibble(index = time_value, key = c(geo_value, county_code))
120120
121-
ex1 <- as_epi_df(x = ex1, geo_type = "state", time_type = "day", as_of = "2020-06-03")
121+
ex1 <- as_epi_df(x = ex1, as_of = "2020-06-03")
122122
```
123123

124124
The metadata now includes `county_code` as an extra key.
125+
125126
```{r}
126127
attr(ex1, "metadata")
127128
```
128129

129130

130-
### Dealing with misspecified column names
131+
### Dealing with misspecified column names
131132

132133
`epi_df` requires there to be columns `geo_value` and `time_value`; if they do not exist, then `as_epi_df()` throws an error.
133134

@@ -136,27 +137,27 @@ ex2 <- data.frame(
136137
state = rep(c("ca", "fl", "pa"), each = 3), # misnamed
137138
pol = rep(c("blue", "swing", "swing"), each = 3), # extra key
138139
reported_date = rep(
139-
seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"),
140-
length.out = 9), # misnamed
140+
seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"),
141+
length.out = 9
142+
), # misnamed
141143
value = rpois(9, 5)
142-
)
143-
ex2 %>% as_epi_df()
144+
)
145+
ex2 %>% as_epi_df()
144146
```
145147

146-
The columns should be renamed to match `epi_df` format.
148+
The columns should be renamed to match `epi_df` format.
147149

148150
```{r}
149-
ex2 <- ex2 %>%
151+
ex2 <- ex2 %>%
150152
rename(geo_value = state, time_value = reported_date) %>%
151-
as_epi_df(geo_type = "state",
152-
as_of = "2020-06-03",
153-
additional_metadata = list(other_keys = "pol")
153+
as_epi_df(
154+
as_of = "2020-06-03",
155+
other_keys = "pol"
154156
)
155157
156158
attr(ex2, "metadata")
157159
```
158160

159-
160161
### Adding additional keys to an `epi_df` object
161162

162163
In the above examples, all the keys are added to objects prior to conversion to
@@ -166,22 +167,23 @@ We'll look at an included dataset and filter to a single state for simplicity.
166167
```{r}
167168
ex3 <- jhu_csse_county_level_subset %>%
168169
filter(time_value > "2021-12-01", state_name == "Massachusetts") %>%
169-
slice_tail(n = 6)
170-
170+
slice_tail(n = 6)
171+
171172
attr(ex3, "metadata") # geo_type is county currently
172173
```
173174

174-
Now we add `state` (MA) and `pol` as new columns to the data and as new keys to the metadata. The "state" `geo_type` anticipates lower-case abbreviations, so we'll match that.
175+
Now we add `state` (MA) and `pol` as new columns to the data and as new keys to the metadata. The "state" `geo_type` anticipates lower-case abbreviations, so we'll match that.
175176

176177
```{r}
177-
ex3 <- ex3 %>%
178+
ex3 <- ex3 %>%
178179
as_tibble() %>% # drop the `epi_df` class before adding additional metadata
179180
mutate(
180181
state = rep(tolower("MA"), 6),
181-
pol = rep(c("blue", "swing", "swing"), each = 2)) %>%
182-
as_epi_df(additional_metadata = list(other_keys = c("state", "pol")))
182+
pol = rep(c("blue", "swing", "swing"), each = 2)
183+
) %>%
184+
as_epi_df(other_keys = c("state", "pol"))
183185
184-
attr(ex3,"metadata")
186+
attr(ex3, "metadata")
185187
```
186188

187189
Note that the two additional keys we added, `state` and `pol`, are specified as a character vector via the `other_keys` argument of `as_epi_df()`. They must be specified in this manner so that downstream actions on the `epi_df`, like model fitting and prediction, can recognize and use these keys.
@@ -199,38 +201,38 @@ package. Of course, we can also write custom code for other downstream uses,
199201
like plotting, which is pretty easy to do with `ggplot2`.
200202

201203
```{r, message = FALSE, warning = FALSE}
202-
ggplot(x, aes(x = time_value, y = total_cases, color = geo_value)) +
204+
ggplot(x, aes(x = time_value, y = total_cases, color = geo_value)) +
203205
geom_line() +
204206
scale_color_brewer(palette = "Set1") +
205207
scale_x_date(minor_breaks = "month", date_labels = "%b %Y") +
206208
labs(x = "Date", y = "Cumulative COVID-19 cases", color = "State")
207209
```
208210

209-
Finally, we'll examine some data from other packages just to show how
210-
we might get them into `epi_df` format.
211-
The first is data on daily new (not cumulative) SARS
212-
cases in Canada in 2003, from the
211+
Finally, we'll examine some data from other packages just to show how
212+
we might get them into `epi_df` format.
213+
The first is data on daily new (not cumulative) SARS
214+
cases in Canada in 2003, from the
213215
[outbreaks](https://github.com/reconverse/outbreaks) package. New cases are
214216
broken into a few categories by provenance.
215217

216218
```{r}
217219
x <- outbreaks::sars_canada_2003 %>%
218220
mutate(geo_value = "ca") %>%
219221
select(geo_value, time_value = date, starts_with("cases")) %>%
220-
as_epi_df(geo_type = "nation")
222+
as_epi_df()
221223
222224
head(x)
223225
```
224226

225227
```{r}
226228
#| code-fold: true
227-
x <- x %>%
229+
x <- x %>%
228230
pivot_longer(starts_with("cases"), names_to = "type") %>%
229231
mutate(type = substring(type, 7))
230232
231233
ggplot(x, aes(x = time_value, y = value)) +
232234
geom_col(aes(fill = type), just = 0.5) +
233-
scale_y_continuous(breaks = 0:4*2, expand = expansion(c(0, 0.05))) +
235+
scale_y_continuous(breaks = 0:4 * 2, expand = expansion(c(0, 0.05))) +
234236
scale_x_date(minor_breaks = "month", date_labels = "%b %Y") +
235237
labs(x = "Date", y = "SARS cases in Canada", fill = "Type")
236238
```
@@ -243,27 +245,30 @@ x <- outbreaks::ebola_sierraleone_2014 %>%
243245
cases = ifelse(status == "confirmed", 1, 0),
244246
province = case_when(
245247
district %in% c("Kailahun", "Kenema", "Kono") ~ "Eastern",
246-
district %in% c("Bombali", "Kambia", "Koinadugu",
247-
"Port Loko", "Tonkolili") ~ "Northern",
248+
district %in% c(
249+
"Bombali", "Kambia", "Koinadugu",
250+
"Port Loko", "Tonkolili"
251+
) ~ "Northern",
248252
district %in% c("Bo", "Bonthe", "Moyamba", "Pujehun") ~ "Sourthern",
249-
district %in% c("Western Rural", "Western Urban") ~ "Western")
250-
) %>%
253+
district %in% c("Western Rural", "Western Urban") ~ "Western"
254+
)
255+
) %>%
251256
select(geo_value = province, time_value = date_of_onset, cases) %>%
252257
filter(cases == 1) %>%
253-
group_by(geo_value, time_value) %>%
258+
group_by(geo_value, time_value) %>%
254259
summarise(cases = sum(cases)) %>%
255-
as_epi_df(geo_type = "province")
260+
as_epi_df()
256261
```
257262

258263
```{r}
259264
#| code-fold: true
260265
#| fig-width: 8
261266
#| fig-height: 6
262-
ggplot(x, aes(x = time_value, y = cases)) +
263-
geom_col(aes(fill = geo_value), show.legend = FALSE) +
264-
facet_wrap(~ geo_value, scales = "free_y") +
267+
ggplot(x, aes(x = time_value, y = cases)) +
268+
geom_col(aes(fill = geo_value), show.legend = FALSE) +
269+
facet_wrap(~geo_value, scales = "free_y") +
265270
scale_x_date(minor_breaks = "month", date_labels = "%b %Y") +
266-
labs(x = "Date", y = "Confirmed cases of Ebola in Sierra Leone")
271+
labs(x = "Date", y = "Confirmed cases of Ebola in Sierra Leone")
267272
```
268273

269274

0 commit comments

Comments
 (0)