5
5
source("_common.R")
6
6
```
7
7
8
- We'll start by showing how to get data into
8
+ We'll start by showing how to get data into
9
9
` epi_df ` , which is just
10
10
a tibble with a bit of special structure, and is the format assumed by all of
11
11
the functions in the ` epiprocess ` package. An ` epi_df ` object has (at least) the
@@ -43,16 +43,13 @@ cases <- pub_covidcast(
43
43
colnames(cases)
44
44
```
45
45
46
- As we can see, a data frame returned by ` epidatr::covidcast () ` has the
46
+ As we can see, a data frame returned by ` epidatr::pub_covidcast () ` has the
47
47
columns required for an ` epi_df ` object (along with many others). We can use
48
48
` as_epi_df() ` , with specification of some relevant metadata, to bring the data
49
49
frame into ` epi_df ` format.
50
50
51
51
``` {r, message = FALSE}
52
- x <- as_epi_df(cases,
53
- geo_type = "state",
54
- time_type = "day",
55
- as_of = max(cases$issue)) %>%
52
+ x <- as_epi_df(cases, as_of = max(cases$issue)) %>%
56
53
select(geo_value, time_value, total_cases = value)
57
54
58
55
class(x)
@@ -64,7 +61,7 @@ attributes(x)$metadata
64
61
## Some details on metadata
65
62
66
63
In general, an ` epi_df ` object has the following fields in its metadata:
67
-
64
+
68
65
* ` geo_type ` : the type for the geo values.
69
66
* ` time_type ` : the type for the time values.
70
67
* ` as_of ` : the time value at which the given data were available.
@@ -86,10 +83,10 @@ data set. See the [archive
86
83
vignette] ( https://cmu-delphi.github.io/epiprocess/articles/archive.html ) for
87
84
more.
88
85
89
- If any of the ` geo_type ` , ` time_type ` , or ` as_of ` arguments are missing in a
86
+ If any of the ` geo_type ` , ` time_type ` , or ` as_of ` arguments are missing in a
90
87
call to ` as_epi_df() ` , then this function will try to infer them from the passed
91
88
object. Usually, ` geo_type ` and ` time_type ` can be inferred from the ` geo_value `
92
- and ` time_value ` columns, respectively, but inferring the ` as_of ` field is not
89
+ and ` time_value ` columns, respectively, but inferring the ` as_of ` field is not
93
90
as easy. See the documentation for ` as_epi_df() ` for more details.
94
91
95
92
``` {r}
@@ -109,25 +106,29 @@ In the following examples we will show how to create an `epi_df` with additional
109
106
set.seed(12345)
110
107
ex1 <- tibble(
111
108
geo_value = rep(c("ca", "fl", "pa"), each = 3),
112
- county_code = c("06059", "06061", "06067", "12111", "12113", "12117",
113
- "42101", "42103", "42105"),
109
+ county_code = c(
110
+ "06059", "06061", "06067", "12111", "12113", "12117",
111
+ "42101", "42103", "42105"
112
+ ),
114
113
time_value = rep(
115
- seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "1 day"),
116
- length.out = 9),
114
+ seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "1 day"),
115
+ length.out = 9
116
+ ),
117
117
value = rpois(9, 5)
118
- ) %>%
118
+ ) %>%
119
119
as_tsibble(index = time_value, key = c(geo_value, county_code))
120
120
121
- ex1 <- as_epi_df(x = ex1, geo_type = "state", time_type = "day", as_of = "2020-06-03")
121
+ ex1 <- as_epi_df(x = ex1, as_of = "2020-06-03")
122
122
```
123
123
124
124
The metadata now includes ` county_code ` as an extra key.
125
+
125
126
``` {r}
126
127
attr(ex1, "metadata")
127
128
```
128
129
129
130
130
- ### Dealing with misspecified column names
131
+ ### Dealing with misspecified column names
131
132
132
133
` epi_df ` requires there to be columns ` geo_value ` and ` time_value ` . If they do not exist, then ` as_epi_df() ` throws an error.
133
134
@@ -136,27 +137,27 @@ ex2 <- data.frame(
136
137
state = rep(c("ca", "fl", "pa"), each = 3), # misnamed
137
138
pol = rep(c("blue", "swing", "swing"), each = 3), # extra key
138
139
reported_date = rep(
139
- seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"),
140
- length.out = 9), # misnamed
140
+ seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"),
141
+ length.out = 9
142
+ ), # misnamed
141
143
value = rpois(9, 5)
142
- )
143
- ex2 %>% as_epi_df()
144
+ )
145
+ ex2 %>% as_epi_df()
144
146
```
145
147
146
- The columns should be renamed to match ` epi_df ` format.
148
+ The columns should be renamed to match ` epi_df ` format.
147
149
148
150
``` {r}
149
- ex2 <- ex2 %>%
151
+ ex2 <- ex2 %>%
150
152
rename(geo_value = state, time_value = reported_date) %>%
151
- as_epi_df(geo_type = "state",
152
- as_of = "2020-06-03",
153
- additional_metadata = list( other_keys = "pol")
153
+ as_epi_df(
154
+ as_of = "2020-06-03",
155
+ other_keys = "pol"
154
156
)
155
157
156
158
attr(ex2, "metadata")
157
159
```
158
160
159
-
160
161
### Adding additional keys to an ` epi_df ` object
161
162
162
163
In the above examples, all the keys are added to objects prior to conversion to
@@ -166,22 +167,23 @@ We'll look at an included dataset and filter to a single state for simplicity.
166
167
``` {r}
167
168
ex3 <- jhu_csse_county_level_subset %>%
168
169
filter(time_value > "2021-12-01", state_name == "Massachusetts") %>%
169
- slice_tail(n = 6)
170
-
170
+ slice_tail(n = 6)
171
+
171
172
attr(ex3, "metadata") # geo_type is county currently
172
173
```
173
174
174
- Now we add ` state ` (MA) and ` pol ` as new columns to the data and as new keys to the metadata. The "state" ` geo_type ` anticipates lower-case abbreviations, so we'll match that.
175
+ Now we add ` state ` (MA) and ` pol ` as new columns to the data and as new keys to the metadata. The "state" ` geo_type ` anticipates lower-case abbreviations, so we'll match that.
175
176
176
177
``` {r}
177
- ex3 <- ex3 %>%
178
+ ex3 <- ex3 %>%
178
179
as_tibble() %>% # drop the `epi_df` class before adding additional metadata
179
180
mutate(
180
181
state = rep(tolower("MA"), 6),
181
- pol = rep(c("blue", "swing", "swing"), each = 2)) %>%
182
- as_epi_df(additional_metadata = list(other_keys = c("state", "pol")))
182
+ pol = rep(c("blue", "swing", "swing"), each = 2)
183
+ ) %>%
184
+ as_epi_df(other_keys = c("state", "pol"))
183
185
184
- attr(ex3,"metadata")
186
+ attr(ex3, "metadata")
185
187
```
186
188
187
189
Note that the two additional keys we added, ` state ` and ` pol ` , are specified as a character vector in the ` other_keys ` argument of ` as_epi_df() ` . They must be specified in this manner so that downstream actions on the ` epi_df ` , like model fitting and prediction, can recognize and use these keys.
@@ -199,38 +201,38 @@ package. Of course, we can also write custom code for other downstream uses,
199
201
like plotting, which is pretty easy to do with ` ggplot2 ` .
200
202
201
203
``` {r, message = FALSE, warning = FALSE}
202
- ggplot(x, aes(x = time_value, y = total_cases, color = geo_value)) +
204
+ ggplot(x, aes(x = time_value, y = total_cases, color = geo_value)) +
203
205
geom_line() +
204
206
scale_color_brewer(palette = "Set1") +
205
207
scale_x_date(minor_breaks = "month", date_labels = "%b %Y") +
206
208
labs(x = "Date", y = "Cumulative COVID-19 cases", color = "State")
207
209
```
208
210
209
- Finally, we'll examine some data from other packages just to show how
210
- we might get them into ` epi_df ` format.
211
- The first is data on daily new (not cumulative) SARS
212
- cases in Canada in 2003, from the
211
+ Finally, we'll examine some data from other packages just to show how
212
+ we might get them into ` epi_df ` format.
213
+ The first is data on daily new (not cumulative) SARS
214
+ cases in Canada in 2003, from the
213
215
[ outbreaks] ( https://github.com/reconverse/outbreaks ) package. New cases are
214
216
broken into a few categories by provenance.
215
217
216
218
``` {r}
217
219
x <- outbreaks::sars_canada_2003 %>%
218
220
mutate(geo_value = "ca") %>%
219
221
select(geo_value, time_value = date, starts_with("cases")) %>%
220
- as_epi_df(geo_type = "nation" )
222
+ as_epi_df()
221
223
222
224
head(x)
223
225
```
224
226
225
227
``` {r}
226
228
#| code-fold: true
227
- x <- x %>%
229
+ x <- x %>%
228
230
pivot_longer(starts_with("cases"), names_to = "type") %>%
229
231
mutate(type = substring(type, 7))
230
232
231
233
ggplot(x, aes(x = time_value, y = value)) +
232
234
geom_col(aes(fill = type), just = 0.5) +
233
- scale_y_continuous(breaks = 0:4* 2, expand = expansion(c(0, 0.05))) +
235
+ scale_y_continuous(breaks = 0:4 * 2, expand = expansion(c(0, 0.05))) +
234
236
scale_x_date(minor_breaks = "month", date_labels = "%b %Y") +
235
237
labs(x = "Date", y = "SARS cases in Canada", fill = "Type")
236
238
```
@@ -243,27 +245,30 @@ x <- outbreaks::ebola_sierraleone_2014 %>%
243
245
cases = ifelse(status == "confirmed", 1, 0),
244
246
province = case_when(
245
247
district %in% c("Kailahun", "Kenema", "Kono") ~ "Eastern",
246
- district %in% c("Bombali", "Kambia", "Koinadugu",
247
- "Port Loko", "Tonkolili") ~ "Northern",
248
+ district %in% c(
249
+ "Bombali", "Kambia", "Koinadugu",
250
+ "Port Loko", "Tonkolili"
251
+ ) ~ "Northern",
248
252
district %in% c("Bo", "Bonthe", "Moyamba", "Pujehun") ~ "Sourthern",
249
- district %in% c("Western Rural", "Western Urban") ~ "Western")
250
- ) %>%
253
+ district %in% c("Western Rural", "Western Urban") ~ "Western"
254
+ )
255
+ ) %>%
251
256
select(geo_value = province, time_value = date_of_onset, cases) %>%
252
257
filter(cases == 1) %>%
253
- group_by(geo_value, time_value) %>%
258
+ group_by(geo_value, time_value) %>%
254
259
summarise(cases = sum(cases)) %>%
255
- as_epi_df(geo_type = "province" )
260
+ as_epi_df()
256
261
```
257
262
258
263
``` {r}
259
264
#| code-fold: true
260
265
#| fig-width: 8
261
266
#| fig-height: 6
262
- ggplot(x, aes(x = time_value, y = cases)) +
263
- geom_col(aes(fill = geo_value), show.legend = FALSE) +
264
- facet_wrap(~ geo_value, scales = "free_y") +
267
+ ggplot(x, aes(x = time_value, y = cases)) +
268
+ geom_col(aes(fill = geo_value), show.legend = FALSE) +
269
+ facet_wrap(~geo_value, scales = "free_y") +
265
270
scale_x_date(minor_breaks = "month", date_labels = "%b %Y") +
266
- labs(x = "Date", y = "Confirmed cases of Ebola in Sierra Leone")
271
+ labs(x = "Date", y = "Confirmed cases of Ebola in Sierra Leone")
267
272
```
268
273
269
274
0 commit comments