@@ -16,16 +16,16 @@ library(dplyr)
16
16
# Census Population - County Level
17
17
18
18
``` {r}
19
- d <- rio::import(here::here("data", "DECENNIALPL2020.P1_data_with_overlays_2021-12-16T123049.csv"), skip = 1) %>%
19
+ d_county <- rio::import(here::here("data", "DECENNIALPL2020.P1_data_with_overlays_2021-12-16T123049.csv"), skip = 1) %>%
20
20
select(id, `Geographic Area Name`, `!!Total:`) %>%
21
21
rename(population = `!!Total:`)
22
22
23
- # join with df_state to get state.abb and state.name --> to have a common col with acfrs to join
23
+ # join with df_state to get state.abb and state.name --> to have a common col to join with acfrs
24
24
df_state <- data.frame(state.abb, state.name) %>%
25
25
add_row(state.abb = "PR", state.name = "Puerto Rico") %>%
26
26
add_row(state.abb = "DC", state.name = "District of Columbia")
27
27
28
- pop <- d %>%
28
+ pop_county <- d_county %>%
29
29
separate(`Geographic Area Name`, c("county", "state.name"), sep = ",") %>%
30
30
mutate(state.name = str_trim(state.name)) %>%
31
31
left_join(df_state) %>%
@@ -34,10 +34,10 @@ pop <- d %>%
34
34
mutate(county = str_to_lower(county),
35
35
county = str_remove(county,"\\.|'|‘"))
36
36
37
- pop %>%
37
+ ## Special cases
38
+ pop_county %>%
38
39
filter(state.abb == "TN" | state.abb == "KY") %>%
39
40
filter(str_detect(county, "davidson|jefferson")) #jefferson county ; davidson county
40
- ## Special cases
41
41
42
42
# # Nashville-Davidson metropolitan government (balance) == "Nashville-Davidson County" in ACFRS--> rename to match
43
43
# mutate(county = ifelse(county == "nashville-davidson metropolitan government (balance)", "nashville-davidson county", county)) %>%
@@ -80,13 +80,12 @@ acfrs <- readRDS("data_from_dbsite.RDS") %>%
80
80
ACFRs has `r nrow(acfrs)` observations.
81
81
82
82
83
- ## Find "County" in ACFRs
83
+ ## Find "County" or "Municipality" in ACFRs
84
84
``` {r}
85
85
# ACFRs entities that contain the word "County" or "Municipality" in their names
86
86
acfrs_county <- acfrs %>%
87
87
filter(category == "General Purpose") %>%
88
- filter(grepl("county", county))
89
-
88
+ filter(grepl("county|municipality", county))
90
89
```
91
90
92
91
There are `r nrow(acfrs_county)` entities that contain the word "county" or "municipality" in their names.
@@ -115,15 +114,6 @@ alaska_borough <- acfrs %>%
115
114
filter(category == "General Purpose") %>%
116
115
filter(state.abb == "AK") %>%
117
116
filter(grepl("borough", county)) %>% arrange(county)
118
-
119
-
120
- # double check, find any ACFRs entities that has "Municipality" in their names.
121
- acfrs %>% filter(grepl("municipality", county))
122
-
123
- # Found 2: SD Municipality of Dell Rapids & WV Municipality of Parkersburg.
124
- # However, they're not in Census population data
125
- pop %>%
126
- filter(county == "municipality of parkersburg" | county == "municipality of dell rapids" )
127
117
128
118
```
129
119
@@ -143,7 +133,7 @@ Potential risk: Not all Puerto Rico entities in ACFRs without the word "Municipi
143
133
144
134
``` {r}
145
135
# in Census population, find the counties whose names contain the word "Municipio" --> remove the word "Municipio" so the names match ACFRs
146
- puertorico_census_pop <- pop %>%
136
+ puertorico_census_pop <- pop_county %>%
147
137
filter(state.name == "Puerto Rico") %>%
148
138
filter(grepl("municipio", county)) %>%
149
139
mutate(county = str_remove(county, " municipio"))
@@ -155,15 +145,14 @@ puertorico_census_pop <- pop %>%
155
145
filter(category == "General Purpose") %>%
156
146
filter(state.abb == "PR") %>%
157
147
# join Puerto Rico in ACFRs with Puerto Rico in Census; ACFRs rows without a Census match are kept with NA population
158
- left_join(puertorico_census_pop, by = c("state.abb", "county")) %>%
159
- drop_na(population)
148
+ left_join(puertorico_census_pop, by = c("state.abb", "county"))
160
149
```
161
150
162
151
# Join ACFRs and Census population data to get population for ACFRs counties
163
152
164
153
165
154
Joining these components:
166
- * acfrs entities contain word "County" in their names
155
+ * ACFRs entities that contain the word "County" or "Municipality" in their names
167
156
* ACFRs entities of Louisiana that contain the word "Parish" in their names
168
157
* Puerto Rico entities in ACFRs and Census. Note that in ACFRs, no Puerto Rico entity has the word "Municipio" in its name. To match them with Puerto Rico in Census, this word needs to be removed from the Census population data. This introduces a risk: a Puerto Rico entity in ACFRs whose name lacks the word "Municipio" may NOT actually be a Municipio.
169
158
@@ -172,22 +161,18 @@ Joining these components:
172
161
# first, stack the ACFRs entities whose names contain "County" or "Municipality" + Louisiana entities whose names contain "Parish" + Alaska entities whose names contain "Borough"
173
162
acfrs_county_parish_borough <- rbind(acfrs_county, louisiana_parish) %>%
174
163
rbind(alaska_borough)
175
-
176
164
177
165
# next, join with Census population by = c("state.abb", "county")
178
166
county_pop_census_acfrs <- acfrs_county_parish_borough %>%
179
- left_join(pop, by = c("state.abb", "county")) %>%
180
- drop_na(population) %>%
167
+ left_join(pop_county, by = c("state.abb", "county")) %>%
181
168
182
169
# third, bind with puertorico
183
170
rbind(puertorico_afrs_census_pop) %>%
184
171
arrange(desc(population)) %>% distinct()
185
172
186
- # PROBLEM: Need to explore more why others do not get matched
187
- setdiff(acfrs_county_parish_borough$county, county_pop_census_acfrs$county)
173
+ # --> 2503 county level entities, of which 255 do not have population data
174
+ # county_pop_census_acfrs %>% filter(is.na(population))
188
175
189
- acfrs_county_parish_borough %>%
190
- left_join(pop, by = c("state.abb", "county")) %>% filter(is.na(population))
191
176
```
192
177
193
178
@@ -197,7 +182,7 @@ county_pop_census_acfrs %>%
197
182
filter(state.abb == "KY" | state.abb == "TN") %>%
198
183
filter(str_detect(county, "davidson|jefferson")) %>% select(state.abb, population, total_liabilities, id.y, county)
199
184
200
- write.csv(county_pop_census_acfrs, "county_pop_census_acfrs.csv")
185
+ # write.csv(county_pop_census_acfrs, "county_pop_census_acfrs.csv")
201
186
saveRDS(county_pop_census_acfrs, "county_pop_census_acfrs.RDS")
202
187
203
188
```
@@ -210,41 +195,41 @@ The matched dataset has `r nrow(county_pop_census_acfrs)` observations/ counties
210
195
``` {r}
211
196
# stoplist
212
197
213
- stoplist1 <- pop %>%
214
- filter(state.name == "New York" & county %in% c("Bronx County", "Richmond County", "Kings County", "Queens County", "New York County"))
215
-
216
- stoplist2 <- pop %>%
217
- filter(state.name == "Florida" & county == "Duval County") # FL Duval County's government is combined with FL Jacksonville,
218
-
219
- stoplist3 <- pop %>%
220
- filter(state.name == "Tennessee" & county == "Davidson County")
221
-
222
- stoplist4 <- pop %>%
223
- filter(state.name == "Kentucky" & county == "Jefferson County")
224
-
225
- stoplist5 <- pop %>%
226
- filter(state.name == "Ohio" & county == "Franklin County")
227
-
228
- stoplist <- rbind(stoplist1, stoplist2, stoplist3, stoplist4, stoplist5)
229
-
230
- # get counties in Census population data that have > 100k pop, county level, not Connecticut, not in stoplist
231
-
232
- census_pop_100k <- pop %>%
233
- # counties with > 100k pop in Census
234
- filter(population > 100000) %>%
235
-
236
- # only get county level
237
- filter(grepl("County|Parish", county)) %>% # how about |Borough?
238
- filter(state.name != "Connecticut") %>%
239
- filter(!id %in% stoplist$id)
198
+ # stoplist1 <- pop %>%
199
+ # filter(state.name == "New York" & county %in% c("Bronx County", "Richmond County", "Kings County", "Queens County", "New York County"))
200
+ #
201
+ # stoplist2 <- pop %>%
202
+ # filter(state.name == "Florida" & county == "Duval County") # FL Duval County's government is combined with FL Jacksonville,
203
+ #
204
+ # stoplist3 <- pop %>%
205
+ # filter(state.name == "Tennessee" & county == "Davidson County")
206
+ #
207
+ # stoplist4 <- pop %>%
208
+ # filter(state.name == "Kentucky" & county == "Jefferson County")
209
+ #
210
+ # stoplist5 <- pop %>%
211
+ # filter(state.name == "Ohio" & county == "Franklin County")
212
+ #
213
+ # stoplist <- rbind(stoplist1, stoplist2, stoplist3, stoplist4, stoplist5)
214
+ #
215
+ # # get counties in Census population data that have > 100k pop, county level, not Connecticut, not in stoplist
216
+ #
217
+ # census_pop_100k <- pop %>%
218
+ # # counties with > 100k pop in Census
219
+ # filter(population > 100000) %>%
220
+ #
221
+ # # only get county level
222
+ # filter(grepl("County|Parish", county)) %>% # how about |Borough?
223
+ # filter(state.name != "Connecticut") %>%
224
+ # filter(!id %in% stoplist$id)
240
225
241
226
```
242
227
243
228
## Counties in Census with > 100k population above that are not yet matched with ACFRs
244
229
245
230
``` {r}
246
- census_pop_NOT_match_acfrs_100k <- census_pop_100k %>%
247
- filter(!id %in% county_pop_census_acfrs$id ) %>% arrange(desc(population)) %>% distinct()
231
+ # census_pop_NOT_match_acfrs_100k <- census_pop_100k %>%
232
+ # filter(!id %in% county_pop_census_acfrs$id ) %>% arrange(desc(population)) %>% distinct()
248
233
#write.csv(census_pop_NOT_match_acfrs_100k, "census_pop_NOT_match_acfrs_100k.csv")
249
234
```
250
235
@@ -255,13 +240,13 @@ There are `r nrow(census_pop_NOT_match_acfrs_100k)` counties in Census with more
255
240
county_pop_census_acfrs
256
241
257
242
``` {r}
258
- census_pop_NOT_match_acfrs_all <- pop %>%
259
-
260
- # only get county level
261
- filter(grepl("County|Parish", county)) %>% # how about |Borough?
262
- filter(state.name != "Connecticut") %>%
263
- filter(!id %in% stoplist$id) %>%
264
- filter(!id %in% county_pop_census_acfrs$id) %>% arrange(desc(population))
243
+ # census_pop_NOT_match_acfrs_all <- pop %>%
244
+ #
245
+ # # only get county level
246
+ # filter(grepl("County|Parish", county)) %>% # how about |Borough?
247
+ # filter(state.name != "Connecticut") %>%
248
+ # filter(!id %in% stoplist$id) %>%
249
+ # filter(!id %in% county_pop_census_acfrs$id) %>% arrange(desc(population))
265
250
266
251
#write.csv(census_pop_NOT_match_acfrs_all, "census_pop_NOT_match_acfrs_all.csv")
267
252
```
0 commit comments