Skip to content

Commit 64d51af

Browse files
committed
data for govfin_app
1 parent 5e9181f commit 64d51af

29 files changed

+11141
-7126
lines changed

.DS_Store

0 Bytes
Binary file not shown.

California-State-Flag.jpeg

-15.7 KB
Binary file not shown.

Flag_of_California.png

-1
This file was deleted.

Flag_of_New_York_City.png

-43.2 KB
Binary file not shown.

Rplots.pdf

-3.53 KB
Binary file not shown.

acfrs_8gov_type.RDS

2.22 MB
Binary file not shown.

acfrs_city_pop_added_char.RDS

625 KB
Binary file not shown.

acfrs_school_districts_clean.csv

+9,479
Large diffs are not rendered by default.

acfrs_census_matching_cities_population.Rmd city_gov.Rmd

+15-40
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,8 @@ pop_governmentID %>%
7272
```{r}
7373
options(scipen = 999)
7474
# census_id in ACFRs is government_ID used in file "City and Town Mapping.xlsx"
75-
acfrs <- readRDS("data_from_dbsite.RDS")
7675
77-
acfrs_governmentID <- acfrs %>% drop_na(census_id) %>%
76+
acfrs_governmentID <- readRDS("data_from_dbsite.RDS") %>% drop_na(census_id) %>%
7877
filter(category == "General Purpose") %>%
7978
rename(government_ID = census_id)
8079
@@ -98,17 +97,17 @@ acfrs_governmentID_14char <- acfrs_governmentID %>%
9897
9998
acfrs_governmentID_1314_char <- rbind(acfrs_governmentID_13char, acfrs_governmentID_14char)
10099
101-
#Joining 1
100+
#Joining method 1
102101
103102
acfrs_city_pop <- acfrs_governmentID %>%
104-
left_join(pop_governmentID) %>% drop_na(population) %>%
103+
left_join(pop_governmentID) %>% #drop_na(population) %>%
105104
select(-c(state, STATE_AB, year)) %>%
106105
arrange(desc(population))
107106
108-
# joining 2: add 0 to those with 13 chars --> # once added extra 0, have more 695 matches
107+
# joining method 2: add 0 to those with 13 chars --> once the extra 0 is added, there are 695 more matches
109108
110109
acfrs_city_pop_added_char <- acfrs_governmentID_1314_char %>%
111-
left_join(pop_governmentID) %>% drop_na(population) %>%
110+
left_join(pop_governmentID) %>% #drop_na(population) %>%
112111
select(-c(state, STATE_AB, year)) %>%
113112
arrange(desc(population))
114113
@@ -119,6 +118,9 @@ acfrs_city_pop %>%
119118
# Now got matched
120119
acfrs_city_pop_added_char %>%
121120
filter(grepl("Los Angeles", name))
121+
122+
#10412 cities, of which 4112 do not have population data
123+
#acfrs_city_pop_added_char %>% filter(is.na(population))
122124
```
123125

124126
```{r}
@@ -130,43 +132,16 @@ saveRDS(acfrs_city_pop_added_char, "acfrs_city_pop_added_char.RDS")
130132
```{r}
131133
132134
# cities with population over 100,000
133-
cities_100Kpop_NOT_acfrs <- pop %>%
134-
filter(population >= 100000) %>%
135-
#filter(!geo_id %in% acfrs_city_pop$geo_id) %>%
136-
137-
# adding 0 to those with 13 char in government_ID in ACFRs get more matches --> now only 33 cities > 100k in pop but NOT in acfrs
138-
filter(!geo_id %in% acfrs_city_pop_added_char$geo_id) %>%
139-
arrange(desc(population)) %>% distinct()
135+
# cities_100Kpop_NOT_acfrs <- pop %>%
136+
# filter(population >= 100000) %>%
137+
# #filter(!geo_id %in% acfrs_city_pop$geo_id) %>%
138+
#
139+
# # adding 0 to those with 13 char in government_ID in ACFRs get more matches --> now only 33 cities > 100k in pop but NOT in acfrs
140+
# filter(!geo_id %in% acfrs_city_pop_added_char$geo_id) %>%
141+
# arrange(desc(population)) %>% distinct()
140142
141143
#write.csv(cities_100Kpop_NOT_acfrs, "cities_100Kpop_NOT_acfrs.csv")
142144
143145
```
144146

145147

146-
147-
# Some special cases
148-
149-
```{r}
150-
151-
#Counties that are also Cities:
152-
#San Francisco, CA
153-
#Jacksonville, FL and Duval County, FL
154-
#Nashville, TN and Davidson County, TN
155-
# Check Nashville
156-
acfrs %>% filter(state == "TN") %>%
157-
filter(grepl("Nashville", name)) %>% select(state, name, total_liabilities, revenues) %>%
158-
filter(name == "Nashville-Davidson County")
159-
```
160-
161-
```{r}
162-
pop %>% filter(state.abb == "TN") %>%
163-
filter(grepl("Nashville", city_town))
164-
165-
166-
# Combined city/ city government: Jacksonville city, FL = Duval County, FL
167-
jacksonville_city_fl <- acfrs_city_pop_added_char %>%
168-
filter(state.abb == "FL" & name == "Jacksonville")
169-
170-
171-
```
172-

acfrs_census_matching_county_population.Rmd county_gov.Rmd

+50-65
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,16 @@ library(dplyr)
1616
# Census Population - County Level
1717

1818
```{r}
19-
d <- rio::import(here::here("data", "DECENNIALPL2020.P1_data_with_overlays_2021-12-16T123049.csv"), skip = 1) %>%
19+
d_county <- rio::import(here::here("data", "DECENNIALPL2020.P1_data_with_overlays_2021-12-16T123049.csv"), skip = 1) %>%
2020
select(id, `Geographic Area Name`, `!!Total:`) %>%
2121
rename(population = `!!Total:`)
2222
23-
# join with df_state to get state.abb and state.name --> to have a common col with acfrs to join
23+
# join with df_state to get state.abb and state.name --> to have a common col to join with acfrs
2424
df_state <- data.frame(state.abb, state.name) %>%
2525
add_row(state.abb = "PR", state.name = "Puerto Rico") %>%
2626
add_row(state.abb = "DC", state.name = "District of Columbia")
2727
28-
pop <- d %>%
28+
pop_county <- d_county %>%
2929
separate(`Geographic Area Name`, c("county", "state.name"), sep = ",") %>%
3030
mutate(state.name = str_trim(state.name)) %>%
3131
left_join(df_state) %>%
@@ -34,10 +34,10 @@ pop <- d %>%
3434
mutate(county = str_to_lower(county),
3535
county = str_remove(county,"\\.|'|‘"))
3636
37-
pop %>%
37+
## Special cases
38+
pop_county %>%
3839
filter(state.abb == "TN" | state.abb == "KY") %>%
3940
filter(str_detect(county, "davidson|jefferson")) #jefferson county ; davidson county
40-
## Special cases
4141
4242
# # Nashville-Davidson metropolitan government (balance) == "Nashville-Davidson County" in ACFRS--> rename to match
4343
# mutate(county = ifelse(county == "nashville-davidson metropolitan government (balance)", "nashville-davidson county", county)) %>%
@@ -80,13 +80,12 @@ acfrs <- readRDS("data_from_dbsite.RDS") %>%
8080
ACFRs has `r nrow(acfrs)` observations.
8181

8282

83-
## Find "County" in ACFRs
83+
## Find "County" or "Municipality" in ACFRs
8484
```{r}
8585
# ACFRs entities that contain the word "County" in their names
8686
acfrs_county <- acfrs %>%
8787
filter(category == "General Purpose") %>%
88-
filter(grepl("county", county))
89-
88+
filter(grepl("county|municipality", county))
9089
```
9190

9291
There are `r nrow(acfrs_county)` entities that contain the word "county" in their names.
@@ -115,15 +114,6 @@ alaska_borough <- acfrs %>%
115114
filter(category == "General Purpose") %>%
116115
filter(state.abb == "AK") %>%
117116
filter(grepl("borough", county)) %>% arrange(county)
118-
119-
120-
# double check, find any ACFRs entities that has "Municipality" in their names.
121-
acfrs %>% filter(grepl("municipality", county))
122-
123-
# Found 2: SD Municipality of Dell Rapids & WV Municipality of Parkersburg.
124-
# However, they're not in Census population data
125-
pop %>%
126-
filter(county == "municipality of parkersburg" | county == "municipality of dell rapids" )
127117
128118
```
129119

@@ -143,7 +133,7 @@ Potential risk: Not all Puerto Rico entities in ACFRs without the word "Municipi
143133

144134
```{r}
145135
# in Census population, find county that has the word "Municipio" --> remove the word "Municipio" to match with acfrs
146-
puertorico_census_pop <- pop %>%
136+
puertorico_census_pop <- pop_county %>%
147137
filter(state.name == "Puerto Rico") %>%
148138
filter(grepl("municipio", county)) %>%
149139
mutate(county = str_remove(county, " municipio"))
@@ -155,15 +145,14 @@ puertorico_census_pop <- pop %>%
155145
filter(category == "General Purpose") %>%
156146
filter(state.abb == "PR") %>%
157147
#join Puerto Rico in acfrs with PR in census -
158-
left_join(puertorico_census_pop, by = c("state.abb", "county")) %>%
159-
drop_na(population)
148+
left_join(puertorico_census_pop, by = c("state.abb", "county"))
160149
```
161150

162151
# Join ACFRs and Census population data to get population for ACFRs counties
163152

164153

165154
Joining these components:
166-
* acfrs entities contain word "County" in their names
155+
* ACFRs entities that contain the word "County" or "Municipality" in their names
167156
* ACFRs entities of Louisiana that contain the word "Parish" in their names
168157
* Puerto Rico in ACFRs and Census. Note that in ACFRs, no Puerto Rico entity has the word "Municipio" in its name. To match with PR in Census, this word needs to be removed from the Census population data. This introduces a risk: some PR entities in ACFRs whose names lack the word "Municipio" may NOT actually be municipios.
169158

@@ -172,22 +161,18 @@ Joining these components:
172161
# first, bind together the ACFRs entities whose names contain "County", the Louisiana entities whose names contain "Parish", and the Alaska entities whose names contain "Borough"
173162
acfrs_county_parish_borough <- rbind(acfrs_county, louisiana_parish) %>%
174163
rbind(alaska_borough)
175-
176164
177165
# next, join with Census population data by = c("state.abb", "county")
178166
county_pop_census_acfrs <- acfrs_county_parish_borough %>%
179-
left_join(pop, by = c("state.abb", "county")) %>%
180-
drop_na(population) %>%
167+
left_join(pop_county, by = c("state.abb", "county")) %>%
181168
182169
# third, bind with puertorico
183170
rbind(puertorico_afrs_census_pop) %>%
184171
arrange(desc(population)) %>% distinct()
185172
186-
# PROBLEM: Need to explore more why others do not get matched
187-
setdiff(acfrs_county_parish_borough$county, county_pop_census_acfrs$county)
173+
# --> 2503 county level entities, of which 255 do not have population data
174+
#county_pop_census_acfrs %>% filter(is.na(population))
188175
189-
acfrs_county_parish_borough %>%
190-
left_join(pop, by = c("state.abb", "county")) %>% filter(is.na(population))
191176
```
192177

193178

@@ -197,7 +182,7 @@ county_pop_census_acfrs %>%
197182
filter(state.abb == "KY" | state.abb == "TN") %>%
198183
filter(str_detect(county, "davidson|jefferson")) %>% select(state.abb, population, total_liabilities, id.y, county)
199184
200-
write.csv(county_pop_census_acfrs, "county_pop_census_acfrs.csv")
185+
#write.csv(county_pop_census_acfrs, "county_pop_census_acfrs.csv")
201186
saveRDS(county_pop_census_acfrs, "county_pop_census_acfrs.RDS")
202187
203188
```
@@ -210,41 +195,41 @@ The matched dataset has `r nrow(county_pop_census_acfrs)` observations/ counties
210195
```{r}
211196
# stoplist
212197
213-
stoplist1 <- pop %>%
214-
filter(state.name == "New York" & county %in% c("Bronx County", "Richmond County", "Kings County", "Queens County", "New York County"))
215-
216-
stoplist2 <- pop %>%
217-
filter(state.name == "Florida" & county == "Duval County") # FL Duval County's government is combined with FL Jacksonville,
218-
219-
stoplist3 <- pop %>%
220-
filter(state.name == "Tennessee" & county == "Davidson County")
221-
222-
stoplist4 <- pop %>%
223-
filter(state.name == "Kentucky" & county == "Jefferson County")
224-
225-
stoplist5 <- pop %>%
226-
filter(state.name == "Ohio" & county == "Franklin County")
227-
228-
stoplist <- rbind(stoplist1, stoplist2, stoplist3, stoplist4, stoplist5)
229-
230-
# get counties in Census population data that have > 100k pop, county level, not Connecticut, not in stoplist
231-
232-
census_pop_100k <- pop %>%
233-
# counties with > 100k pop in Census
234-
filter(population > 100000) %>%
235-
236-
# only get county level
237-
filter(grepl("County|Parish", county)) %>% # how about |Borough?
238-
filter(state.name != "Connecticut") %>%
239-
filter(!id %in% stoplist$id)
198+
# stoplist1 <- pop %>%
199+
# filter(state.name == "New York" & county %in% c("Bronx County", "Richmond County", "Kings County", "Queens County", "New York County"))
200+
#
201+
# stoplist2 <- pop %>%
202+
# filter(state.name == "Florida" & county == "Duval County") # FL Duval County's government is combined with FL Jacksonville,
203+
#
204+
# stoplist3 <- pop %>%
205+
# filter(state.name == "Tennessee" & county == "Davidson County")
206+
#
207+
# stoplist4 <- pop %>%
208+
# filter(state.name == "Kentucky" & county == "Jefferson County")
209+
#
210+
# stoplist5 <- pop %>%
211+
# filter(state.name == "Ohio" & county == "Franklin County")
212+
#
213+
# stoplist <- rbind(stoplist1, stoplist2, stoplist3, stoplist4, stoplist5)
214+
#
215+
# # get counties in Census population data that have > 100k pop, county level, not Connecticut, not in stoplist
216+
#
217+
# census_pop_100k <- pop %>%
218+
# # counties with > 100k pop in Census
219+
# filter(population > 100000) %>%
220+
#
221+
# # only get county level
222+
# filter(grepl("County|Parish", county)) %>% # how about |Borough?
223+
# filter(state.name != "Connecticut") %>%
224+
# filter(!id %in% stoplist$id)
240225
241226
```
242227

243228
## Counties in Census with > 100k population above that are not yet matched with ACFRs
244229

245230
```{r}
246-
census_pop_NOT_match_acfrs_100k <- census_pop_100k %>%
247-
filter(!id %in% county_pop_census_acfrs$id ) %>% arrange(desc(population)) %>% distinct()
231+
# census_pop_NOT_match_acfrs_100k <- census_pop_100k %>%
232+
# filter(!id %in% county_pop_census_acfrs$id ) %>% arrange(desc(population)) %>% distinct()
248233
#write.csv(census_pop_NOT_match_acfrs_100k, "census_pop_NOT_match_acfrs_100k.csv")
249234
```
250235

@@ -255,13 +240,13 @@ There are `r nrow(census_pop_NOT_match_acfrs_100k)` counties in Census with more
255240
county_pop_census_acfrs
256241

257242
```{r}
258-
census_pop_NOT_match_acfrs_all <- pop %>%
259-
260-
# only get county level
261-
filter(grepl("County|Parish", county)) %>% # how about |Borough?
262-
filter(state.name != "Connecticut") %>%
263-
filter(!id %in% stoplist$id) %>%
264-
filter(!id %in% county_pop_census_acfrs$id) %>% arrange(desc(population))
243+
# census_pop_NOT_match_acfrs_all <- pop %>%
244+
#
245+
# # only get county level
246+
# filter(grepl("County|Parish", county)) %>% # how about |Borough?
247+
# filter(state.name != "Connecticut") %>%
248+
# filter(!id %in% stoplist$id) %>%
249+
# filter(!id %in% county_pop_census_acfrs$id) %>% arrange(desc(population))
265250
266251
#write.csv(census_pop_NOT_match_acfrs_all, "census_pop_NOT_match_acfrs_all.csv")
267252
```

county_pop_census_acfrs.RDS

200 KB
Binary file not shown.

0 commit comments

Comments
 (0)