@@ -147,13 +147,8 @@ def identify_agency(df, identifier_col):
147
147
148
148
return full_df
149
149
150
- def identify_agency2 (df : pd .DataFrame ) -> pd .DataFrame :
151
- """
152
- Fill in locodes, using the column rk_locode first
153
- then using the original function from Natalie.
154
- """
155
- # Load dataframe with locodes
156
- locodes_df = to_snakecase (
150
+ def load_locodes ()-> pd .DataFrame :
151
+ df = to_snakecase (
157
152
pd .read_excel (
158
153
f"gs://calitp-analytics-data/data-analyses/dla/e-76Obligated/locodes_updated7122021.xlsx"
159
154
)
@@ -162,7 +157,54 @@ def identify_agency2(df: pd.DataFrame) -> pd.DataFrame:
162
157
"agency_name" : "implementing_agency" ,
163
158
}
164
159
)
160
+ return df
165
161
162
def load_county() -> pd.DataFrame:
    """
    Load the "County" sheet of the county workbook.

    Keeps only the recipient name, county description and county code
    columns (after snake-casing the headers) and appends " County" to
    every county description.
    """
    wanted_cols = ["recipient_name", "county_description", "county_code"]

    # NOTE(review): the path literal is reproduced exactly as it appears in
    # the source as viewed; confirm the space before "/Copy" is intended.
    workbook_path = f"{ GCS_FILE_PATH } /Copy of County.xlsx"

    # Column names come from the second row of the sheet (header=[1]).
    counties = to_snakecase(
        pd.read_excel(workbook_path, sheet_name="County", header=[1])
    )[wanted_cols]

    # Add the " County" suffix to each description.
    counties["county_description"] = counties["county_description"] + " County"
    return counties
171
+
172
def county_district_crosswalk() -> pd.DataFrame:
    """
    Aggregate the locodes dataset to find which districts a county
    lies in, and attach that district to the county table.

    Returns the county table from load_county() left-joined with the
    district/county pairs derived from load_locodes(); the helper
    county_name join column is dropped from the result.
    """
    locodes = load_locodes()
    counties = load_county()

    # Reduce locodes to district/county_name pairs, leaving out the
    # "Multi-County" entries and district 53.
    # NOTE(review): the meaning of district 53 is not visible here - confirm.
    district_pairs = (
        locodes
        >> group_by(_.district, _.county_name)
        >> count(_.county_name)
        >> select(_.district, _.county_name)
        >> filter(_.county_name != "Multi-County", _.district != 53)
    )

    # Left join keeps every county row, even when no district was found.
    crosswalk = counties.merge(
        district_pairs,
        how="left",
        left_on="county_description",
        right_on="county_name",
    )
    return crosswalk.drop(columns=["county_name"])
199
+
200
+ def identify_agency2 (df : pd .DataFrame ) -> pd .DataFrame :
201
+ """
202
+ Fill in locodes, using the column rk_locode first
203
+ then use the original function from Natalie.
204
+ """
205
+ # Load dataframe with locodes
206
+ locodes_df = load_locodes ()
207
+
166
208
# Filter out for rows in which rk_locode is filled
167
209
filled_locode_df = df .loc [df .rk_locode .notna ()].reset_index (drop = True )
168
210
@@ -181,7 +223,6 @@ def identify_agency2(df: pd.DataFrame) -> pd.DataFrame:
181
223
# Clean
182
224
filled_locode_df2 = filled_locode_df2 .rename (
183
225
columns = {
184
- "agency_name" : "implementing_agency" ,
185
226
"rk_locode" : "implementing_agency_locode" ,
186
227
}
187
228
).drop (
@@ -199,50 +240,33 @@ def identify_agency2(df: pd.DataFrame) -> pd.DataFrame:
199
240
)
200
241
201
242
# Fill in summary_recipient_defined_text_field_1_value
202
- missing_locode_df .summary_recipient_defined_text_field_1_value = (
203
- missing_locode_df .summary_recipient_defined_text_field_1_value .fillna ("None" )
204
- )
243
+ # missing_locode_df.summary_recipient_defined_text_field_1_value = (
244
+ # missing_locode_df.summary_recipient_defined_text_field_1_value.fillna("None")
245
+ # )
205
246
206
247
# Try add_name_from_locode from _data_utils
207
248
missing_locode_df2 = _data_utils .add_name_from_locode (
208
249
missing_locode_df , "summary_recipient_defined_text_field_1_value"
209
250
)
210
251
211
- # Concat all the dataframes
212
- final_df = pd .concat ([filled_locode_df2 , missing_locode_df2 ])
213
- display ("Do the # of rows match?" )
214
- display (len (final_df ) == len (df ))
215
-
216
- # More cleaning
217
- county_base = to_snakecase (pd .read_excel (f"{ GCS_FILE_PATH } /Copy of County.xlsx" , sheet_name = 'County' , header = [1 ]))
218
- county_base .drop (columns = ['unnamed:_0' , 'unnamed:_4' ], axis = 1 , inplace = True )
219
- county_base ['county_description' ] = county_base ['county_description' ] + " County"
252
+ # Manually add in info for any rows that are still missing info
253
+ county_info = county_district_crosswalk ()
220
254
221
- county_district = (
222
- locodes_df
223
- >> group_by (_ .district , _ .county_name )
224
- >> count (_ .county_name )
225
- >> select (_ .district , _ .county_name )
226
- >> filter (_ .county_name != "Multi-County" , _ .district != 53 )
227
- )
228
- county_info = pd .merge (
229
- county_base ,
230
- county_district ,
231
- how = "left" ,
232
- left_on = "county_description" ,
233
- right_on = "county_name" ,
234
- ).drop (columns = ["county_name" ])
235
255
mapping1 = dict (county_info [["county_code" , "county_description" ]].values )
236
256
mapping2 = dict (county_info [["county_code" , "recipient_name" ]].values )
237
257
mapping3 = dict (county_info [["county_code" , "district" ]].values )
238
258
239
- final_df ["county_description" ] = final_df .county_code .map (mapping1 )
240
- final_df ["recipient_name" ] = final_df .county_code .map (mapping2 )
241
- final_df ["district" ] = final_df .county_code .map (mapping3 )
242
-
243
- final_df .loc [
244
- final_df .county_name == "Statewide County" , "county_name" ] = "Statewide"
259
+ missing_locode_df2 ["county_description" ] = missing_locode_df2 .county_code .map (mapping1 )
260
+ missing_locode_df2 ["district" ] = missing_locode_df2 .county_code .map (mapping3 )
261
+ missing_locode_df2 ["implementing_agency" ] = missing_locode_df2 .county_code .map (mapping2 )
262
+
263
+ # Concat all the dataframes
264
+ final_df = pd .concat ([filled_locode_df2 , missing_locode_df2 ])
265
+ display ("Do the # of rows match?" )
266
+ display (len (final_df ) == len (df ))
245
267
268
+ # Clean & fill in nans with Unknown
269
+ final_df .loc [final_df .county_name == "Statewide County" , "county_name" ] = "Statewide"
246
270
final_df ["implementing_agency" ] = final_df [
247
271
"implementing_agency"
248
272
].fillna (value = "Unknown" )
@@ -599,7 +623,7 @@ def get_clean_data(df, full_or_agg = ''):
599
623
aggdf = add_new_description_col (aggdf )
600
624
601
625
## asserting that there is one row for each project id in the new aggregated dataframe
602
- assert len (aggdf ) == df .project_number .nunique ()
626
+ display ( len (aggdf ) == df .project_number .nunique () )
603
627
604
628
return aggdf
605
629
0 commit comments