@@ -147,6 +147,109 @@ def identify_agency(df, identifier_col):
147
147
148
148
return full_df
149
149
150
def identify_agency2(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill in implementing-agency locodes and names.

    Rows that already carry a locode in ``rk_locode`` are joined directly to
    the Caltrans locode reference sheet; the remaining rows fall back to the
    original name-matching routine (``_data_utils.add_name_from_locode``).
    County description, recipient name, and district are then mapped on via
    ``county_code``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``rk_locode``, ``county_code``, ``county_name``, and
        ``summary_recipient_defined_text_field_1_value`` columns.

    Returns
    -------
    pd.DataFrame
        The input rows with ``implementing_agency``, county, and district
        information attached. Missing agency/county names become "Unknown".
    """
    # Reference sheet of agency locodes.
    # (Plain string literal — the original f-prefix had no placeholders.)
    locodes_df = to_snakecase(
        pd.read_excel(
            "gs://calitp-analytics-data/data-analyses/dla/e-76Obligated/locodes_updated7122021.xlsx"
        )
    ).rename(columns={"agency_name": "implementing_agency"})

    # Rows that already have a locode: join straight to the reference sheet.
    filled_locode_df = df.loc[df.rk_locode.notna()].reset_index(drop=True)

    filled_locode_df2 = pd.merge(
        filled_locode_df,
        locodes_df,
        left_on="rk_locode",
        right_on="agency_locode",
        how="left",
        indicator=True,
    )
    display("Rows with locodes filled")
    display(filled_locode_df2._merge.value_counts())

    # Standardize column names and drop merge bookkeeping columns.
    filled_locode_df2 = filled_locode_df2.rename(
        columns={
            # NOTE(review): "agency_name" was already renamed on locodes_df
            # above, so this entry is likely a no-op — confirm and remove.
            "agency_name": "implementing_agency",
            "rk_locode": "implementing_agency_locode",
        }
    ).drop(
        columns=[
            "active_e76s______7_12_2021_",
            "mpo_locode_fads",
            "agency_locode",
            "_merge",
        ]
    )

    # Rows without a locode: fall back to name matching on the
    # recipient-defined text field.
    missing_locode_df = (df.loc[df.rk_locode.isna()].reset_index(drop=True)).drop(
        columns=["rk_locode"]
    )

    # Fill NaNs so the lookup below has a concrete key for every row.
    missing_locode_df.summary_recipient_defined_text_field_1_value = (
        missing_locode_df.summary_recipient_defined_text_field_1_value.fillna("None")
    )

    missing_locode_df2 = _data_utils.add_name_from_locode(
        missing_locode_df, "summary_recipient_defined_text_field_1_value"
    )

    # Recombine the two pieces. ignore_index avoids the duplicate index
    # labels that the two reset-index pieces would otherwise produce.
    final_df = pd.concat([filled_locode_df2, missing_locode_df2], ignore_index=True)
    display("Do the # of rows match?")
    display(len(final_df) == len(df))

    # County lookup table: county_code -> description / recipient / district.
    county_base = to_snakecase(
        pd.read_excel(
            f"{GCS_FILE_PATH}/Copy of County.xlsx", sheet_name="County", header=[1]
        )
    )
    # columns= already implies axis=1; avoid inplace mutation.
    county_base = county_base.drop(columns=["unnamed:_0", "unnamed:_4"])
    county_base["county_description"] = county_base["county_description"] + " County"

    # District per county, from the locode sheet (siuba pipeline);
    # drop the multi-county and district-53 catch-all rows.
    county_district = (
        locodes_df
        >> group_by(_.district, _.county_name)
        >> count(_.county_name)
        >> select(_.district, _.county_name)
        >> filter(_.county_name != "Multi-County", _.district != 53)
    )
    county_info = pd.merge(
        county_base,
        county_district,
        how="left",
        left_on="county_description",
        right_on="county_name",
    ).drop(columns=["county_name"])

    mapping1 = dict(county_info[["county_code", "county_description"]].values)
    mapping2 = dict(county_info[["county_code", "recipient_name"]].values)
    mapping3 = dict(county_info[["county_code", "district"]].values)

    final_df["county_description"] = final_df.county_code.map(mapping1)
    final_df["recipient_name"] = final_df.county_code.map(mapping2)
    final_df["district"] = final_df.county_code.map(mapping3)

    # Normalize the statewide sentinel value.
    final_df.loc[
        final_df.county_name == "Statewide County", "county_name"
    ] = "Statewide"

    final_df["implementing_agency"] = final_df["implementing_agency"].fillna(
        value="Unknown"
    )
    final_df["county_name"] = final_df["county_name"].fillna(value="Unknown")

    return final_df
150
253
151
254
def condense_df (df ):
152
255
"""
@@ -536,7 +639,23 @@ def run_script(file_name, recipient_column, df_agg_level):
536
639
537
640
return agg
538
641
642
def run_script2(file_name, recipient_column, df_agg_level):
    """
    Read a project list from GCS, attach agency names, and return the
    cleaned, aggregated dataset.

    Parameters
    ----------
    file_name : str
        Excel file name under ``GCS_FILE_PATH``.
    recipient_column : str
        Not referenced in the body; accepted for signature parity with
        ``run_script``.
    df_agg_level :
        Passed through to ``get_clean_data`` as ``full_or_agg``.
    """
    # Load and snake-case the raw project list.
    raw_projects = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/{file_name}"))

    # Attach updated program codes, then known agency names.
    with_codes = _data_utils.add_new_codes(raw_projects)
    with_agencies = identify_agency2(with_codes)

    # Aggregate at the project / program-code level and return.
    return get_clean_data(with_agencies, full_or_agg=df_agg_level)
540
659
def export_to_gcs (df , export_date ):
541
660
542
661
### pretty print the column names
0 commit comments