6
6
- add project types that classify project type
7
7
- create a public-friendly project title
8
8
'''
9
-
10
9
import numpy as np
11
10
import pandas as pd
12
11
from siuba import *
13
12
14
13
import dla_utils
15
-
16
14
from calitp_data_analysis .sql import to_snakecase
17
-
18
15
import _data_utils
19
-
20
16
import intake
21
17
22
18
# import nltk
27
23
28
24
GCS_FILE_PATH = 'gs://calitp-analytics-data/data-analyses/dla/dla-iija'
29
25
30
-
31
26
def _prep_data (file_name ):
32
27
proj = to_snakecase (pd .read_excel (f"{ GCS_FILE_PATH } /{ file_name } " ))
33
28
@@ -152,6 +147,109 @@ def identify_agency(df, identifier_col):
152
147
153
148
return full_df
154
149
150
def identify_agency2(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill in locodes, using the column rk_locode first,
    then using the original function from Natalie
    (_data_utils.add_name_from_locode) for rows where rk_locode is missing.

    Parameters
    ----------
    df : pd.DataFrame
        Project rows; must contain at least the columns ``rk_locode``,
        ``summary_recipient_defined_text_field_1_value``, ``county_code``
        and ``county_name``.  # assumes this schema — TODO confirm against caller

    Returns
    -------
    pd.DataFrame
        Same rows as ``df`` (row-count checked via ``display``) with
        implementing-agency, county and district columns filled in.

    Side effects: reads two Excel files from GCS and calls ``display`` for
    notebook-style progress output.
    """
    # Load dataframe with locodes; rename agency_name up front so the
    # merged column is already called implementing_agency.
    locodes_df = to_snakecase(
        pd.read_excel(
            f"gs://calitp-analytics-data/data-analyses/dla/e-76Obligated/locodes_updated7122021.xlsx"
        )
    ).rename(
        columns={
            "agency_name": "implementing_agency",
        }
    )

    # Filter out for rows in which rk_locode is filled
    filled_locode_df = df.loc[df.rk_locode.notna()].reset_index(drop=True)

    # Merge the two dataframes on the numeric locode; indicator=True so the
    # match rate can be eyeballed below.
    filled_locode_df2 = pd.merge(
        filled_locode_df,
        locodes_df,
        left_on="rk_locode",
        right_on="agency_locode",
        how="left",
        indicator=True,
    )
    display("Rows with locodes filled")
    display(filled_locode_df2._merge.value_counts())

    # Clean: standardize column names and drop merge bookkeeping columns.
    # NOTE(review): locodes_df already renamed agency_name above, so the
    # "agency_name" key here looks like a no-op (and a collision with an
    # existing implementing_agency column would surface as _x/_y suffixes
    # instead) — confirm against the real data.
    filled_locode_df2 = filled_locode_df2.rename(
        columns={
            "agency_name": "implementing_agency",
            "rk_locode": "implementing_agency_locode",
        }
    ).drop(
        columns=[
            "active_e76s______7_12_2021_",
            "mpo_locode_fads",
            "agency_locode",
            "_merge",
        ]
    )

    # Filter out for rows with missing locodes; drop the all-NaN rk_locode
    # column so it does not clash with the lookup result below.
    missing_locode_df = (df.loc[(df.rk_locode.isna())].reset_index(drop=True)).drop(
        columns=["rk_locode"]
    )

    # Fill in summary_recipient_defined_text_field_1_value so the locode
    # lookup helper never sees NaN.
    missing_locode_df.summary_recipient_defined_text_field_1_value = (
        missing_locode_df.summary_recipient_defined_text_field_1_value.fillna("None")
    )

    # Try add_name_from_locode from _data_utils (Natalie's original lookup).
    missing_locode_df2 = _data_utils.add_name_from_locode(
        missing_locode_df, "summary_recipient_defined_text_field_1_value"
    )

    # Concat all the dataframes and sanity-check the row count.
    final_df = pd.concat([filled_locode_df2, missing_locode_df2])
    display("Do the # of rows match?")
    display(len(final_df) == len(df))

    # More cleaning: build a county_code -> (description, recipient, district)
    # lookup from the County sheet plus the locode table.
    county_base = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/Copy of County.xlsx", sheet_name='County', header=[1]))
    county_base.drop(columns=['unnamed:_0', 'unnamed:_4'], axis=1, inplace=True)
    county_base['county_description'] = county_base['county_description'] + " County"

    # One row per (district, county); drop the Multi-County bucket and the
    # placeholder district 53.  # presumably 53 is a sentinel — verify
    county_district = (
        locodes_df
        >> group_by(_.district, _.county_name)
        >> count(_.county_name)
        >> select(_.district, _.county_name)
        >> filter(_.county_name != "Multi-County", _.district != 53)
    )
    county_info = pd.merge(
        county_base,
        county_district,
        how="left",
        left_on="county_description",
        right_on="county_name",
    ).drop(columns=["county_name"])
    mapping1 = dict(county_info[["county_code", "county_description"]].values)
    mapping2 = dict(county_info[["county_code", "recipient_name"]].values)
    mapping3 = dict(county_info[["county_code", "district"]].values)

    # Map county_code to the three derived columns.
    final_df["county_description"] = final_df.county_code.map(mapping1)
    final_df["recipient_name"] = final_df.county_code.map(mapping2)
    final_df["district"] = final_df.county_code.map(mapping3)

    # Normalize the statewide pseudo-county label.
    final_df.loc[
        final_df.county_name == "Statewide County", "county_name"] = "Statewide"

    # Anything still unmatched is labeled Unknown rather than left NaN.
    final_df["implementing_agency"] = final_df[
        "implementing_agency"
    ].fillna(value="Unknown")
    final_df["county_name"] = final_df["county_name"].fillna(
        value="Unknown"
    )
    return final_df
155
253
156
254
def condense_df (df ):
157
255
"""
@@ -160,9 +258,11 @@ def condense_df(df):
160
258
# make sure columns are in string format
161
259
df [['county_code' , 'improvement_type' ,
162
260
'implementing_agency_locode' , 'district' ,
163
- 'program_code_description' , 'recipient_project_number' ]] = df [['county_code' , 'improvement_type' ,
261
+ 'program_code_description' , 'recipient_project_number' ,
262
+ "funding_type_code" ]] = df [['county_code' , 'improvement_type' ,
164
263
'implementing_agency_locode' , 'district' ,
165
- 'program_code_description' , 'recipient_project_number' ]].astype (str )
264
+ 'program_code_description' , 'recipient_project_number' ,
265
+ "funding_type_code" ]].astype (str )
166
266
# copy county column over to use for project title name easier
167
267
df ['county_name_title' ] = df ['county_name' ]
168
268
# copy program code column over to use for project description column easier
@@ -174,7 +274,7 @@ def condense_df(df):
174
274
.groupby (['fmis_transaction_date' ,'project_number' , 'implementing_agency' , 'summary_recipient_defined_text_field_1_value'
175
275
# , 'program_code', 'program_code_description'
176
276
])
177
- .agg ({
277
+ .agg ({'funding_type_code' : lambda x : '|' . join ( x . unique ()),
178
278
'program_code' :lambda x :'|' .join (x .unique ()), # get unique values to concatenate ##hashing this out to group by instead
179
279
'program_code_description' :lambda x :'|' .join (x .unique ()), # get unique values to concatenate ##hashing this out to group by instead
180
280
'recipient_project_number' :lambda x :'|' .join (x .unique ()), #'first',
@@ -541,7 +641,23 @@ def run_script(file_name, recipient_column, df_agg_level):
541
641
542
642
return agg
543
643
644
def run_script2(file_name, recipient_column, df_agg_level):
    """
    Read a project list from GCS and run it through the locode-first
    cleaning pipeline.

    Parameters
    ----------
    file_name : str
        Excel file name under GCS_FILE_PATH.
    recipient_column : str
        Kept for signature parity with run_script; not used here because
        identify_agency2 derives the agency from rk_locode itself.
    df_agg_level : str
        Passed through to get_clean_data as full_or_agg.

    Returns
    -------
    pd.DataFrame
        Data aggregated at the project and program-code level.
    """
    # Read in the raw project list and normalize its column names.
    raw_projects = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/{file_name}"))

    # Attach the new program codes.
    recoded = _data_utils.add_new_codes(raw_projects)

    # Fill in known agency names via the locode-first lookup.
    with_agencies = identify_agency2(recoded)

    # Run the rest of the pipeline and return the aggregated dataset.
    return get_clean_data(with_agencies, full_or_agg=df_agg_level)
660
+
545
661
def export_to_gcs (df , export_date ):
546
662
547
663
### pretty print the column names
0 commit comments