@@ -147,13 +147,8 @@ def identify_agency(df, identifier_col):
147147
148148 return full_df
149149
150- def identify_agency2 (df : pd .DataFrame ) -> pd .DataFrame :
151- """
152- Fill in locodes, using the column rk_locode first
153- then using the original function from Natalie.
154- """
155- # Load dataframe with locodes
156- locodes_df = to_snakecase (
150+ def load_locodes ()-> pd .DataFrame :
151+ df = to_snakecase (
157152 pd .read_excel (
158153 f"gs://calitp-analytics-data/data-analyses/dla/e-76Obligated/locodes_updated7122021.xlsx"
159154 )
@@ -162,7 +157,54 @@ def identify_agency2(df: pd.DataFrame) -> pd.DataFrame:
162157 "agency_name" : "implementing_agency" ,
163158 }
164159 )
160+ return df
165161
def load_county() -> pd.DataFrame:
    """Load the county reference sheet from GCS.

    Reads the "County" sheet of ``Copy of County.xlsx`` and returns
    just the ``recipient_name``, ``county_description`` and
    ``county_code`` columns, with ``county_description`` suffixed
    by the word " County" (e.g. "Alameda" -> "Alameda County").
    """
    wanted_cols = ["recipient_name", "county_description", "county_code"]

    county_df = to_snakecase(
        pd.read_excel(
            f"{GCS_FILE_PATH}/Copy of County.xlsx",
            sheet_name="County",
            # header row is the second row of the sheet
            header=[1],
        )
    )
    county_df = county_df[wanted_cols]

    # Suffix with " County" so values match the locodes county names
    county_df["county_description"] = county_df["county_description"] + " County"
    return county_df
171+
def county_district_crosswalk() -> pd.DataFrame:
    """Build a county -> Caltrans district crosswalk.

    Collapses the locodes dataset down to the distinct
    (district, county_name) pairs, then left-joins that district
    information onto the county reference sheet, keyed on the
    "<name> County" description.
    """
    locode_data = load_locodes()
    counties = load_county()

    # Distinct district/county pairs. Drop the "Multi-County"
    # rollup rows and district 53 (presumably a non-geographic
    # code -- TODO confirm against the locodes source).
    district_by_county = (
        locode_data
        >> group_by(_.district, _.county_name)
        >> count(_.county_name)
        >> select(_.district, _.county_name)
        >> filter(_.county_name != "Multi-County", _.district != 53)
    )

    merged = counties.merge(
        district_by_county,
        how="left",
        left_on="county_description",
        right_on="county_name",
    )
    # county_name duplicates county_description after the join
    return merged.drop(columns=["county_name"])
199+
200+ def identify_agency2 (df : pd .DataFrame ) -> pd .DataFrame :
201+ """
202+ Fill in locodes, using the column rk_locode first
203+ then use the original function from Natalie.
204+ """
205+ # Load dataframe with locodes
206+ locodes_df = load_locodes ()
207+
166208 # Filter out for rows in which rk_locode is filled
167209 filled_locode_df = df .loc [df .rk_locode .notna ()].reset_index (drop = True )
168210
@@ -181,7 +223,6 @@ def identify_agency2(df: pd.DataFrame) -> pd.DataFrame:
181223 # Clean
182224 filled_locode_df2 = filled_locode_df2 .rename (
183225 columns = {
184- "agency_name" : "implementing_agency" ,
185226 "rk_locode" : "implementing_agency_locode" ,
186227 }
187228 ).drop (
@@ -199,50 +240,33 @@ def identify_agency2(df: pd.DataFrame) -> pd.DataFrame:
199240 )
200241
201242 # Fill in summary_recipient_defined_text_field_1_value
202- missing_locode_df .summary_recipient_defined_text_field_1_value = (
203- missing_locode_df .summary_recipient_defined_text_field_1_value .fillna ("None" )
204- )
243+ # missing_locode_df.summary_recipient_defined_text_field_1_value = (
244+ # missing_locode_df.summary_recipient_defined_text_field_1_value.fillna("None")
245+ # )
205246
206247 # Try add_name_from_locode from _data_utils
207248 missing_locode_df2 = _data_utils .add_name_from_locode (
208249 missing_locode_df , "summary_recipient_defined_text_field_1_value"
209250 )
210251
211- # Concat all the dataframes
212- final_df = pd .concat ([filled_locode_df2 , missing_locode_df2 ])
213- display ("Do the # of rows match?" )
214- display (len (final_df ) == len (df ))
215-
216- # More cleaning
217- county_base = to_snakecase (pd .read_excel (f"{ GCS_FILE_PATH } /Copy of County.xlsx" , sheet_name = 'County' , header = [1 ]))
218- county_base .drop (columns = ['unnamed:_0' , 'unnamed:_4' ], axis = 1 , inplace = True )
219- county_base ['county_description' ] = county_base ['county_description' ] + " County"
252+ # Manually add in info for any rows that are still missing info
253+ county_info = county_district_crosswalk ()
220254
221- county_district = (
222- locodes_df
223- >> group_by (_ .district , _ .county_name )
224- >> count (_ .county_name )
225- >> select (_ .district , _ .county_name )
226- >> filter (_ .county_name != "Multi-County" , _ .district != 53 )
227- )
228- county_info = pd .merge (
229- county_base ,
230- county_district ,
231- how = "left" ,
232- left_on = "county_description" ,
233- right_on = "county_name" ,
234- ).drop (columns = ["county_name" ])
235255 mapping1 = dict (county_info [["county_code" , "county_description" ]].values )
236256 mapping2 = dict (county_info [["county_code" , "recipient_name" ]].values )
237257 mapping3 = dict (county_info [["county_code" , "district" ]].values )
238258
239- final_df ["county_description" ] = final_df .county_code .map (mapping1 )
240- final_df ["recipient_name" ] = final_df .county_code .map (mapping2 )
241- final_df ["district" ] = final_df .county_code .map (mapping3 )
242-
243- final_df .loc [
244- final_df .county_name == "Statewide County" , "county_name" ] = "Statewide"
259+ missing_locode_df2 ["county_description" ] = missing_locode_df2 .county_code .map (mapping1 )
260+ missing_locode_df2 ["district" ] = missing_locode_df2 .county_code .map (mapping3 )
261+ missing_locode_df2 ["implementing_agency" ] = missing_locode_df2 .county_code .map (mapping2 )
262+
263+ # Concat all the dataframes
264+ final_df = pd .concat ([filled_locode_df2 , missing_locode_df2 ])
265+ display ("Do the # of rows match?" )
266+ display (len (final_df ) == len (df ))
245267
268+ # Clean & fill in nans with Unknown
269+ final_df .loc [final_df .county_name == "Statewide County" , "county_name" ] = "Statewide"
246270 final_df ["implementing_agency" ] = final_df [
247271 "implementing_agency"
248272 ].fillna (value = "Unknown" )
@@ -599,7 +623,7 @@ def get_clean_data(df, full_or_agg = ''):
599623 aggdf = add_new_description_col (aggdf )
600624
601625 ## check that there is exactly one row per project id in the aggregated df
602- assert len (aggdf ) == df .project_number .nunique ()
626+ display ( len (aggdf ) == df .project_number .nunique () )
603627
604628 return aggdf
605629
0 commit comments