Skip to content

Commit 4183c25

Browse files
committed
finished updating rk locodes
1 parent 7ad46c6 commit 4183c25

File tree

4 files changed

+959
-684
lines changed

4 files changed

+959
-684
lines changed

dla/iija/_data_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ def add_new_codes(df):
194194
#new_codes = update_program_code_list2()
195195

196196
## adding updated program codes 1/30/25
197-
new_codes = update_program_code_list_2025
197+
new_codes = update_program_code_list_2025()
198198
code_map = dict(new_codes[['iija_program_code', 'program_name']].values)
199199

200200
df['program_code_description'] = df.program_code.map(code_map)

dla/iija/_script_utils.py

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,109 @@ def identify_agency(df, identifier_col):
147147

148148
return full_df
149149

150+
def identify_agency2(df: pd.DataFrame) -> pd.DataFrame:
    """
    Attach implementing-agency and county details to every row of ``df``.

    Rows whose ``rk_locode`` is populated are joined to the locode reference
    workbook on ``rk_locode`` == ``agency_locode``. Rows without an
    ``rk_locode`` are instead passed to ``_data_utils.add_name_from_locode``,
    keyed on ``summary_recipient_defined_text_field_1_value``. The two pieces
    are stacked back together, county fields are filled in from the county
    workbook via ``county_code``, and missing agency / county names are
    set to "Unknown".

    The merge indicator counts and a row-count comparison against the input
    are shown with ``display`` as a sanity check.
    """
    text_col = "summary_recipient_defined_text_field_1_value"

    # Reference table of locodes; agency_name is exposed as implementing_agency
    locode_lookup = to_snakecase(
        pd.read_excel(
            "gs://calitp-analytics-data/data-analyses/dla/e-76Obligated/locodes_updated7122021.xlsx"
        )
    )
    locode_lookup = locode_lookup.rename(
        columns={"agency_name": "implementing_agency"}
    )

    # One mask splits the input into "has rk_locode" / "does not"
    has_locode = df.rk_locode.notna()

    # --- Rows with rk_locode: join straight to the reference table ---
    matched = (
        df.loc[has_locode]
        .reset_index(drop=True)
        .merge(
            locode_lookup,
            left_on="rk_locode",
            right_on="agency_locode",
            how="left",
            indicator=True,
        )
    )
    display("Rows with locodes filled")
    display(matched._merge.value_counts())

    matched = matched.rename(
        columns={
            "agency_name": "implementing_agency",
            "rk_locode": "implementing_agency_locode",
        }
    )
    matched = matched.drop(
        columns=[
            "active_e76s______7_12_2021_",
            "mpo_locode_fads",
            "agency_locode",
            "_merge",
        ]
    )

    # --- Rows without rk_locode: fall back to add_name_from_locode ---
    unmatched = (
        df.loc[~has_locode].reset_index(drop=True).drop(columns=["rk_locode"])
    )
    # The fallback is keyed on this text column, so blank it out with "None"
    unmatched[text_col] = unmatched[text_col].fillna("None")
    unmatched = _data_utils.add_name_from_locode(unmatched, text_col)

    # --- Stack both pieces and confirm no rows were gained or lost ---
    combined = pd.concat([matched, unmatched])
    display("Do the # of rows match?")
    display(len(combined) == len(df))

    # --- County lookup: county workbook plus district from the locode table ---
    county_base = to_snakecase(
        pd.read_excel(
            f"{GCS_FILE_PATH}/Copy of County.xlsx", sheet_name="County", header=[1]
        )
    )
    county_base.drop(columns=["unnamed:_0", "unnamed:_4"], inplace=True)
    county_base["county_description"] = county_base["county_description"] + " County"

    district_by_county = (
        locode_lookup
        >> group_by(_.district, _.county_name)
        >> count(_.county_name)
        >> select(_.district, _.county_name)
        >> filter(_.county_name != "Multi-County", _.district != 53)
    )
    county_info = county_base.merge(
        district_by_county,
        how="left",
        left_on="county_description",
        right_on="county_name",
    ).drop(columns=["county_name"])

    # Each target column is looked up from county_info by county_code
    for target in ("county_description", "recipient_name", "district"):
        code_to_value = dict(county_info[["county_code", target]].values)
        combined[target] = combined.county_code.map(code_to_value)

    # --- Final tidy-up of names ---
    is_statewide = combined.county_name == "Statewide County"
    combined.loc[is_statewide, "county_name"] = "Statewide"

    for name_col in ("implementing_agency", "county_name"):
        combined[name_col] = combined[name_col].fillna(value="Unknown")

    return combined
150253

151254
def condense_df(df):
152255
"""
@@ -536,7 +639,23 @@ def run_script(file_name, recipient_column, df_agg_level):
536639

537640
return agg
538641

642+
def run_script2(file_name, recipient_column, df_agg_level):
    """
    Run the cleaning pipeline on one project workbook, using identify_agency2.

    file_name: name of the Excel file to read from GCS_FILE_PATH.
    recipient_column: not referenced anywhere in this function body
        (identify_agency2 takes only the dataframe).
    df_agg_level: forwarded to get_clean_data as ``full_or_agg``.

    Returns whatever get_clean_data returns for the prepared dataframe.
    """
    # Read the workbook and normalise its column names
    raw_projects = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/{file_name}"))

    # Add the updated program codes
    with_codes = _data_utils.add_new_codes(raw_projects)

    # Add known agency names (rk_locode first, then the name-based fallback)
    with_agency = identify_agency2(with_codes)

    # Hand off to the rest of the script for cleaning / aggregation
    return get_clean_data(with_agency, full_or_agg=df_agg_level)
658+
540659
def export_to_gcs(df, export_date):
541660

542661
### pretty print the column names

0 commit comments

Comments
 (0)