Skip to content

Commit bd7ae30

Browse files
authored
Merge pull request #1364 from cal-itp/iija_data
IIJA Data
2 parents aaba50c + c18b87a commit bd7ae30

File tree

6 files changed

+2130
-243
lines changed

6 files changed

+2130
-243
lines changed

dla/iija/_data_utils.py

Lines changed: 132 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -6,25 +6,16 @@
66
If using the get_list_of_words function, remove the comment out hashtag from the import nltk and re in this script
77
AND run a `! pip install nltk` in the first cell of your notebook
88
'''
9-
10-
119
import pandas as pd
1210
from siuba import *
13-
1411
from calitp_data_analysis.sql import to_snakecase
15-
1612
import _script_utils
17-
18-
1913
GCS_FILE_PATH = 'gs://calitp-analytics-data/data-analyses/dla/dla-iija'
2014

21-
2215
# import nltk
2316
# from nltk.corpus import stopwords
2417
# from nltk.tokenize import word_tokenize, sent_tokenize
2518
# import re
26-
27-
2819
def read_data_all():
2920
proj = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/CopyofFMIS_Projects_Universe_IIJA_Reporting_4.xls",
3021
# sheet_name='FMIS 5 Projects ', header=[3]
@@ -42,6 +33,10 @@ def read_data_all():
4233
return proj
4334

4435

36+
'''
37+
Program Code
38+
Functions
39+
'''
4540
# def update_program_code_list():
4641

4742
# ## read in the program codes
@@ -59,11 +54,6 @@ def read_data_all():
5954

6055
# return program_codes
6156

62-
63-
'''
64-
Updated version of the update_program_code_list to alter program names if needed.
65-
66-
'''
6757
def update_program_code_list2():
6858
updated_codes = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/program_codes/FY21-22ProgramCodesAsOf5-25-2022.v2_expanded090823.xlsx"))
6959
updated_codes = updated_codes>>select(_.iija_program_code, _.new_description)
@@ -87,21 +77,142 @@ def add_program_to_row(row):
8777

8878
return program_codes
8979

80+
def add_program_to_row(row):
    """Return the row's program name, suffixed with " Program" when absent.

    Names that already contain the word "Program" anywhere are returned
    unchanged.
    """
    name = row["program_name"]
    return name if "Program" in name else name + " Program"
85+
86+
def load_program_codes_og() -> pd.DataFrame:
    """Load the original IIJA program-code sheet (Sept 2023 snapshot).

    Reads the Excel file from GCS, snake-cases the headers, and keeps
    only the code, description, and program-name columns.
    """
    path = f"{GCS_FILE_PATH}/program_codes/Copy of lst_IIJA_Code_20230908.xlsx"
    raw = to_snakecase(pd.read_excel(path))
    return raw[["iija_program_code", "description", "program_name"]]
93+
94+
def load_program_codes_sept_2023() -> pd.DataFrame:
    """Load the September 2023 expanded program-code list.

    Reads the Excel file from GCS, snake-cases the headers, and keeps
    only the code and its updated description.
    """
    path = f"{GCS_FILE_PATH}/program_codes/FY21-22ProgramCodesAsOf5-25-2022.v2_expanded090823.xlsx"
    raw = to_snakecase(pd.read_excel(path))
    return raw[["iija_program_code", "new_description"]]
101+
102+
def load_program_codes_jan_2025() -> pd.DataFrame:
    """Load the January 2025 program-code list.

    Renames ``program_code`` to ``iija_program_code`` so the sheet merges
    cleanly with the older code lists, and title-cases the short names.
    """
    keep = [
        "program_code",
        "short_name",
        "program_code_description",
        "funding_type_code",
    ]
    codes = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}/program_codes/Ycodes_01.2025.xlsx")
    )[keep]

    # Align the key column name with the other code sheets.
    codes = codes.rename(columns={"program_code": "iija_program_code"})
    codes["short_name"] = codes["short_name"].str.title()
    return codes
114+
115+
def update_program_code_list_2025() -> pd.DataFrame:
    """Build the consolidated program-code list as of January 2025.

    On January 2025, we received a new list of updated codes. Merge this
    new list with the codes received originally and in September 2023,
    preferring the newest description/name available for each code.

    Returns:
        DataFrame with columns ``iija_program_code``, ``funding_type_code``,
        ``new_description``, and ``program_name`` (every name ending in
        "Program").
    """
    # Load original codes
    original_codes_df = load_program_codes_og()

    # Load September 2023 codes
    program_codes_sept_2023 = load_program_codes_sept_2023()

    # Merge original + September first; outer keeps codes present in
    # only one of the two sheets.
    m1 = pd.merge(
        program_codes_sept_2023,
        original_codes_df,
        on="iija_program_code",
        how="outer",
        indicator=True,
    )

    # Prefer the September description; fall back to the original one.
    m1["new_description"] = (
        m1["new_description"].str.strip().fillna(m1.description)
    )

    # Delete unnecessary columns (pass a list, not a set, so the column
    # order handed to pandas is deterministic).
    m1 = m1.drop(columns=["description", "_merge"])

    # Load January 2025 codes
    program_codes_jan_2025 = load_program_codes_jan_2025()

    # Merge m1 with program codes from January 2025.
    m2 = pd.merge(
        program_codes_jan_2025,
        m1,
        on="iija_program_code",
        how="outer",
        indicator=True,
    )
    # Prefer the January 2025 description; fall back to the merged older one.
    m2["2025_description"] = (
        m2["program_code_description"].str.strip().fillna(m2.new_description)
    )

    # Prefer the established program name; fall back to the 2025 short name.
    m2["2025_program_name"] = m2.program_name.fillna(m2.short_name)

    # Delete outdated columns
    m2 = m2.drop(
        columns=[
            "short_name",
            "program_name",
            "program_code_description",
            "new_description",
            "_merge",
        ]
    )

    # Rename to match the original sheet's column names.
    m2 = m2.rename(
        columns={
            "2025_description": "new_description",
            "2025_program_name": "program_name",
        }
    )

    # Append " Program" to any program name missing the word "Program".
    m2["program_name"] = m2.apply(add_program_to_row, axis=1)
    return m2
90185

91-
## Function to add the updated program codes to the data
92186
def add_new_codes(df):
    """
    Add the updated program codes to the data.

    Maps each ``program_code`` to its current program name (as
    ``program_code_description``), merges in ``funding_type_code``,
    casts the recipient-defined text field to string, and overrides the
    ER01/ER03 descriptions per the January 2025 guidance.
    """
    #new_codes = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/FY21-22ProgramCodesAsOf5-25-2022.v2.xlsx"))
    #code_map = dict(new_codes[['iija_program_code', 'new_description']].values)

    ## adding updated program codes 05/11/23
    #new_codes = update_program_code_list2()

    ## adding updated program codes 1/30/25
    code_sheet = update_program_code_list_2025()
    name_by_code = dict(code_sheet[['iija_program_code', 'program_name']].values)
    df['program_code_description'] = df.program_code.map(name_by_code)

    # Add funding_type_code (one row per code before merging).
    funding_types = code_sheet[
        ['iija_program_code', 'funding_type_code']
    ].drop_duplicates()

    df = pd.merge(
        df,
        funding_types,
        left_on="program_code",
        right_on="iija_program_code",
        how="left",
    )

    # Turn summary_recipient_defined_text_field_1_value to a string
    df['summary_recipient_defined_text_field_1_value'] = (
        df['summary_recipient_defined_text_field_1_value'].astype(str)
    )

    # Amanda: January 2025, notified this should be called emergency supplement funding
    emergency_codes = df.program_code.isin(['ER01', 'ER03'])
    df.loc[emergency_codes, 'program_code_description'] = 'Emergency Supplement Funding'

    return df
107218

dla/iija/_script_utils.py

Lines changed: 124 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,17 +6,13 @@
66
- add project types that classify project type
77
- create a public-friendly project title
88
'''
9-
109
import numpy as np
1110
import pandas as pd
1211
from siuba import *
1312

1413
import dla_utils
15-
1614
from calitp_data_analysis.sql import to_snakecase
17-
1815
import _data_utils
19-
2016
import intake
2117

2218
# import nltk
@@ -27,7 +23,6 @@
2723

2824
GCS_FILE_PATH = 'gs://calitp-analytics-data/data-analyses/dla/dla-iija'
2925

30-
3126
def _prep_data(file_name):
3227
proj = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/{file_name}"))
3328

@@ -152,6 +147,109 @@ def identify_agency(df, identifier_col):
152147

153148
return full_df
154149

150+
def identify_agency2(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill in locodes, using the column rk_locode first
    then using the original function from Natalie.

    Rows with a filled ``rk_locode`` are matched directly against the
    locode reference sheet; rows without one fall back to
    ``_data_utils.add_name_from_locode``. The two halves are then
    re-concatenated and county/district/recipient columns are mapped in.

    NOTE(review): this function calls ``display``, which is only defined
    in an IPython/notebook environment — confirm it is never run as a
    plain script.
    """
    # Load dataframe with locodes
    locodes_df = to_snakecase(
        pd.read_excel(
            f"gs://calitp-analytics-data/data-analyses/dla/e-76Obligated/locodes_updated7122021.xlsx"
        )
    ).rename(
        columns={
            "agency_name": "implementing_agency",
        }
    )

    # Filter out for rows in which rk_locode is filled
    filled_locode_df = df.loc[df.rk_locode.notna()].reset_index(drop=True)

    # Merge the two dataframes; indicator=True adds a _merge column so
    # the match rate can be displayed below.
    filled_locode_df2 = pd.merge(
        filled_locode_df,
        locodes_df,
        left_on="rk_locode",
        right_on="agency_locode",
        how="left",
        indicator=True,
    )
    display("Rows with locodes filled")
    display(filled_locode_df2._merge.value_counts())

    # Clean
    # NOTE(review): "agency_name" was already renamed on locodes_df
    # above, so this first rename entry is presumably a no-op — verify.
    filled_locode_df2 = filled_locode_df2.rename(
        columns={
            "agency_name": "implementing_agency",
            "rk_locode": "implementing_agency_locode",
        }
    ).drop(
        columns=[
            "active_e76s______7_12_2021_",
            "mpo_locode_fads",
            "agency_locode",
            "_merge",
        ]
    )

    # Filter out for rows with missing locodes
    missing_locode_df = (df.loc[(df.rk_locode.isna())].reset_index(drop=True)).drop(
        columns=["rk_locode"]
    )

    # Fill in summary_recipient_defined_text_field_1_value so the
    # fallback lookup below never sees NaN.
    missing_locode_df.summary_recipient_defined_text_field_1_value = (
        missing_locode_df.summary_recipient_defined_text_field_1_value.fillna("None")
    )

    # Try add_name_from_locode from _data_utils (the original approach)
    missing_locode_df2 = _data_utils.add_name_from_locode(
        missing_locode_df, "summary_recipient_defined_text_field_1_value"
    )

    # Concat all the dataframes back into one result
    final_df = pd.concat([filled_locode_df2, missing_locode_df2])
    display("Do the # of rows match?")
    display(len(final_df) == len(df))

    # More cleaning: build county lookup tables from the County sheet.
    county_base = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/Copy of County.xlsx", sheet_name='County', header=[1]))
    county_base.drop(columns =['unnamed:_0', 'unnamed:_4'], axis=1, inplace=True)
    county_base['county_description'] = county_base['county_description'] + " County"

    # District per county, excluding the Multi-County bucket and
    # district 53.
    county_district = (
        locodes_df
        >> group_by(_.district, _.county_name)
        >> count(_.county_name)
        >> select(_.district, _.county_name)
        >> filter(_.county_name != "Multi-County", _.district != 53)
    )
    county_info = pd.merge(
        county_base,
        county_district,
        how="left",
        left_on="county_description",
        right_on="county_name",
    ).drop(columns=["county_name"])
    # county_code -> description / recipient / district lookup maps.
    mapping1 = dict(county_info[["county_code", "county_description"]].values)
    mapping2 = dict(county_info[["county_code", "recipient_name"]].values)
    mapping3 = dict(county_info[["county_code", "district"]].values)

    final_df["county_description"] = final_df.county_code.map(mapping1)
    final_df["recipient_name"] = final_df.county_code.map(mapping2)
    final_df["district"] = final_df.county_code.map(mapping3)

    # Normalize the statewide label.
    final_df.loc[
        final_df.county_name == "Statewide County", "county_name"] = "Statewide"

    # Default any still-unresolved agency/county to "Unknown".
    final_df["implementing_agency"] = final_df[
        "implementing_agency"
    ].fillna(value="Unknown")
    final_df["county_name"] = final_df["county_name"].fillna(
        value="Unknown"
    )
    return final_df
155253

156254
def condense_df(df):
157255
"""
@@ -160,9 +258,11 @@ def condense_df(df):
160258
# make sure columns are in string format
161259
df[['county_code', 'improvement_type',
162260
'implementing_agency_locode', 'district',
163-
'program_code_description', 'recipient_project_number']] = df[['county_code', 'improvement_type',
261+
'program_code_description', 'recipient_project_number',
262+
"funding_type_code"]] = df[['county_code', 'improvement_type',
164263
'implementing_agency_locode', 'district',
165-
'program_code_description', 'recipient_project_number']].astype(str)
264+
'program_code_description', 'recipient_project_number',
265+
"funding_type_code"]].astype(str)
166266
# copy county column over to use for project title name easier
167267
df['county_name_title'] = df['county_name']
168268
# copy program code column over to use for project description column easier
@@ -174,7 +274,7 @@ def condense_df(df):
174274
.groupby(['fmis_transaction_date','project_number', 'implementing_agency', 'summary_recipient_defined_text_field_1_value'
175275
# , 'program_code', 'program_code_description'
176276
])
177-
.agg({
277+
.agg({'funding_type_code':lambda x:'|'.join(x.unique()),
178278
'program_code':lambda x:'|'.join(x.unique()), # get unique values to concatenate ##hashing this out to group by instead
179279
'program_code_description':lambda x:'|'.join(x.unique()), # get unique values to concatenate ##hashing this out to group by instead
180280
'recipient_project_number':lambda x:'|'.join(x.unique()), #'first',
@@ -541,7 +641,23 @@ def run_script(file_name, recipient_column, df_agg_level):
541641

542642
return agg
543643

644+
def run_script2(file_name, recipient_column, df_agg_level):
    """
    Read an IIJA extract and run the full cleaning pipeline.

    Parameters:
        file_name: Excel file name under GCS_FILE_PATH to read.
        recipient_column: NOTE(review) — currently unused in this body;
            presumably kept for signature parity with run_script. Confirm
            before removing.
        df_agg_level: passed through to get_clean_data as full_or_agg.

    Returns:
        The aggregated dataset produced by get_clean_data.
    """

    ### Read in data
    proj_list = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/{file_name}"))

    ### run function to get new program codes
    proj_cleaned = _data_utils.add_new_codes(proj_list)

    ## function that adds known agency name to df (rk_locode-first variant)
    df = identify_agency2(proj_cleaned)

    ### run the data through the rest of the script
    ### return a dataset that is aggregated at the project and program code
    agg = get_clean_data(df, full_or_agg = df_agg_level)

    return agg
660+
545661
def export_to_gcs(df, export_date):
546662

547663
### pretty print the column names

0 commit comments

Comments
 (0)