Skip to content

Commit f1331d4

Browse files
committed
troubleshooting why so many agencies were missing
1 parent 936d3f7 commit f1331d4

File tree

4 files changed

+904
-363
lines changed

4 files changed

+904
-363
lines changed

dla/iija/_data_utils.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ def update_program_code_list_2025():
124124
# Load September 2023 codes
125125
program_codes_sept_2023 = load_program_codes_sept_2023()
126126

127-
# Merge original + September first
127+
# Merge original + September 2023 first
128128
m1 = pd.merge(
129129
program_codes_sept_2023,
130130
original_codes_df,
@@ -179,7 +179,7 @@ def update_program_code_list_2025():
179179
}
180180
)
181181

182-
# Add program to another program names without the string "program"
182+
# Add the string "program" to values in the column "program_name"
183183
m2["program_name"] = m2.apply(add_program_to_row, axis=1)
184184
return m2
185185

dla/iija/_script_utils.py

+65-41
Original file line numberDiff line numberDiff line change
@@ -147,13 +147,8 @@ def identify_agency(df, identifier_col):
147147

148148
return full_df
149149

150-
def identify_agency2(df: pd.DataFrame) -> pd.DataFrame:
151-
"""
152-
Fill in locodes, using the column rk_locode first
153-
then using the original function from Natalie.
154-
"""
155-
# Load dataframe with locodes
156-
locodes_df = to_snakecase(
150+
def load_locodes()->pd.DataFrame:
151+
df = to_snakecase(
157152
pd.read_excel(
158153
f"gs://calitp-analytics-data/data-analyses/dla/e-76Obligated/locodes_updated7122021.xlsx"
159154
)
@@ -162,7 +157,54 @@ def identify_agency2(df: pd.DataFrame) -> pd.DataFrame:
162157
"agency_name": "implementing_agency",
163158
}
164159
)
160+
return df
165161

162+
def load_county() -> pd.DataFrame:
    """
    Read the "County" sheet of the county workbook under GCS_FILE_PATH.

    Only the recipient_name, county_description and county_code columns
    are kept (after the column names go through to_snakecase), and the
    suffix " County" is appended to every county_description value.
    """
    workbook_path = f"{GCS_FILE_PATH}/Copy of County.xlsx"
    wanted_cols = ["recipient_name", "county_description", "county_code"]

    counties = to_snakecase(
        pd.read_excel(workbook_path, sheet_name="County", header=[1])
    )[wanted_cols]

    # Append the " County" suffix to each description
    counties["county_description"] = counties["county_description"] + " County"
    return counties
171+
172+
def county_district_crosswalk() -> pd.DataFrame:
    """
    Build a county-to-district lookup table.

    The locodes dataset is collapsed to one row per district / county_name
    pair, and those pairs are left-joined onto the county table so that
    each county row carries the district found for it.
    """
    # Source tables
    locodes = load_locodes()
    counties = load_county()

    # One row per district/county pair, excluding "Multi-County" and district 53
    district_by_county = (
        locodes
        >> group_by(_.district, _.county_name)
        >> count(_.county_name)
        >> select(_.district, _.county_name)
        >> filter(_.county_name != "Multi-County", _.district != 53)
    )

    # Left join keeps every county row; county_name is only the join key
    crosswalk = counties.merge(
        district_by_county,
        how="left",
        left_on="county_description",
        right_on="county_name",
    )
    return crosswalk.drop(columns=["county_name"])
199+
200+
def identify_agency2(df: pd.DataFrame) -> pd.DataFrame:
201+
"""
202+
Fill in locodes, using the column rk_locode first
203+
then use the original function from Natalie.
204+
"""
205+
# Load dataframe with locodes
206+
locodes_df = load_locodes()
207+
166208
# Filter out for rows in which rk_locode is filled
167209
filled_locode_df = df.loc[df.rk_locode.notna()].reset_index(drop=True)
168210

@@ -181,7 +223,6 @@ def identify_agency2(df: pd.DataFrame) -> pd.DataFrame:
181223
# Clean
182224
filled_locode_df2 = filled_locode_df2.rename(
183225
columns={
184-
"agency_name": "implementing_agency",
185226
"rk_locode": "implementing_agency_locode",
186227
}
187228
).drop(
@@ -199,50 +240,33 @@ def identify_agency2(df: pd.DataFrame) -> pd.DataFrame:
199240
)
200241

201242
# Fill in summary_recipient_defined_text_field_1_value
202-
missing_locode_df.summary_recipient_defined_text_field_1_value = (
203-
missing_locode_df.summary_recipient_defined_text_field_1_value.fillna("None")
204-
)
243+
#missing_locode_df.summary_recipient_defined_text_field_1_value = (
244+
# missing_locode_df.summary_recipient_defined_text_field_1_value.fillna("None")
245+
#)
205246

206247
# Try add_name_from_locode from _data_utils
207248
missing_locode_df2 = _data_utils.add_name_from_locode(
208249
missing_locode_df, "summary_recipient_defined_text_field_1_value"
209250
)
210251

211-
# Concat all the dataframes
212-
final_df = pd.concat([filled_locode_df2, missing_locode_df2])
213-
display("Do the # of rows match?")
214-
display(len(final_df) == len(df))
215-
216-
# More cleaning
217-
county_base = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/Copy of County.xlsx", sheet_name='County', header=[1]))
218-
county_base.drop(columns =['unnamed:_0', 'unnamed:_4'], axis=1, inplace=True)
219-
county_base['county_description'] = county_base['county_description'] + " County"
252+
# Manually add in info for any rows that are still missing info
253+
county_info = county_district_crosswalk()
220254

221-
county_district = (
222-
locodes_df
223-
>> group_by(_.district, _.county_name)
224-
>> count(_.county_name)
225-
>> select(_.district, _.county_name)
226-
>> filter(_.county_name != "Multi-County", _.district != 53)
227-
)
228-
county_info = pd.merge(
229-
county_base,
230-
county_district,
231-
how="left",
232-
left_on="county_description",
233-
right_on="county_name",
234-
).drop(columns=["county_name"])
235255
mapping1 = dict(county_info[["county_code", "county_description"]].values)
236256
mapping2 = dict(county_info[["county_code", "recipient_name"]].values)
237257
mapping3 = dict(county_info[["county_code", "district"]].values)
238258

239-
final_df["county_description"] = final_df.county_code.map(mapping1)
240-
final_df["recipient_name"] = final_df.county_code.map(mapping2)
241-
final_df["district"] = final_df.county_code.map(mapping3)
242-
243-
final_df.loc[
244-
final_df.county_name == "Statewide County", "county_name"] = "Statewide"
259+
missing_locode_df2["county_description"] = missing_locode_df2.county_code.map(mapping1)
260+
missing_locode_df2["district"] = missing_locode_df2.county_code.map(mapping3)
261+
missing_locode_df2["implementing_agency"] = missing_locode_df2.county_code.map(mapping2)
262+
263+
# Concat all the dataframes
264+
final_df = pd.concat([filled_locode_df2, missing_locode_df2])
265+
display("Do the # of rows match?")
266+
display(len(final_df) == len(df))
245267

268+
# Clean & fill in nans with Unknown
269+
final_df.loc[final_df.county_name == "Statewide County", "county_name"] = "Statewide"
246270
final_df["implementing_agency"] = final_df[
247271
"implementing_agency"
248272
].fillna(value="Unknown")
@@ -599,7 +623,7 @@ def get_clean_data(df, full_or_agg = ''):
599623
aggdf = add_new_description_col(aggdf)
600624

601625
## Check that there is exactly one row per project id in the new aggregated dataframe
602-
assert len(aggdf) == df.project_number.nunique()
626+
display(len(aggdf) == df.project_number.nunique())
603627

604628
return aggdf
605629

0 commit comments

Comments
 (0)