Skip to content

Commit bd7ae30

Browse files
authored
Merge pull request #1364 from cal-itp/iija_data
IIJA Data
2 parents aaba50c + c18b87a commit bd7ae30

File tree

6 files changed

+2130
-243
lines changed

6 files changed

+2130
-243
lines changed

dla/iija/_data_utils.py

Lines changed: 132 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -6,25 +6,16 @@
66
If using the get_list_of_words function, remove the comment out hashtag from the import nltk and re in this script
77
AND run a `! pip install nltk` in the first cell of your notebook
88
'''
9-
10-
119
import pandas as pd
1210
from siuba import *
13-
1411
from calitp_data_analysis.sql import to_snakecase
15-
1612
import _script_utils
17-
18-
1913
GCS_FILE_PATH = 'gs://calitp-analytics-data/data-analyses/dla/dla-iija'
2014

21-
2215
# import nltk
2316
# from nltk.corpus import stopwords
2417
# from nltk.tokenize import word_tokenize, sent_tokenize
2518
# import re
26-
27-
2819
def read_data_all():
2920
proj = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/CopyofFMIS_Projects_Universe_IIJA_Reporting_4.xls",
3021
# sheet_name='FMIS 5 Projects ', header=[3]
@@ -42,6 +33,10 @@ def read_data_all():
4233
return proj
4334

4435

36+
'''
37+
Program Code
38+
Functions
39+
'''
4540
# def update_program_code_list():
4641

4742
# ## read in the program codes
@@ -59,11 +54,6 @@ def read_data_all():
5954

6055
# return program_codes
6156

62-
63-
'''
64-
Updated version of the update_program_code_list to alter program names if needed.
65-
66-
'''
6757
def update_program_code_list2():
6858
updated_codes = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/program_codes/FY21-22ProgramCodesAsOf5-25-2022.v2_expanded090823.xlsx"))
6959
updated_codes = updated_codes>>select(_.iija_program_code, _.new_description)
@@ -87,21 +77,142 @@ def add_program_to_row(row):
8777

8878
return program_codes
8979

80+
def add_program_to_row(row):
    """Return the row's program name, suffixed with " Program" when absent.

    Names that already contain the word "Program" anywhere are returned
    unchanged.
    """
    name = row["program_name"]
    return name if "Program" in name else name + " Program"
85+
86+
def load_program_codes_og() -> pd.DataFrame:
    """Load the original IIJA program-code sheet (Sept 2023 snapshot).

    Reads the Excel file from GCS, snake-cases the headers, and keeps
    only the code, description, and program-name columns.
    """
    path = f"{GCS_FILE_PATH}/program_codes/Copy of lst_IIJA_Code_20230908.xlsx"
    raw = to_snakecase(pd.read_excel(path))
    return raw[["iija_program_code", "description", "program_name"]]
93+
94+
def load_program_codes_sept_2023() -> pd.DataFrame:
    """Load the September 2023 expanded program-code list.

    Reads the Excel file from GCS, snake-cases the headers, and keeps
    only the code and its updated description.
    """
    path = f"{GCS_FILE_PATH}/program_codes/FY21-22ProgramCodesAsOf5-25-2022.v2_expanded090823.xlsx"
    raw = to_snakecase(pd.read_excel(path))
    return raw[["iija_program_code", "new_description"]]
101+
102+
def load_program_codes_jan_2025() -> pd.DataFrame:
    """Load the January 2025 program-code list.

    Renames ``program_code`` to ``iija_program_code`` so the sheet merges
    cleanly with the older code lists, and title-cases the short names.
    """
    keep = [
        "program_code",
        "short_name",
        "program_code_description",
        "funding_type_code",
    ]
    codes = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}/program_codes/Ycodes_01.2025.xlsx")
    )[keep]

    # Align the key column name with the other code sheets.
    codes = codes.rename(columns={"program_code": "iija_program_code"})
    codes["short_name"] = codes["short_name"].str.title()
    return codes
114+
115+
def update_program_code_list_2025() -> pd.DataFrame:
    """Build the consolidated program-code list as of January 2025.

    On January 2025, we received a new list of updated codes. Merge this
    new list with the codes received originally and in September 2023,
    preferring the newest description/name available for each code.

    Returns:
        DataFrame with columns ``iija_program_code``, ``funding_type_code``,
        ``new_description``, and ``program_name`` (every name ending in
        "Program").
    """
    # Load original codes
    original_codes_df = load_program_codes_og()

    # Load September 2023 codes
    program_codes_sept_2023 = load_program_codes_sept_2023()

    # Merge original + September first; outer keeps codes present in
    # only one of the two sheets.
    m1 = pd.merge(
        program_codes_sept_2023,
        original_codes_df,
        on="iija_program_code",
        how="outer",
        indicator=True,
    )

    # Prefer the September description; fall back to the original one.
    m1["new_description"] = (
        m1["new_description"].str.strip().fillna(m1.description)
    )

    # Delete unnecessary columns (pass a list, not a set, so the column
    # order handed to pandas is deterministic).
    m1 = m1.drop(columns=["description", "_merge"])

    # Load January 2025 codes
    program_codes_jan_2025 = load_program_codes_jan_2025()

    # Merge m1 with program codes from January 2025.
    m2 = pd.merge(
        program_codes_jan_2025,
        m1,
        on="iija_program_code",
        how="outer",
        indicator=True,
    )
    # Prefer the January 2025 description; fall back to the merged older one.
    m2["2025_description"] = (
        m2["program_code_description"].str.strip().fillna(m2.new_description)
    )

    # Prefer the established program name; fall back to the 2025 short name.
    m2["2025_program_name"] = m2.program_name.fillna(m2.short_name)

    # Delete outdated columns
    m2 = m2.drop(
        columns=[
            "short_name",
            "program_name",
            "program_code_description",
            "new_description",
            "_merge",
        ]
    )

    # Rename to match the original sheet's column names.
    m2 = m2.rename(
        columns={
            "2025_description": "new_description",
            "2025_program_name": "program_name",
        }
    )

    # Append " Program" to any program name missing the word "Program".
    m2["program_name"] = m2.apply(add_program_to_row, axis=1)
    return m2
90185

91-
## Function to add the updated program codes to the data
92186
def add_new_codes(df):
    """
    Add the updated program codes to the data.

    Maps each ``program_code`` to its current program name (as
    ``program_code_description``), merges in ``funding_type_code``,
    casts the recipient-defined text field to string, and overrides the
    ER01/ER03 descriptions per the January 2025 guidance.
    """
    #new_codes = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/FY21-22ProgramCodesAsOf5-25-2022.v2.xlsx"))
    #code_map = dict(new_codes[['iija_program_code', 'new_description']].values)

    ## adding updated program codes 05/11/23
    #new_codes = update_program_code_list2()

    ## adding updated program codes 1/30/25
    code_sheet = update_program_code_list_2025()
    name_by_code = dict(code_sheet[['iija_program_code', 'program_name']].values)
    df['program_code_description'] = df.program_code.map(name_by_code)

    # Add funding_type_code (one row per code before merging).
    funding_types = code_sheet[
        ['iija_program_code', 'funding_type_code']
    ].drop_duplicates()

    df = pd.merge(
        df,
        funding_types,
        left_on="program_code",
        right_on="iija_program_code",
        how="left",
    )

    # Turn summary_recipient_defined_text_field_1_value to a string
    df['summary_recipient_defined_text_field_1_value'] = (
        df['summary_recipient_defined_text_field_1_value'].astype(str)
    )

    # Amanda: January 2025, notified this should be called emergency supplement funding
    emergency_codes = df.program_code.isin(['ER01', 'ER03'])
    df.loc[emergency_codes, 'program_code_description'] = 'Emergency Supplement Funding'

    return df
107218

dla/iija/_script_utils.py

Lines changed: 124 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,17 +6,13 @@
66
- add project types that classify project type
77
- create a public-friendly project title
88
'''
9-
109
import numpy as np
1110
import pandas as pd
1211
from siuba import *
1312

1413
import dla_utils
15-
1614
from calitp_data_analysis.sql import to_snakecase
17-
1815
import _data_utils
19-
2016
import intake
2117

2218
# import nltk
@@ -27,7 +23,6 @@
2723

2824
GCS_FILE_PATH = 'gs://calitp-analytics-data/data-analyses/dla/dla-iija'
2925

30-
3126
def _prep_data(file_name):
3227
proj = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/{file_name}"))
3328

@@ -152,6 +147,109 @@ def identify_agency(df, identifier_col):
152147

153148
return full_df
154149

150+
def identify_agency2(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill in locodes, using the column rk_locode first
    then using the original function from Natalie.

    Rows with a filled ``rk_locode`` are matched directly against the
    locode reference sheet; rows without one fall back to
    ``_data_utils.add_name_from_locode``. The two halves are then
    re-concatenated and county/district/recipient columns are mapped in.

    NOTE(review): this function calls ``display``, which is only defined
    in an IPython/notebook environment — confirm it is never run as a
    plain script.
    """
    # Load dataframe with locodes
    locodes_df = to_snakecase(
        pd.read_excel(
            f"gs://calitp-analytics-data/data-analyses/dla/e-76Obligated/locodes_updated7122021.xlsx"
        )
    ).rename(
        columns={
            "agency_name": "implementing_agency",
        }
    )

    # Filter out for rows in which rk_locode is filled
    filled_locode_df = df.loc[df.rk_locode.notna()].reset_index(drop=True)

    # Merge the two dataframes; indicator=True adds a _merge column so
    # the match rate can be displayed below.
    filled_locode_df2 = pd.merge(
        filled_locode_df,
        locodes_df,
        left_on="rk_locode",
        right_on="agency_locode",
        how="left",
        indicator=True,
    )
    display("Rows with locodes filled")
    display(filled_locode_df2._merge.value_counts())

    # Clean
    # NOTE(review): "agency_name" was already renamed on locodes_df
    # above, so this first rename entry is presumably a no-op — verify.
    filled_locode_df2 = filled_locode_df2.rename(
        columns={
            "agency_name": "implementing_agency",
            "rk_locode": "implementing_agency_locode",
        }
    ).drop(
        columns=[
            "active_e76s______7_12_2021_",
            "mpo_locode_fads",
            "agency_locode",
            "_merge",
        ]
    )

    # Filter out for rows with missing locodes
    missing_locode_df = (df.loc[(df.rk_locode.isna())].reset_index(drop=True)).drop(
        columns=["rk_locode"]
    )

    # Fill in summary_recipient_defined_text_field_1_value so the
    # fallback lookup below never sees NaN.
    missing_locode_df.summary_recipient_defined_text_field_1_value = (
        missing_locode_df.summary_recipient_defined_text_field_1_value.fillna("None")
    )

    # Try add_name_from_locode from _data_utils (the original approach)
    missing_locode_df2 = _data_utils.add_name_from_locode(
        missing_locode_df, "summary_recipient_defined_text_field_1_value"
    )

    # Concat all the dataframes back into one result
    final_df = pd.concat([filled_locode_df2, missing_locode_df2])
    display("Do the # of rows match?")
    display(len(final_df) == len(df))

    # More cleaning: build county lookup tables from the County sheet.
    county_base = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/Copy of County.xlsx", sheet_name='County', header=[1]))
    county_base.drop(columns =['unnamed:_0', 'unnamed:_4'], axis=1, inplace=True)
    county_base['county_description'] = county_base['county_description'] + " County"

    # District per county, excluding the Multi-County bucket and
    # district 53.
    county_district = (
        locodes_df
        >> group_by(_.district, _.county_name)
        >> count(_.county_name)
        >> select(_.district, _.county_name)
        >> filter(_.county_name != "Multi-County", _.district != 53)
    )
    county_info = pd.merge(
        county_base,
        county_district,
        how="left",
        left_on="county_description",
        right_on="county_name",
    ).drop(columns=["county_name"])
    # county_code -> description / recipient / district lookup maps.
    mapping1 = dict(county_info[["county_code", "county_description"]].values)
    mapping2 = dict(county_info[["county_code", "recipient_name"]].values)
    mapping3 = dict(county_info[["county_code", "district"]].values)

    final_df["county_description"] = final_df.county_code.map(mapping1)
    final_df["recipient_name"] = final_df.county_code.map(mapping2)
    final_df["district"] = final_df.county_code.map(mapping3)

    # Normalize the statewide label.
    final_df.loc[
        final_df.county_name == "Statewide County", "county_name"] = "Statewide"

    # Default any still-unresolved agency/county to "Unknown".
    final_df["implementing_agency"] = final_df[
        "implementing_agency"
    ].fillna(value="Unknown")
    final_df["county_name"] = final_df["county_name"].fillna(
        value="Unknown"
    )
    return final_df
155253

156254
def condense_df(df):
157255
"""
@@ -160,9 +258,11 @@ def condense_df(df):
160258
# make sure columns are in string format
161259
df[['county_code', 'improvement_type',
162260
'implementing_agency_locode', 'district',
163-
'program_code_description', 'recipient_project_number']] = df[['county_code', 'improvement_type',
261+
'program_code_description', 'recipient_project_number',
262+
"funding_type_code"]] = df[['county_code', 'improvement_type',
164263
'implementing_agency_locode', 'district',
165-
'program_code_description', 'recipient_project_number']].astype(str)
264+
'program_code_description', 'recipient_project_number',
265+
"funding_type_code"]].astype(str)
166266
# copy county column over to use for project title name easier
167267
df['county_name_title'] = df['county_name']
168268
# copy program code column over to use for project description column easier
@@ -174,7 +274,7 @@ def condense_df(df):
174274
.groupby(['fmis_transaction_date','project_number', 'implementing_agency', 'summary_recipient_defined_text_field_1_value'
175275
# , 'program_code', 'program_code_description'
176276
])
177-
.agg({
277+
.agg({'funding_type_code':lambda x:'|'.join(x.unique()),
178278
'program_code':lambda x:'|'.join(x.unique()), # get unique values to concatenate ##hashing this out to group by instead
179279
'program_code_description':lambda x:'|'.join(x.unique()), # get unique values to concatenate ##hashing this out to group by instead
180280
'recipient_project_number':lambda x:'|'.join(x.unique()), #'first',
@@ -541,7 +641,23 @@ def run_script(file_name, recipient_column, df_agg_level):
541641

542642
return agg
543643

644+
def run_script2(file_name, recipient_column, df_agg_level):
    """
    Read an IIJA extract and run the full cleaning pipeline.

    Parameters:
        file_name: Excel file name under GCS_FILE_PATH to read.
        recipient_column: NOTE(review) — currently unused in this body;
            presumably kept for signature parity with run_script. Confirm
            before removing.
        df_agg_level: passed through to get_clean_data as full_or_agg.

    Returns:
        The aggregated dataset produced by get_clean_data.
    """

    ### Read in data
    proj_list = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/{file_name}"))

    ### run function to get new program codes
    proj_cleaned = _data_utils.add_new_codes(proj_list)

    ## function that adds known agency name to df (rk_locode-first variant)
    df = identify_agency2(proj_cleaned)

    ### run the data through the rest of the script
    ### return a dataset that is aggregated at the project and program code
    agg = get_clean_data(df, full_or_agg = df_agg_level)

    return agg
660+
545661
def export_to_gcs(df, export_date):
546662

547663
### pretty print the column names

0 commit comments

Comments
 (0)