v4.5.14.3 Implement denylist for flow-based CatFIM (#1413)

EmilyDeardorff · web-flow · commit 5be651409b28 · 2025-01-31T14:15:10.000-06:00
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
@@ -1,6 +1,22 @@
 All notable changes to this project will be documented in this file.
 We follow the [Semantic Versioning 2.0.0](http://semver.org/) format.
 
+## v4.5.14.3 - 2025-01-31 - [PR#1413](https://github.com/NOAA-OWP/inundation-mapping/pull/1413)
+
+Implements a denylist for flow-based CatFIM that uses the same conventions as the existing denylist functionality used in stage-based CatFIM. Adds CMUG1 to the denylist for flow-based CatFIM.
+
+### Additions
+- `tools/catfim/ahps_restricted_sites.csv`: Renamed from `stage_based_ahps_restricted_sites.csv`. Added an additional column, `catfim_type`, that specifies whether a site should be restricted for flow-based CatFIM (`flow`), stage-based CatFIM (`stage`), or both (`both`).
+
+### Changes
+- `tools/catfim/generate_categorical_fim.py`: Update the `load_restricted_sites()` function to handle restricted sites for both flow- and stage-based CatFIM.
+- `tools/catfim/generate_categorical_fim_flows.py`: Add restricted sites filtration to flow-based CatFIM processing. 
+
+### Removals
+- `tools/catfim/stage_based_ahps_restricted_sites.csv`: Renamed to `ahps_restricted_sites.csv`
+
+<br/><br/>
+
 ## v4.5.14.2 - 2025-01-24 - [PR#1178](https://github.com/NOAA-OWP/inundation-mapping/pull/1178)
 
 ### Summary
diff --git a/tools/catfim/ahps_restricted_sites.csv b/tools/catfim/ahps_restricted_sites.csv
@@ -0,0 +1,70 @@
+nws_lid,restricted_reason,catfim_type
+# RULES: Ensure that the header line stays the very first line of this file (no comments above it):
+#     -- Careful not to add commas other than the two that separate the three columns.
+#     -- Don't allow for any duplicate LID values in this list please.
+#
+# Comment lines can be added as any independent line in this csv.
+#
+AABDB,Test Site,stage
+ADLG1,Stage thresholds seem to be based on sea level and not channel thalweg,stage
+AUGG1,Stage thresholds seem to be based on sea level and not channel thalweg,stage
+BAXG1,Stage thresholds seem to be based on sea level and not channel thalweg,stage
+BRKTS,Test Site,stage
+CNNN6,Pool elevation thresholds. Disabled at request of MARFC,stage
+DMBT2,Test Site,stage
+DMSF1,Tidal Gauge,stage
+DVNIA,Test Site,both
+GVDA1,bad data in API,stage
+HFMW4,gage relocated. check again later for review,stage
+HLZU1,bad data in API,stage
+HNXCA,Test Site,stage
+HRAG1,Stage thresholds seem to be based on sea level and not channel thalweg,stage
+HUSKR,Test Site,stage
+HZT00,Test Site,stage
+HZT02,Test Site,stage
+ILMNA,Test Site,stage
+JTEST,Test Site,stage
+JYNT2,Test Site,stage
+KLNQ9,Outside U.S.,both
+LAMF1,Stage thresholds seem to be based on sea level and not channel thalweg.,stage
+MCVA3,historical gauge,both
+NVRN6,Pool elevation thresholds. Disabled at request of MARFC.,stage
+PEPN6,Pool elevation thresholds. Disabled at request of MARFC.,stage
+PRXQ9,Outside U.S.,both
+ONDN6,Inundation issues,stage
+QUTG1,Stage thresholds seem to be based on sea level and not channel thalweg,stage
+RCYF1,Out of Service,both
+RWBW3, historical gauge,stage
+SMSV2,point not in AHPS,both
+STNG1,Stage thresholds seem to be based on sea level and not channel thalweg,stage
+TESM8,Test Site,stage
+TEST,Test Site,stage
+TEST1,Test Site,stage
+TEST11,Test Site,stage
+TEST2,Test Site,stage
+TEST3,Test Site,stage
+TEST4,Test Site,stage
+TEST9,Test Site,stage
+TESTA,Test Site,stage
+TESTB,Test Site,stage
+TESTC,Test Site,stage
+TESTPT1,Test Site,stage
+TETM8,Test Site,stage
+TS2W3,Test Site,stage
+TS3W3,Test Site,stage
+TS4W3,Test Site,stage
+TSTMO,Test Site,stage
+TSTN6,Test Site,stage
+TSTSP,Test Site,stage
+TSTUP,Test Site,stage
+TTARX,Test Site,both
+VEFNV,Test Site,both
+WBYA1,Out of Service,both
+WLSA1, bad data in API,stage
+WUNTS, not a real gauge,both
+XXXN3,Test Site,stage
+YDAQ9,Outside U.S.,both
+YLKA2,Discontinued Gage,stage
+YWRQ9,Outside U.S.,both
+ZZZT2,Test Site,stage
+CMUG1,Out of Service,both
diff --git a/tools/catfim/generate_categorical_fim.py b/tools/catfim/generate_categorical_fim.py
@@ -247,13 +247,13 @@ def process_generate_categorical_fim(
     # STAGE-BASED
     if is_stage_based:
         # Generate Stage-Based CatFIM mapping
-        # does flows and inundation  (mapping)
+        # does flows and inundation (mapping)
 
         catfim_sites_file_path = os.path.join(output_mapping_dir, 'stage_based_catfim_sites.gpkg')
 
         if step_num <= 1:
 
-            df_restricted_sites = load_restricted_sites()
+            df_restricted_sites = load_restricted_sites(is_stage_based)
 
             generate_stage_based_categorical_fim(
                 output_catfim_dir,
@@ -300,6 +300,9 @@ def process_generate_categorical_fim(
         job_flows = job_number_huc * job_number_inundate
 
         if step_num <= 1:
+
+            df_restricted_sites = load_restricted_sites(is_stage_based)
+
             generate_flows(
                 output_catfim_dir,
                 nwm_us_search,
@@ -310,6 +313,7 @@ def process_generate_categorical_fim(
                 valid_ahps_hucs,
                 nwm_metafile,
                 FLOG.LOG_FILE_PATH,
+                df_restricted_sites,
             )
             end = time.time()
             elapsed_time = (end - start) / 60
@@ -1167,23 +1171,31 @@ def __calc_stage_intervals(non_rec_stage_values_df, past_major_interval_cap, huc
     return interval_recs
 
 
-def load_restricted_sites():
+def load_restricted_sites(is_stage_based):
     """
-    At this point, only stage based uses this. But a arg of "catfim_type (stage or flow) or something
-    can be added later.
+    Previously, only stage based used this. It is now being used by stage-based and flow-based (1/24/25)
+
+    The 'catfim_type' column can have three different values: 'stage', 'flow', and 'both'. This determines
+    whether the site should be filtered out for stage-based CatFIM, flow-based CatFIM, or both of them.
 
     Returns: a dataframe for the restricted lid and the reason why:
-        "nws_lid", "restricted_reason"
+        'nws_lid', 'restricted_reason', 'catfim_type'
     """
 
-    file_name = "stage_based_ahps_restricted_sites.csv"
+    file_name = "ahps_restricted_sites.csv"
     current_script_folder = os.path.dirname(__file__)
     file_path = os.path.join(current_script_folder, file_name)
 
     df_restricted_sites = pd.read_csv(file_path, dtype=str)
 
     df_restricted_sites['nws_lid'].fillna("", inplace=True)
     df_restricted_sites['restricted_reason'].fillna("", inplace=True)
+    df_restricted_sites['catfim_type'].fillna("", inplace=True)
+
+    # remove extra whitespace on either side of all cells
+    df_restricted_sites['nws_lid'] = df_restricted_sites['nws_lid'].str.strip()
+    df_restricted_sites['restricted_reason'] = df_restricted_sites['restricted_reason'].str.strip()
+    df_restricted_sites['catfim_type'] = df_restricted_sites['catfim_type'].str.strip()
 
     # Need to drop the comment lines before doing any more processing
     df_restricted_sites.drop(
@@ -1195,11 +1207,13 @@ def load_restricted_sites():
     # There are enough conditions and a low number of rows that it is easier to
     # test / change them via a for loop
     indexs_for_recs_to_be_removed_from_list = []
+
+    # Clean up dataframe
     for ind, row in df_restricted_sites.iterrows():
         nws_lid = row['nws_lid']
         restricted_reason = row['restricted_reason']
 
-        if len(nws_lid) != 5:  # could be just a blank row in the
+        if len(nws_lid) != 5:  # Invalid row, could be just a blank row in the file
             FLOG.warning(
                 f"From the ahps_restricted_sites list, an invalid nws_lid value of '{nws_lid}'"
                 " and has dropped from processing"
@@ -1213,14 +1227,22 @@ def load_restricted_sites():
             df_restricted_sites.at[ind, 'restricted_reason'] = restricted_reason
             FLOG.warning(f"{restricted_reason}. Lid is '{nws_lid}'")
         continue
-    # end for
+    # end loop
 
-    # Invalid records (not dropping, just completely invalid recs from the csv)
+    # Invalid records in CSV (not dropping, just completely invalid recs from the csv)
     # Could be just blank rows from the csv
     if len(indexs_for_recs_to_be_removed_from_list) > 0:
         df_restricted_sites = df_restricted_sites.drop(indexs_for_recs_to_be_removed_from_list).reset_index()
 
-    # print(df_restricted_sites.head(10))
+    # Filter df_restricted_sites by CatFIM type
+    if is_stage_based == True:  # Keep rows where 'catfim_type' is either 'stage' or 'both'
+        df_restricted_sites = df_restricted_sites[df_restricted_sites['catfim_type'].isin(['stage', 'both'])]
+
+    else:  # Keep rows where 'catfim_type' is either 'flow' or 'both'
+        df_restricted_sites = df_restricted_sites[df_restricted_sites['catfim_type'].isin(['flow', 'both'])]
+
+    # Remove catfim_type column
+    df_restricted_sites.drop('catfim_type', axis=1, inplace=True)
 
     return df_restricted_sites
 
@@ -1526,6 +1548,7 @@ def generate_stage_based_categorical_fim(
             lst_hucs,
             nwm_metafile,
             str(FLOG.LOG_FILE_PATH),
+            df_restricted_sites,
         )
     )
 
diff --git a/tools/catfim/generate_categorical_fim_flows.py b/tools/catfim/generate_categorical_fim_flows.py
@@ -66,6 +66,7 @@ def generate_flows_for_huc(
     nwm_flows_df,
     parent_log_output_file,
     child_log_file_prefix,
+    df_restricted_sites,
 ):
 
     try:
@@ -127,6 +128,17 @@ def generate_flows_for_huc(
             # Convert lid to lower case
             lid = lid.lower()
 
+            # Check whether LID is in the restricted sites list
+            found_restrict_lid = df_restricted_sites.loc[df_restricted_sites['nws_lid'] == lid.upper()]
+
+            # Assume only one rec for now, fix later
+            if len(found_restrict_lid) > 0:
+                reason = found_restrict_lid.iloc[0, found_restrict_lid.columns.get_loc("restricted_reason")]
+                msg = ':' + reason
+                all_messages.append(lid + msg)
+                MP_LOG.warning(huc_lid_id + msg)
+                continue
+
             # TODO:  Jun 17, 2024 - This gets recalled for every huc but only uses the nws_list.
             # Move this somewhere outside the huc list so it doesn't need to be called over and over again
 
@@ -363,6 +375,7 @@ def generate_flows(
     lst_hucs,
     nwm_metafile,
     log_output_file,
+    df_restricted_sites,
 ):
 
     # TODO; Most docstrings like this are now very outdated and need updating
@@ -509,6 +522,7 @@ def generate_flows(
                 nwm_flows_region_df,
                 log_output_file,
                 child_log_file_prefix,
+                df_restricted_sites,
             )
     # end ProcessPoolExecutor
 
diff --git a/tools/catfim/stage_based_ahps_restricted_sites.csv b/tools/catfim/stage_based_ahps_restricted_sites.csv