reproduce_figs.py
#%%
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from statsmodels.nonparametric.smoothers_lowess import lowess
import geopandas as gpd
import matplotlib
from sklearn.utils import resample
from sklearn.metrics import mean_squared_error
from matplotlib import gridspec
from scipy.optimize import curve_fit
import pickle
def power_law(x, a, b):
    return a * (x ** b)

def inverse_power_law(y, a, b):
    return (y / a) ** (1 / b)
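# Illustrative round-trip check with arbitrary values (not part of the original
# analysis): y = a * x**b implies x = (y / a)**(1 / b), so the two functions
# invert each other for positive a, b, and x.
assert np.isclose(inverse_power_law(power_law(10.0, 0.5, 1.2), 0.5, 1.2), 10.0)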
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
matplotlib.rcParams['font.family'] = 'sans-serif'
matplotlib.rcParams['font.sans-serif'] = ['Helvetica']
sns.set_style('whitegrid')
df_hist = pd.read_excel('data/manuscript_data/Supplementary Table 1.xlsx')
df_hist['normalized_distance'] = df_hist['normalized_distance'].astype(float)
# Set font size and style
sns.set(font_scale=1.0, style="white")
# Create figure and axes
fig, ax = plt.subplots(figsize=(4.5, 1.75))
# Plot histogram
sns.histplot(data=df_hist, x='normalized_distance', kde=False, fill=True,
             alpha=.5, edgecolor='black', stat="percent", binwidth=.05,
             color='#4FA0CA', linewidth=.50)
# Set axis labels and title
ax.set_xlabel('$X_N$', fontsize=6)
ax.set_ylabel('Percent (%)', fontsize=6)
ax.tick_params(axis='both', which='major', labelsize=6, width=0, length=2)
ax.spines['bottom'].set_linewidth(0.5) # Adjust the line width of the x-axis
ax.spines['left'].set_linewidth(0.5) # Adjust the line width of the y-axis
sns.despine()
#%%
# Load data
df = pd.read_excel('data/manuscript_data/starting_data.xlsx')
# Preprocessing calculations
df['sar_mean'] = df[['sar1', 'sar2', 'sar3']].mean(axis=1)
df['sm_mean'] = df[['sm1', 'sm2', 'sm3']].mean(axis=1)
df['gamma'] = df['sar_mean'] / df['sm_mean']
df['spr'] = df['normalized_distance'] / df['sm_mean']
# Standard error of the mean for the sar and sm replicates (n = 3)
df['sar_mean_error'] = df[['sar1', 'sar2', 'sar3']].std(axis=1) / np.sqrt(3)
df['sm_mean_error'] = df[['sm1', 'sm2', 'sm3']].std(axis=1) / np.sqrt(3)
# Calculate gamma and its log-transform
df['log_gamma'] = np.log(df['sar_mean']) - np.log(df['sm_mean'])
# Calculate the standard error of the log-transformed means
df['log_sar_mean_SE'] = df['sar_mean_error'] / df['sar_mean']
df['log_sm_mean_SE'] = df['sm_mean_error'] / df['sm_mean']
# Calculate the absolute error for gamma in the log domain and adjust for the back-transform
df['log_gamma_error'] = np.sqrt(df['log_sar_mean_SE']**2 + df['log_sm_mean_SE']**2)
df['gamma_error_upper'] = np.exp(df['log_gamma'] + df['log_gamma_error']) - df['gamma']
# Calculate the lower error and ensure it doesn't go below the minimum allowed value after back-transform
df['gamma_error_lower'] = np.maximum(df['gamma'] - np.exp(df['log_gamma'] - df['log_gamma_error']), 0.1)
# Calculate the relative errors adjusted for the log scale plot
df['yerr_upper_relative'] = df['gamma_error_upper'] / df['gamma']
df['yerr_lower_relative'] = df['gamma'] - np.maximum(df['gamma'] - df['gamma_error_lower'], 0.1)
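# Note on the propagation above: by the delta method, SE(ln x) ~= SE(x) / x.
# Since log_gamma = ln(sar_mean) - ln(sm_mean), the two log-scale errors add in
# quadrature, and back-transforming exp(log_gamma +/- log_gamma_error) yields
# the asymmetric upper and lower error bars around gamma used for plotting.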
# Load XGBoost model and make predictions
xgb_reg = xgb.XGBRegressor()
xgb_reg.load_model("data/BASED_model/models/based_us_early_stopping_combat_overfitting.ubj")
with open('data/BASED_model/inverse_power_law_params.pickle', 'rb') as f:
    params = pickle.load(f)
# Apply the power-law correction to the discharge values
df['corrected_discharge'] = inverse_power_law(df['discharge_max_cms'], *params)
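# This maps the modeled maximum discharge onto the measured bankfull-discharge
# scale via the inverse power law; the pickled parameters presumably come from
# the curve fit performed in the next cell.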
guesswork_full = df[['width_m', 'sm_mean', 'corrected_discharge']].astype(float)
guesswork_full.columns = ['width', 'slope', 'discharge']
# Use width, mean slope, and corrected discharge to predict channel depth
df['xgb_depth'] = xgb_reg.predict(guesswork_full)
df['xgb_depth'] = np.where(df['xgb_depth'] < 0, 0.1, df['xgb_depth'])  # floor non-physical negative predictions at 0.1 m
# Alluvial ridge height (har) above the floodplain at each transect
df['har1_m'] = df['are1_m'] - df['fpe1_m']
df['har2_m'] = df['are2_m'] - df['fpe2_m']
df['har3_m'] = df['are3_m'] - df['fpe3_m']
# Superelevation calculations: a = alluvial ridge elevation minus water surface
# elevation at each transect, b = predicted channel depth
df['a1'] = df['are1_m'] - df['wse1_m']
df['b1'] = df['xgb_depth']
df['a2'] = df['are2_m'] - df['wse2_m']
df['b2'] = df['xgb_depth']
df['a3'] = df['are3_m'] - df['wse3_m']
df['b3'] = df['xgb_depth']
# Average the 'a' and 'b' values across the three transects
df['a'] = df[['a1', 'a2', 'a3']].mean(axis=1)
df['b'] = df[['b1', 'b2', 'b3']].mean(axis=1)
# Ratio of superelevation amplitude to predicted channel depth
df['a_over_b'] = abs(df['a']) / abs(df['b'])
# Masks for each measurement method
mask1 = df['method_used'] == 1
mask2 = df['method_used'] == 2
mask3 = df['method_used'] == 3
# Method 1: normalize ridge relief by the modeled channel depth
df.loc[mask1, 'se_max1'] = df['har1_m'] / df['xgb_depth']
df.loc[mask1, 'se_max2'] = df['har2_m'] / df['xgb_depth']
df.loc[mask1, 'se_max3'] = df['har3_m'] / df['xgb_depth']
# Method 2 (most confident): normalize by ridge elevation minus water surface elevation
df.loc[mask2, 'se_max1'] = df['har1_m'] / (df['are1_m'] - df['wse1_m'])
df.loc[mask2, 'se_max2'] = df['har2_m'] / (df['are2_m'] - df['wse2_m'])
df.loc[mask2, 'se_max3'] = df['har3_m'] / (df['are3_m'] - df['wse3_m'])
# Method 3: normalize by ridge elevation minus estimated channel-bed elevation (wse - depth)
df.loc[mask3, 'se_max1'] = df['har1_m'] / (df['are1_m'] - (df['wse1_m'] - df['xgb_depth']))
df.loc[mask3, 'se_max2'] = df['har2_m'] / (df['are2_m'] - (df['wse2_m'] - df['xgb_depth']))
df.loc[mask3, 'se_max3'] = df['har3_m'] / (df['are3_m'] - (df['wse3_m'] - df['xgb_depth']))
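# In all three methods, se_max is a superelevation ratio: ridge relief (har)
# divided by an estimate of flow depth; the methods differ only in how that
# depth is estimated for a given site.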
df['se_std'] = df[['se_max1', 'se_max2', 'se_max3']].std(axis=1)
df['beta'] = df[['se_max1', 'se_max2', 'se_max3']].mean(axis=1)
df['beta_sem'] = df[['se_max1', 'se_max2', 'se_max3']].sem(axis=1) / 3
# Additional processing
# df['Geomorphology'] = df['Geomorphology'].replace({'Mountain Front': 'Alluvial Plain'})
df['lambda'] = df['beta'] * df['gamma']
df['ab_flag'] = np.abs(df['a']) > np.abs(df['b'])  # True where superelevation amplitude exceeds predicted depth
#%%
# Load held-out test data and the Trampush dataset
ytest = pd.read_csv("data/BASED_model/train_test/based_us_ytest_data_sans_trampush.csv", usecols=["depth"]).astype(float)
Xtest = pd.read_csv("data/BASED_model/train_test/based_us_Xtest_data_sans_trampush.csv")
trampush_csv = pd.read_csv("data/BASED_model/TrampushDataClean.csv").dropna(subset=["RA_dis_m3_pmx_2", "Qbf [m3/s]"])
# Remove misaligned sites from the Trampush dataset
trampush_csv = trampush_csv.query("Location != 'Ditch Run near Hancock'")
trampush_csv = trampush_csv.query("Location != 'Potomac River Tribuatary near Hancock. Md.'")
trampush_csv = trampush_csv.query("Location != 'Clark Fork tributary near Drummond'")
trampush_csv = trampush_csv.query("Location != 'Richards Creek near Libby MT'")
trampush_csv = trampush_csv.query("Location != 'Ohio Brush Creek near West Union'")
trampush_csv = trampush_csv[trampush_csv["Location"] != "Floyd's Fork at Fisherville"]
# Load the XGBoost model trained without the Trampush data
xgb_reg = xgb.XGBRegressor()
xgb_reg.load_model("data/BASED_model/models/based_us_sans_trampush_early_stopping_combat_overfitting.ubj")
# Fit a power law mapping measured bankfull discharge (Qbf) to the modeled discharge (RA_dis_m3_pmx_2)
x_data = trampush_csv['Qbf [m3/s]'].values
y_data = trampush_csv['RA_dis_m3_pmx_2'].values
params, covariance = curve_fit(power_law, x_data, y_data)
min_val = min(trampush_csv['Qbf [m3/s]'].min(), trampush_csv['RA_dis_m3_pmx_2'].min())
max_val = max(trampush_csv['Qbf [m3/s]'].max(), trampush_csv['RA_dis_m3_pmx_2'].max())
x_fit = np.linspace(min_val, max_val, 100)
y_fit = power_law(x_fit, *params)
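# Optional sanity check of the fit (illustrative; commented out so the script's
# behavior is unchanged): report the fitted parameters and log-space RMSE.
# log_rmse = np.sqrt(mean_squared_error(np.log(y_data), np.log(power_law(x_data, *params))))
# print(f"power law: a = {params[0]:.3g}, b = {params[1]:.3g}, log-RMSE = {log_rmse:.3f}")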
# Load the saved inverse power-law parameters (overriding the freshly fit values above)
with open('data/BASED_model/inverse_power_law_params.pickle', 'rb') as f:
    params = pickle.load(f)
# Correct the modeled discharge back to the measured-discharge scale
trampush_csv['corrected_discharge'] = inverse_power_law(trampush_csv['RA_dis_m3_pmx_2'], *params)
# Predict depth using the raw modeled discharge
guessworkRA = trampush_csv[['Wbf [m]', 'S [-]', 'RA_dis_m3_pmx_2']].astype(float)
guessworkRA.columns = ['width', 'slope', 'discharge']
trampush_csv['XGB_depth_RA'] = xgb_reg.predict(guessworkRA)
# Predict depth for the held-out test data
depth_predictions = pd.Series(xgb_reg.predict((Xtest[['width', 'slope', 'discharge']]))).astype(float)
# Predict depth using the manually recorded discharge
guessworkM = trampush_csv[['Wbf [m]', 'S [-]', 'Qbf [m3/s]']].astype(float)
guessworkM.columns = ['width', 'slope', 'discharge']
trampush_csv['XGB_depth_M'] = xgb_reg.predict(guessworkM)
# Predict depth using the corrected modeled discharge
guessworkRA_corr = trampush_csv[['Wbf [m]', 'S [-]', 'corrected_discharge']].astype(float)
guessworkRA_corr.columns = ['width', 'slope', 'discharge']
trampush_csv['XGB_depth_RA_corr'] = xgb_reg.predict(guessworkRA_corr)
# Back-calculate horizontal distance from ridge height (alluvial ridge minus floodplain elevation) and sar: distance = har / sar
df['sar1_distance_m'] = df['har1_m']/df['sar1']
df['sar2_distance_m'] = df['har2_m']/df['sar2']
df['sar3_distance_m'] = df['har3_m']/df['sar3']
# Save the processed Trampush data and the main DataFrame
trampush_csv.to_csv("data/figure2_data/TrampushDataCleanProcessed.csv", index=False)
df.to_csv("data/figure2_data/fig2_data.csv", index=False)
# %%