Skip to content

Commit 3b20812

Browse files
Add restart on failure capability for the forecast executable (#2510)
This PR: - enables restart capability of the forecast executable from a previous failure. - saves restarts during the run in a new `DATA` structure. The current `DATA` structure: ![current `DATA`](https://github.com/NOAA-EMC/global-workflow/assets/11394126/03383e2f-b7f8-43e0-8b78-c8f37a79ab84) is being replaced by: ![Screenshot 2024-04-19 at 12 55 44 PM](https://github.com/NOAA-EMC/global-workflow/assets/11394126/8ab6e6df-bbdb-43cf-b0dc-8e066f537ee7) where the colored boxes are described as: ![Screenshot 2024-04-19 at 12 56 14 PM](https://github.com/NOAA-EMC/global-workflow/assets/11394126/30b20e50-6cc8-4433-988a-02d5b484e7b5) - saves model output from `MOM6` and `CICE` within `MOM6_OUTPUT/` and `CICE_OUTPUT/` sub-directories. This is done to keep the run directory clean and easily identify component output. This PR also: - replaces links with copies. This enables the creation of a `DATA` directory that is self-contained and can be used to diagnose issues during failures. This is an NCO EE2 requirement and addresses part of an outstanding bugzilla. In the process of enabling the restart capability, functionality from `forecast_postdet.sh` that does not depend on the outcome of `forecast_det.sh` is moved to `forecast_predet.sh`. `forecast_det.sh` determines where the initial conditions will come from: `COM` in the case of a clean run or `DATArestart` in the case of a `RERUN`. This should make it easier to separate **static** configuration and data (fix files, etc) from **runtime** configuration (namelists, etc) and data (initial conditions). Additionally, this PR: - adds 3 utility shell scripts in `test/`. - `nccmp.sh` - compare netCDF files using `nccmp` - `g2cmp.sh` - compare grib2 files using `wgrib2` - `f90nmlcmp.sh` - compare Fortran90 nml files using `f90nml` (Requires modulefiles to load `py-f90nml` module on RDHPCS platforms). They are not used in the workflow, but are useful for users to compare files. 
Resolves #2273 Co-authored-by: Walter Kolczynski - NOAA <[email protected]>
1 parent 1b6cef5 commit 3b20812

26 files changed

+1160
-1012
lines changed

Diff for: .gitignore

+1-1
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@ ush/global_cycle.sh
177177
ush/global_cycle_driver.sh
178178
ush/jediinc2fv3.py
179179
ush/ufsda
180-
ush/finddate.sh
180+
ush/soca
181181
ush/make_NTC_file.pl
182182
ush/make_ntc_bull.pl
183183
ush/make_tif.sh

Diff for: env/HERA.env

+5-2
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,11 @@ export OMP_STACKSIZE=2048000
3030
export NTHSTACK=1024000000
3131
#export LD_BIND_NOW=1
3232

33-
ulimit -s unlimited
34-
ulimit -a
33+
# Setting stacksize to unlimited on login nodes is prohibited
34+
if [[ -n "${SLURM_JOB_ID:-}" ]]; then
35+
ulimit -s unlimited
36+
ulimit -a
37+
fi
3538

3639
if [[ "${step}" = "prep" ]] || [[ "${step}" = "prepbufr" ]]; then
3740

Diff for: jobs/JGLOBAL_FORECAST

+29-19
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,29 @@
11
#! /usr/bin/env bash
22

33
source "${HOMEgfs}/ush/preamble.sh"
4+
45
if (( 10#${ENSMEM:-0} > 0 )); then
6+
export DATAjob="${DATAROOT}/${RUN}efcs${ENSMEM}.${PDY:-}${cyc}"
7+
export DATA="${DATAjob}/${jobid}"
58
source "${HOMEgfs}/ush/jjob_header.sh" -e "efcs" -c "base fcst efcs"
69
else
10+
export DATAjob="${DATAROOT}/${RUN}fcst.${PDY:-}${cyc}"
11+
export DATA="${DATAjob}/${jobid}"
712
source "${HOMEgfs}/ush/jjob_header.sh" -e "fcst" -c "base fcst"
813
fi
914

15+
# Create the directory to hold restarts and output from the model in stmp
16+
export DATArestart="${DATAjob}/restart"
17+
if [[ ! -d "${DATArestart}" ]]; then mkdir -p "${DATArestart}"; fi
18+
export DATAoutput="${DATAjob}/output"
19+
if [[ ! -d "${DATAoutput}" ]]; then mkdir -p "${DATAoutput}"; fi
20+
1021
##############################################
1122
# Begin JOB SPECIFIC work
1223
##############################################
1324

1425
# Restart conditions for GFS cycle come from GDAS
15-
rCDUMP=${RUN}
26+
rCDUMP="${RUN}"
1627
export rCDUMP="${RUN/gfs/gdas}"
1728

1829
# Ignore possible spelling error (nothing is misspelled)
@@ -24,47 +35,46 @@ declare -rx gPDY="${GDATE:0:8}"
2435
declare -rx gcyc="${GDATE:8:2}"
2536

2637
# Construct COM variables from templates (see config.com)
27-
YMD=${PDY} HH=${cyc} declare_from_tmpl -rx COM_ATMOS_RESTART COM_ATMOS_INPUT COM_ATMOS_ANALYSIS \
38+
YMD="${PDY}" HH="${cyc}" declare_from_tmpl -rx COM_ATMOS_RESTART COM_ATMOS_INPUT COM_ATMOS_ANALYSIS \
2839
COM_ATMOS_HISTORY COM_ATMOS_MASTER COM_TOP COM_CONF
2940

30-
RUN=${rCDUMP} YMD="${gPDY}" HH="${gcyc}" declare_from_tmpl -rx \
41+
RUN="${rCDUMP}" YMD="${gPDY}" HH="${gcyc}" declare_from_tmpl -rx \
3142
COM_ATMOS_RESTART_PREV:COM_ATMOS_RESTART_TMPL
3243

33-
if [[ ${DO_WAVE} == "YES" ]]; then
34-
YMD=${PDY} HH=${cyc} declare_from_tmpl -rx COM_WAVE_RESTART COM_WAVE_PREP COM_WAVE_HISTORY
35-
RUN=${rCDUMP} YMD="${gPDY}" HH="${gcyc}" declare_from_tmpl -rx \
44+
if [[ "${DO_WAVE}" == "YES" ]]; then
45+
YMD="${PDY}" HH="${cyc}" declare_from_tmpl -rx COM_WAVE_RESTART COM_WAVE_PREP COM_WAVE_HISTORY
46+
RUN="${rCDUMP}" YMD="${gPDY}" HH="${gcyc}" declare_from_tmpl -rx \
3647
COM_WAVE_RESTART_PREV:COM_WAVE_RESTART_TMPL
37-
declare -rx RUNwave="${RUN}wave"
3848
fi
3949

40-
if [[ ${DO_OCN} == "YES" ]]; then
41-
YMD=${PDY} HH=${cyc} declare_from_tmpl -rx COM_MED_RESTART COM_OCEAN_RESTART COM_OCEAN_INPUT \
50+
if [[ "${DO_OCN}" == "YES" ]]; then
51+
YMD="${PDY}" HH="${cyc}" declare_from_tmpl -rx COM_MED_RESTART COM_OCEAN_RESTART COM_OCEAN_INPUT \
4252
COM_OCEAN_HISTORY COM_OCEAN_ANALYSIS
43-
RUN=${rCDUMP} YMD="${gPDY}" HH="${gcyc}" declare_from_tmpl -rx \
53+
RUN="${rCDUMP}" YMD="${gPDY}" HH="${gcyc}" declare_from_tmpl -rx \
4454
COM_OCEAN_RESTART_PREV:COM_OCEAN_RESTART_TMPL
4555
fi
4656

47-
if [[ ${DO_ICE} == "YES" ]]; then
48-
YMD=${PDY} HH=${cyc} declare_from_tmpl -rx COM_ICE_HISTORY COM_ICE_INPUT COM_ICE_RESTART
49-
RUN=${rCDUMP} YMD="${gPDY}" HH="${gcyc}" declare_from_tmpl -rx \
57+
if [[ "${DO_ICE}" == "YES" ]]; then
58+
YMD="${PDY}" HH="${cyc}" declare_from_tmpl -rx COM_ICE_HISTORY COM_ICE_INPUT COM_ICE_RESTART
59+
RUN="${rCDUMP}" YMD="${gPDY}" HH="${gcyc}" declare_from_tmpl -rx \
5060
COM_ICE_RESTART_PREV:COM_ICE_RESTART_TMPL
5161
fi
5262

53-
if [[ ${DO_AERO} == "YES" ]]; then
54-
YMD=${PDY} HH=${cyc} declare_from_tmpl -rx COM_CHEM_HISTORY
63+
if [[ "${DO_AERO}" == "YES" ]]; then
64+
YMD="${PDY}" HH="${cyc}" declare_from_tmpl -rx COM_CHEM_HISTORY
5565
fi
5666

5767

5868
###############################################################
5969
# Run relevant exglobal script
6070
###############################################################
61-
${FORECASTSH:-${SCRgfs}/exglobal_forecast.sh}
71+
"${FORECASTSH:-${SCRgfs}/exglobal_forecast.sh}"
6272
status=$?
63-
[[ ${status} -ne 0 ]] && exit "${status}"
73+
(( status != 0 )) && exit "${status}"
6474

6575
# Send DBN alerts for EnKF
6676
# TODO: Should these be in post manager instead?
67-
if [[ "${RUN}" =~ "enkf" ]] && [[ "${SENDDBN}" = YES ]]; then
77+
if [[ "${RUN}" =~ "enkf" ]] && [[ "${SENDDBN:-}" == YES ]]; then
6878
for (( fhr = FHOUT; fhr <= FHMAX; fhr + FHOUT )); do
6979
if (( fhr % 3 == 0 )); then
7080
fhr3=$(printf %03i "${fhr}")
@@ -88,6 +98,6 @@ fi
8898
# Remove the Temporary working directory
8999
##########################################
90100
cd "${DATAROOT}" || true
91-
[[ ${KEEPDATA} = "NO" ]] && rm -rf "${DATA}"
101+
# When KEEPDATA is "NO", remove this job's working directory and the in-run
# restart directory. Each path is quoted separately: a single quoted
# "${DATA} ${DATArestart}" would name one non-existent path and delete nothing.
[[ "${KEEPDATA}" == "NO" ]] && rm -rf "${DATA}" "${DATArestart}" # do not remove DATAjob. It contains DATAoutput
92102

93103
exit 0

Diff for: modulefiles/module_base.hera.lua

+1
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ load(pathJoin("gsi-ncdiag", (os.getenv("gsi_ncdiag_ver") or "None")))
3030
load(pathJoin("crtm", (os.getenv("crtm_ver") or "None")))
3131
load(pathJoin("bufr", (os.getenv("bufr_ver") or "None")))
3232
load(pathJoin("wgrib2", (os.getenv("wgrib2_ver") or "None")))
33+
load(pathJoin("py-f90nml", (os.getenv("py_f90nml_ver") or "None")))
3334
load(pathJoin("py-netcdf4", (os.getenv("py_netcdf4_ver") or "None")))
3435
load(pathJoin("py-pyyaml", (os.getenv("py_pyyaml_ver") or "None")))
3536
load(pathJoin("py-jinja2", (os.getenv("py_jinja2_ver") or "None")))

Diff for: modulefiles/module_base.hercules.lua

+1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ load(pathJoin("gsi-ncdiag", (os.getenv("gsi_ncdiag_ver") or "None")))
2626
load(pathJoin("crtm", (os.getenv("crtm_ver") or "None")))
2727
load(pathJoin("bufr", (os.getenv("bufr_ver") or "None")))
2828
load(pathJoin("wgrib2", (os.getenv("wgrib2_ver") or "None")))
29+
load(pathJoin("py-f90nml", (os.getenv("py_f90nml_ver") or "None")))
2930
load(pathJoin("py-netcdf4", (os.getenv("py_netcdf4_ver") or "None")))
3031
load(pathJoin("py-pyyaml", (os.getenv("py_pyyaml_ver") or "None")))
3132
load(pathJoin("py-jinja2", (os.getenv("py_jinja2_ver") or "None")))

Diff for: modulefiles/module_base.jet.lua

+1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ load(pathJoin("gsi-ncdiag", (os.getenv("gsi_ncdiag_ver") or "None")))
2929
load(pathJoin("crtm", (os.getenv("crtm_ver") or "None")))
3030
load(pathJoin("bufr", (os.getenv("bufr_ver") or "None")))
3131
load(pathJoin("wgrib2", (os.getenv("wgrib2_ver") or "None")))
32+
load(pathJoin("py-f90nml", (os.getenv("py_f90nml_ver") or "None")))
3233
load(pathJoin("py-netcdf4", (os.getenv("py_netcdf4_ver") or "None")))
3334
load(pathJoin("py-pyyaml", (os.getenv("py_pyyaml_ver") or "None")))
3435
load(pathJoin("py-jinja2", (os.getenv("py_jinja2_ver") or "None")))

Diff for: modulefiles/module_base.orion.lua

+1
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ load(pathJoin("gsi-ncdiag", (os.getenv("gsi_ncdiag_ver") or "None")))
2727
load(pathJoin("crtm", (os.getenv("crtm_ver") or "None")))
2828
load(pathJoin("bufr", (os.getenv("bufr_ver") or "None")))
2929
load(pathJoin("wgrib2", (os.getenv("wgrib2_ver") or "None")))
30+
load(pathJoin("py-f90nml", (os.getenv("py_f90nml_ver") or "None")))
3031
load(pathJoin("py-netcdf4", (os.getenv("py_netcdf4_ver") or "None")))
3132
load(pathJoin("py-pyyaml", (os.getenv("py_pyyaml_ver") or "None")))
3233
load(pathJoin("py-jinja2", (os.getenv("py_jinja2_ver") or "None")))

Diff for: modulefiles/module_base.s4.lua

+1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ load(pathJoin("gsi-ncdiag", (os.getenv("gsi_ncdiag_ver") or "None")))
2626
load(pathJoin("crtm", (os.getenv("crtm_ver") or "None")))
2727
load(pathJoin("bufr", (os.getenv("bufr_ver") or "None")))
2828
load(pathJoin("wgrib2", (os.getenv("wgrib2_ver") or "None")))
29+
load(pathJoin("py-f90nml", (os.getenv("py_f90nml_ver") or "None")))
2930
load(pathJoin("py-netcdf4", (os.getenv("py_netcdf4_ver") or "None")))
3031
load(pathJoin("py-pyyaml", (os.getenv("py_pyyaml_ver") or "None")))
3132
load(pathJoin("py-jinja2", (os.getenv("py_jinja2_ver") or "None")))

Diff for: parm/config/gefs/config.wave

+6-9
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,6 @@ export CDUMPwave="${RUN}wave"
1515
# In GFS/GDAS, restart files are generated/read from gdas runs
1616
export CDUMPRSTwave="gdas"
1717

18-
# Grids for wave model
19-
export waveGRD=${waveGRD:-'mx025'}
20-
2118
#grid dependent variable defaults
2219
export waveGRDN='1' # grid number for ww3_multi
2320
export waveGRDG='10' # grid group for ww3_multi
@@ -109,8 +106,8 @@ export RSTTYPE_WAV='T' # generate second tier of restart files
109106
rst_dt_gfs=$(( restart_interval_gfs * 3600 )) # TODO: This calculation needs to move to parsing_namelists_WW3.sh
110107
if [[ ${rst_dt_gfs} -gt 0 ]]; then
111108
export DT_1_RST_WAV=0 #${rst_dt_gfs:-0} # time between restart files, set to DTRST=1 for a single restart file
112-
#temporarily set to zero to avoid a clash in requested restart times
113-
#which makes the wave model crash a fix for the model issue will be coming
109+
# temporarily set to zero to avoid a clash in requested restart times
110+
# which makes the wave model crash; a fix for the model issue will be coming
114111
export DT_2_RST_WAV=${rst_dt_gfs:-0} # restart stride for checkpointing restart
115112
else
116113
rst_dt_fhmax=$(( FHMAX_WAV * 3600 ))
@@ -121,15 +118,15 @@ export RSTIOFF_WAV=0 # first restart file offset relative to m
121118
#
122119
# Set runmember to default value if not GEFS cpl run
123120
# (for a GFS coupled run, RUNMEM would be unset; this should default to -1)
124-
export RUNMEM=${RUNMEM:--1}
121+
export RUNMEM="-1"
125122
# Set wave model member tags if ensemble run
126123
# -1: no suffix, deterministic; xxxNN: extract two last digits to make ofilename prefix=gwesNN
127-
if [[ ${RUNMEM} = -1 ]]; then
124+
if (( RUNMEM == -1 )); then
128125
# No suffix added to model ID in case of deterministic run
129-
export waveMEMB=
126+
export waveMEMB=""
130127
else
131128
# Extract member number only
132-
export waveMEMB="${RUNMEM: -2}"
129+
export waveMEMB="${RUNMEM}"
133130
fi
134131

135132
# Determine if wave component needs input and/or is coupled

Diff for: parm/config/gfs/config.wave

+9-12
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,6 @@ export CDUMPwave="${RUN}wave"
1515
# In GFS/GDAS, restart files are generated/read from gdas runs
1616
export CDUMPRSTwave="gdas"
1717

18-
# Grids for wave model
19-
export waveGRD=${waveGRD:-'mx025'}
20-
2118
#grid dependent variable defaults
2219
export waveGRDN='1' # grid number for ww3_multi
2320
export waveGRDG='10' # grid group for ww3_multi
@@ -71,14 +68,14 @@ case "${waveGRD}" in
7168
export wavepostGRD='glo_500'
7269
export waveuoutpGRD=${waveGRD}
7370
;;
74-
"uglo_100km")
75-
#unstructured 100km grid
71+
"uglo_100km")
72+
#unstructured 100km grid
7673
export waveinterpGRD='glo_200'
7774
export wavepostGRD=''
7875
export waveuoutpGRD=${waveGRD}
7976
;;
8077
"uglo_m1g16")
81-
#unstructured m1v16 grid
78+
#unstructured m1v16 grid
8279
export waveinterpGRD='glo_15mxt'
8380
export wavepostGRD=''
8481
export waveuoutpGRD=${waveGRD}
@@ -139,8 +136,8 @@ else # This is a GFS run
139136
rst_dt_gfs=$(( restart_interval_gfs * 3600 )) # TODO: This calculation needs to move to parsing_namelists_WW3.sh
140137
if [[ ${rst_dt_gfs} -gt 0 ]]; then
141138
export DT_1_RST_WAV=0 #${rst_dt_gfs:-0} # time between restart files, set to DTRST=1 for a single restart file
142-
#temporarily set to zero to avoid a clash in requested restart times
143-
#which makes the wave model crash a fix for the model issue will be coming
139+
# temporarily set to zero to avoid a clash in requested restart times
140+
# which makes the wave model crash; a fix for the model issue will be coming
144141
export DT_2_RST_WAV=${rst_dt_gfs:-0} # restart stride for checkpointing restart
145142
else
146143
rst_dt_fhmax=$(( FHMAX_WAV * 3600 ))
@@ -152,15 +149,15 @@ fi
152149
#
153150
# Set runmember to default value if not GEFS cpl run
154151
# (for a GFS coupled run, RUNMEM would be unset; this should default to -1)
155-
export RUNMEM=${RUNMEM:--1}
152+
export RUNMEM="-1"
156153
# Set wave model member tags if ensemble run
157154
# -1: no suffix, deterministic; xxxNN: extract two last digits to make ofilename prefix=gwesNN
158-
if [[ ${RUNMEM} = -1 ]]; then
155+
if (( RUNMEM == -1 )); then
159156
# No suffix added to model ID in case of deterministic run
160-
export waveMEMB=
157+
export waveMEMB=""
161158
else
162159
# Extract member number only
163-
export waveMEMB="${RUNMEM: -2}"
160+
export waveMEMB="${RUNMEM}"
164161
fi
165162

166163
# Determine if wave component needs input and/or is coupled

0 commit comments

Comments
 (0)