From 20aef1d0fdeb3e487c1fe1ee57c124d6778ecbaf Mon Sep 17 00:00:00 2001 From: David Huber Date: Tue, 14 Jan 2025 14:29:28 -0600 Subject: [PATCH 01/83] Let the archive jobs store data locally in prep for globus xfers --- jobs/JGLOBAL_ARCHIVE | 72 ++++++++++++++++++------------------- parm/config/gfs/config.arch | 6 ++++ parm/config/gfs/config.base | 14 ++++++-- 3 files changed, 53 insertions(+), 39 deletions(-) diff --git a/jobs/JGLOBAL_ARCHIVE b/jobs/JGLOBAL_ARCHIVE index 401feba35f..a1fc0d8346 100755 --- a/jobs/JGLOBAL_ARCHIVE +++ b/jobs/JGLOBAL_ARCHIVE @@ -8,42 +8,42 @@ source "${HOMEgfs}/ush/jjob_header.sh" -e "arch" -c "base arch" # Set variables used in the script ############################################## YMD=${PDY} HH=${cyc} declare_from_tmpl -rx \ - COMIN_ATMOS_ANALYSIS:COM_ATMOS_ANALYSIS_TMPL \ - COMIN_ATMOS_BUFR:COM_ATMOS_BUFR_TMPL \ - COMIN_ATMOS_GEMPAK:COM_ATMOS_GEMPAK_TMPL \ - COMIN_ATMOS_GENESIS:COM_ATMOS_GENESIS_TMPL \ - COMIN_ATMOS_HISTORY:COM_ATMOS_HISTORY_TMPL \ - COMIN_ATMOS_INPUT:COM_ATMOS_INPUT_TMPL \ - COMIN_ATMOS_MASTER:COM_ATMOS_MASTER_TMPL \ - COMIN_ATMOS_RESTART:COM_ATMOS_RESTART_TMPL \ - COMIN_ATMOS_TRACK:COM_ATMOS_TRACK_TMPL \ - COMIN_ATMOS_WMO:COM_ATMOS_WMO_TMPL \ - COMIN_CHEM_HISTORY:COM_CHEM_HISTORY_TMPL \ - COMIN_CHEM_ANALYSIS:COM_CHEM_ANALYSIS_TMPL \ - COMIN_MED_RESTART:COM_MED_RESTART_TMPL \ - COMIN_SNOW_ANALYSIS:COM_SNOW_ANALYSIS_TMPL \ - COMIN_ICE_HISTORY:COM_ICE_HISTORY_TMPL \ - COMIN_ICE_INPUT:COM_ICE_INPUT_TMPL \ - COMIN_ICE_RESTART:COM_ICE_RESTART_TMPL \ - COMIN_ICE_GRIB:COM_ICE_GRIB_TMPL \ - COMIN_OBS:COM_OBS_TMPL \ - COMIN_TOP:COM_TOP_TMPL \ - COMIN_OCEAN_HISTORY:COM_OCEAN_HISTORY_TMPL \ - COMIN_OCEAN_RESTART:COM_OCEAN_RESTART_TMPL \ - COMIN_OCEAN_GRIB:COM_OCEAN_GRIB_TMPL \ - COMIN_OCEAN_NETCDF:COM_OCEAN_NETCDF_TMPL \ - COMIN_OCEAN_ANALYSIS:COM_OCEAN_ANALYSIS_TMPL \ - COMIN_OCEAN_BMATRIX:COM_OCEAN_BMATRIX_TMPL \ - COMIN_ICE_BMATRIX:COM_ICE_BMATRIX_TMPL \ - COMIN_WAVE_GRID:COM_WAVE_GRID_TMPL \ - COMIN_WAVE_HISTORY:COM_WAVE_HISTORY_TMPL \ - COMIN_WAVE_STATION:COM_WAVE_STATION_TMPL \ - COMIN_WAVE_RESTART:COM_WAVE_RESTART_TMPL \ - COMIN_ATMOS_OZNMON:COM_ATMOS_OZNMON_TMPL \ - COMIN_ATMOS_RADMON:COM_ATMOS_RADMON_TMPL \ - COMIN_ATMOS_MINMON:COM_ATMOS_MINMON_TMPL \ - COMIN_CONF:COM_CONF_TMPL \ - COMOUT_ATMOS_TRACK:COM_ATMOS_TRACK_TMPL + COMIN_ATMOS_ANALYSIS:COM_ATMOS_ANALYSIS_TMPL \ + COMIN_ATMOS_BUFR:COM_ATMOS_BUFR_TMPL \ + COMIN_ATMOS_GEMPAK:COM_ATMOS_GEMPAK_TMPL \ + COMIN_ATMOS_GENESIS:COM_ATMOS_GENESIS_TMPL \ + COMIN_ATMOS_HISTORY:COM_ATMOS_HISTORY_TMPL \ + COMIN_ATMOS_INPUT:COM_ATMOS_INPUT_TMPL \ + COMIN_ATMOS_MASTER:COM_ATMOS_MASTER_TMPL \ + COMIN_ATMOS_RESTART:COM_ATMOS_RESTART_TMPL \ + COMIN_ATMOS_TRACK:COM_ATMOS_TRACK_TMPL \ + COMIN_ATMOS_WMO:COM_ATMOS_WMO_TMPL \ + COMIN_CHEM_HISTORY:COM_CHEM_HISTORY_TMPL \ + COMIN_CHEM_ANALYSIS:COM_CHEM_ANALYSIS_TMPL \ + COMIN_MED_RESTART:COM_MED_RESTART_TMPL \ + COMIN_SNOW_ANALYSIS:COM_SNOW_ANALYSIS_TMPL \ + COMIN_ICE_HISTORY:COM_ICE_HISTORY_TMPL \ + COMIN_ICE_INPUT:COM_ICE_INPUT_TMPL \ + COMIN_ICE_RESTART:COM_ICE_RESTART_TMPL \ + COMIN_ICE_GRIB:COM_ICE_GRIB_TMPL \ + COMIN_OBS:COM_OBS_TMPL \ + COMIN_TOP:COM_TOP_TMPL \ + COMIN_OCEAN_HISTORY:COM_OCEAN_HISTORY_TMPL \ + COMIN_OCEAN_RESTART:COM_OCEAN_RESTART_TMPL \ + COMIN_OCEAN_GRIB:COM_OCEAN_GRIB_TMPL \ + COMIN_OCEAN_NETCDF:COM_OCEAN_NETCDF_TMPL \ + COMIN_OCEAN_ANALYSIS:COM_OCEAN_ANALYSIS_TMPL \ + COMIN_OCEAN_BMATRIX:COM_OCEAN_BMATRIX_TMPL \ + COMIN_ICE_BMATRIX:COM_ICE_BMATRIX_TMPL \ + COMIN_WAVE_GRID:COM_WAVE_GRID_TMPL \ + 
COMIN_WAVE_HISTORY:COM_WAVE_HISTORY_TMPL \ + COMIN_WAVE_STATION:COM_WAVE_STATION_TMPL \ + COMIN_WAVE_RESTART:COM_WAVE_RESTART_TMPL \ + COMIN_ATMOS_OZNMON:COM_ATMOS_OZNMON_TMPL \ + COMIN_ATMOS_RADMON:COM_ATMOS_RADMON_TMPL \ + COMIN_ATMOS_MINMON:COM_ATMOS_MINMON_TMPL \ + COMIN_CONF:COM_CONF_TMPL \ + COMOUT_ATMOS_TRACK:COM_ATMOS_TRACK_TMPL for grid in "0p25" "0p50" "1p00"; do YMD=${PDY} HH=${cyc} GRID=${grid} declare_from_tmpl -rx \ diff --git a/parm/config/gfs/config.arch b/parm/config/gfs/config.arch index a23bcce6ae..bcacba8c4b 100644 --- a/parm/config/gfs/config.arch +++ b/parm/config/gfs/config.arch @@ -12,4 +12,10 @@ export ARCH_GAUSSIAN="YES" export ARCH_GAUSSIAN_FHMAX=${FHMAX_GFS} export ARCH_GAUSSIAN_FHINC=${FHOUT_GFS} +# If we are running globus archiving, create tarballs in a temporary location +if [[ "${GLOBUSARCH}" == "YES" ]]; then + export ATARDIR="${DATAROOT}/archive_rotdir/${pslot}/${RUN}" + export LOCALARCH="YES" +fi + echo "END: config.arch" diff --git a/parm/config/gfs/config.base b/parm/config/gfs/config.base index 4781f97274..f8b23e69d2 100644 --- a/parm/config/gfs/config.base +++ b/parm/config/gfs/config.base @@ -476,11 +476,19 @@ export FHMAX_FITS=132 # Archiving options export HPSSARCH="@HPSSARCH@" # save data to HPSS archive -export LOCALARCH="@LOCALARCH@" # save data to local archive -if [[ ${HPSSARCH} = "YES" ]] && [[ ${LOCALARCH} = "YES" ]]; then - echo "FATAL ERROR: Both HPSS and local archiving selected. Please choose one or the other." +export LOCALARCH="@LOCALARCH@" # save data to local archive +export GLOBUSARCH="@GLOBUSARCH@" # send data to HPSS via globus xfers to Niagara +count_arch_opts=0 +for arch_opt in "${HPSSARCH}" "${LOCALARCH}" "${GLOBUSARCH}"; do + if [[ "${arch_opt}" == "YES" ]]; then + (( count_arch_opts += 1 )); + fi +done +if [[ ${count_arch_opts} -gt 1 ]]; then + echo "FATAL ERROR: More than one archiving option selected. Please choose no more than one." 
exit 4 fi + export ARCH_CYC=00 # Archive data at this cycle for warm start and/or forecast-only capabilities export ARCH_WARMICFREQ=4 # Archive frequency in days for warm start capability export ARCH_FCSTICFREQ=1 # Archive frequency in days for gdas and gfs forecast-only capability From 33c555b1809da10e16c6571461d5abb12d653ce3 Mon Sep 17 00:00:00 2001 From: David Huber Date: Tue, 14 Jan 2025 14:30:55 -0600 Subject: [PATCH 02/83] Remove unused variables from exglobal_archive.py --- scripts/exglobal_archive.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/exglobal_archive.py b/scripts/exglobal_archive.py index ae613fb39c..edec1222ea 100755 --- a/scripts/exglobal_archive.py +++ b/scripts/exglobal_archive.py @@ -21,7 +21,7 @@ def main(): for key in ['OCNRES', 'ICERES']: try: archive.task_config[key] = f"{archive.task_config[key]:03d}" - except KeyError as ee: + except KeyError: logger.info(f"key ({key}) not found in archive.task_config!") # Pull out all the configuration keys needed to run the rest of archive steps @@ -46,7 +46,7 @@ def main(): for key in keys: try: archive_dict[key] = archive.task_config[key] - except KeyError as ee: + except KeyError: logger.warning(f"WARNING: key ({key}) not found in archive.task_config!") # Also import all COMIN* and COMOUT* directory and template variables From c2b5d13aa3c8f4362c4d6d3cdf4e3d85ec0b5741 Mon Sep 17 00:00:00 2001 From: David Huber Date: Tue, 14 Jan 2025 14:31:39 -0600 Subject: [PATCH 03/83] Do the same for earc jobs --- parm/config/gfs/config.earc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/parm/config/gfs/config.earc b/parm/config/gfs/config.earc index 00a2fa95bd..603a5be015 100644 --- a/parm/config/gfs/config.earc +++ b/parm/config/gfs/config.earc @@ -32,4 +32,11 @@ esac export RMOLDSTD_ENKF=144 export RMOLDEND_ENKF=24 +# If we are running globus archiving, create tarballs in a temporary location +if [[ "${GLOBUSARCH}" == "YES" ]]; then + export ENSGRP="${ENSGRP:-0}" + export ATARDIR="${DATAROOT}/archive_rotdir/${pslot}/${RUN}/${ENSGRP}" + export LOCALARCH="YES" +fi + echo "END: config.earc" From 61e2ddba806c54aad383bb71f1e19f790412e70d Mon Sep 17 00:00:00 2001 From: David Huber Date: Tue, 14 Jan 2025 14:33:17 -0600 Subject: [PATCH 04/83] Add globus task stubs --- jobs/JGDAS_ENKF_GLOBUS | 35 +++++++++++++ jobs/JGLOBAL_GLOBUS | 33 ++++++++++++ parm/config/gefs/config.globus | 1 + parm/config/gfs/config.globus | 14 ++++++ parm/config/gfs/config.resources | 2 +- scripts/exglobal_globus.py | 40 +++++++++++++++ workflow/applications/gefs.py | 3 ++ workflow/applications/gfs_cycled.py | 3 ++ workflow/rocoto/gefs_tasks.py | 78 +++++++++++++++++++---------- workflow/rocoto/gfs_tasks.py | 71 ++++++++++++++++++++++++-- 10 files changed, 250 insertions(+), 30 deletions(-) create mode 100644 jobs/JGDAS_ENKF_GLOBUS create mode 100644 jobs/JGLOBAL_GLOBUS create mode 120000 parm/config/gefs/config.globus create mode 100644 parm/config/gfs/config.globus create mode 100644 scripts/exglobal_globus.py diff --git a/jobs/JGDAS_ENKF_GLOBUS b/jobs/JGDAS_ENKF_GLOBUS new file mode 100644 index 0000000000..f6bb40ec9e --- /dev/null +++ b/jobs/JGDAS_ENKF_GLOBUS @@ -0,0 +1,35 @@ +#! /usr/bin/env bash + +source "${HOMEgfs}/ush/preamble.sh" +source "${HOMEgfs}/ush/jjob_header.sh" -e "globus" -c "base earc ens_group_globus" + + +############################################################### +# Run archive script +############################################################### + +"${SCRgfs}/exgdas_enkf_globus.sh" +status=$? 
+[[ ${status} -ne 0 ]] && exit "${status}" + +############################################################### + +############################################## +# End JOB SPECIFIC work +############################################## + +############################################## +# Final processing +############################################## +if [[ -e "${pgmout}" ]] ; then + cat "${pgmout}" +fi + + +########################################## +# Remove the Temporary working directory +########################################## +cd "${DATAROOT}" || (echo "${DATAROOT} does not exist. ABORT!"; exit 1) +[[ ${KEEPDATA} = "NO" ]] && rm -rf "${DATA}" + +exit 0 diff --git a/jobs/JGLOBAL_GLOBUS b/jobs/JGLOBAL_GLOBUS new file mode 100644 index 0000000000..2caf5df8cf --- /dev/null +++ b/jobs/JGLOBAL_GLOBUS @@ -0,0 +1,33 @@ +#! /usr/bin/env bash + +source "${HOMEgfs}/ush/preamble.sh" +source "${HOMEgfs}/ush/jjob_header.sh" -e "arch" -c "base arch globus" + + +############################################################### +# Run globus script +############################################################### + +${GLOBALGLOBUSSH:-${SCRgfs}/exglobal_globus.py} +status=$? +[[ ${status} -ne 0 ]] && exit "${status}" + +############################################## +# End JOB SPECIFIC work +############################################## + +############################################## +# Final processing +############################################## +if [[ -e "${pgmout}" ]] ; then + cat "${pgmout}" +fi + + +########################################## +# Remove the Temporary working directory +########################################## +cd "${DATAROOT}" || (echo "${DATAROOT} does not exist. ABORT!"; exit 1) +[[ ${KEEPDATA} = "NO" ]] && rm -rf "${DATA}" + +exit 0 diff --git a/parm/config/gefs/config.globus b/parm/config/gefs/config.globus new file mode 120000 index 0000000000..1cbde15d53 --- /dev/null +++ b/parm/config/gefs/config.globus @@ -0,0 +1 @@ +/work/noaa/global/dhuber/GW/gw_sven/parm/config/gfs/config.globus \ No newline at end of file diff --git a/parm/config/gfs/config.globus b/parm/config/gfs/config.globus new file mode 100644 index 0000000000..89f7d72493 --- /dev/null +++ b/parm/config/gfs/config.globus @@ -0,0 +1,14 @@ +#! /usr/bin/env bash + +########## config.globux ########## +# Globus specific variables + +echo "BEGIN: config.globus" + +# Get task specific resources +. 
"${EXPDIR}/config.resources" globus + +# Set the globus staging directory populated by the arch jobs +export STAGE_DIR="${DATAROOT}/archive_rotdir/${PSLOT}" + +echo "END: config.globus" diff --git a/parm/config/gfs/config.resources b/parm/config/gfs/config.resources index 5acc7e5620..c7e48bb7ad 100644 --- a/parm/config/gfs/config.resources +++ b/parm/config/gfs/config.resources @@ -1051,7 +1051,7 @@ case ${step} in export is_exclusive=True ;; - "arch" | "earc" | "getic") + "arch" | "earc" | "getic" | "globus") walltime="06:00:00" ntasks=1 tasks_per_node=1 diff --git a/scripts/exglobal_globus.py b/scripts/exglobal_globus.py new file mode 100644 index 0000000000..bfc09f27e3 --- /dev/null +++ b/scripts/exglobal_globus.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 + +import os + +from pygfs.task.globus import Globus +from wxflow import AttrDict, Logger, cast_strdict_as_dtypedict, logit + +# initialize root logger +logger = Logger(level=os.environ.get("LOGGING_LEVEL", "DEBUG"), colored_log=True) + + +@logit(logger) +def main(): + + config = cast_strdict_as_dtypedict(os.environ) + + # Instantiate the globus object + globus = Globus(config) + + keys = ['STAGE_DIR', 'current_cycle', 'RUN', 'PDY', 'NMEM_ENS', 'HOMEgfs', 'sven_dir'] + + globus_dict = AttrDict() + for key in keys: + try: + globus_dict[key] = globus.task_config[key] + except KeyError: + logger.warning(f"WARNING: key ({key}) not found in globus.task_config!") + + # Determine which tarballs to send + transfer_set = globus.configure(globus_dict) + + # Send the tarballs to HPSS via Niagara + globus.execute_transfer_data(transfer_set) + + # Clean up any temporary files + globus.clean() + + +if __name__ == '__main__': + main() diff --git a/workflow/applications/gefs.py b/workflow/applications/gefs.py index 33545eb2ec..023a89ba68 100644 --- a/workflow/applications/gefs.py +++ b/workflow/applications/gefs.py @@ -47,6 +47,9 @@ def _get_app_configs(self, run): if options['do_extractvars']: configs += ['extractvars'] + if options['globusarch']: + configs += ['globus'] + return configs @staticmethod diff --git a/workflow/applications/gfs_cycled.py b/workflow/applications/gfs_cycled.py index 4df03b9444..851fa8db9e 100644 --- a/workflow/applications/gfs_cycled.py +++ b/workflow/applications/gfs_cycled.py @@ -146,6 +146,9 @@ def _get_app_configs(self, run): 'mos_stn_prdgen', 'mos_grd_prdgen', 'mos_ext_stn_prdgen', 'mos_ext_grd_prdgen', 'mos_wx_prdgen', 'mos_wx_ext_prdgen'] + if options['globusarch']: + configs += ['globus'] + return configs @staticmethod diff --git a/workflow/rocoto/gefs_tasks.py b/workflow/rocoto/gefs_tasks.py index ca29bcdf1e..44467d3e9d 100644 --- a/workflow/rocoto/gefs_tasks.py +++ b/workflow/rocoto/gefs_tasks.py @@ -11,7 +11,7 @@ def __init__(self, app_config: AppConfig, run: str) -> None: def stage_ic(self): resources = self.get_resource('stage_ic') - task_name = f'gefs_stage_ic' + task_name = 'gefs_stage_ic' task_dict = {'task_name': task_name, 'resources': resources, 'envars': self.envars, @@ -28,7 +28,7 @@ def stage_ic(self): def waveinit(self): resources = self.get_resource('waveinit') - task_name = f'gefs_wave_init' + task_name = 'gefs_wave_init' task_dict = {'task_name': task_name, 'resources': resources, 'envars': self.envars, @@ -61,15 +61,15 @@ def prep_emissions(self): def fcst(self): dependencies = [] - dep_dict = {'type': 'task', 'name': f'gefs_stage_ic'} + dep_dict = {'type': 'task', 'name': 'gefs_stage_ic'} dependencies.append(rocoto.add_dependency(dep_dict)) if self.options['do_wave']: - dep_dict = {'type': 
'task', 'name': f'gefs_wave_init'} + dep_dict = {'type': 'task', 'name': 'gefs_wave_init'} dependencies.append(rocoto.add_dependency(dep_dict)) if self.options['do_aero_fcst']: - dep_dict = {'type': 'task', 'name': f'gefs_prep_emissions'} + dep_dict = {'type': 'task', 'name': 'gefs_prep_emissions'} dependencies.append(rocoto.add_dependency(dep_dict)) dependencies = rocoto.create_dependency(dep_condition='and', dep=dependencies) @@ -82,7 +82,7 @@ def fcst(self): fcst_vars.append(rocoto.create_envar(name=key, value=str(value))) resources = self.get_resource('fcst') - task_name = f'gefs_fcst_mem000_seg#seg#' + task_name = 'gefs_fcst_mem000_seg#seg#' task_dict = {'task_name': task_name, 'resources': resources, 'dependency': dependencies, @@ -95,7 +95,7 @@ def fcst(self): } seg_var_dict = {'seg': ' '.join([f"{seg}" for seg in range(0, num_fcst_segments)])} - metatask_dict = {'task_name': f'gefs_fcst_mem000', + metatask_dict = {'task_name': 'gefs_fcst_mem000', 'is_serial': True, 'var_dict': seg_var_dict, 'task_dict': task_dict @@ -107,15 +107,15 @@ def fcst(self): def efcs(self): dependencies = [] - dep_dict = {'type': 'task', 'name': f'gefs_stage_ic'} + dep_dict = {'type': 'task', 'name': 'gefs_stage_ic'} dependencies.append(rocoto.add_dependency(dep_dict)) if self.options['do_wave']: - dep_dict = {'type': 'task', 'name': f'gefs_wave_init'} + dep_dict = {'type': 'task', 'name': 'gefs_wave_init'} dependencies.append(rocoto.add_dependency(dep_dict)) if self.options['do_aero_fcst']: - dep_dict = {'type': 'task', 'name': f'gefs_prep_emissions'} + dep_dict = {'type': 'task', 'name': 'gefs_prep_emissions'} dependencies.append(rocoto.add_dependency(dep_dict)) dependencies = rocoto.create_dependency(dep_condition='and', dep=dependencies) @@ -279,7 +279,7 @@ def atmos_ensstat(self): for key, value in postenvar_dict.items(): postenvars.append(rocoto.create_envar(name=key, value=str(value))) - task_name = f'gefs_atmos_ensstat_f#fhr#' + task_name = 'gefs_atmos_ensstat_f#fhr#' task_dict = {'task_name': task_name, 'resources': resources, 'dependency': dependencies, @@ -299,7 +299,7 @@ def atmos_ensstat(self): fhr_var_dict = {'fhr': ' '.join([f"{fhr:03d}" for fhr in fhrs])} - fhr_metatask_dict = {'task_name': f'gefs_atmos_ensstat', + fhr_metatask_dict = {'task_name': 'gefs_atmos_ensstat', 'task_dict': task_dict, 'var_dict': fhr_var_dict} @@ -309,7 +309,7 @@ def atmos_ensstat(self): def wavepostsbs(self): deps = [] - dep_dict = {'type': 'metatask', 'name': f'gefs_fcst_mem#member#'} + dep_dict = {'type': 'metatask', 'name': 'gefs_fcst_mem#member#'} deps.append(rocoto.add_dependency(dep_dict)) dependencies = rocoto.create_dependency(dep=deps) @@ -323,7 +323,7 @@ def wavepostsbs(self): resources = self.get_resource('wavepostsbs') - task_name = f'gefs_wave_post_grid_mem#member#_f#fhr#' + task_name = 'gefs_wave_post_grid_mem#member#_f#fhr#' task_dict = {'task_name': task_name, 'resources': resources, 'dependency': dependencies, @@ -342,12 +342,12 @@ def wavepostsbs(self): fhr_var_dict = {'fhr': ' '.join([f"{fhr:03d}" for fhr in fhrs])} - fhr_metatask_dict = {'task_name': f'gefs_wave_post_grid_#member#', + fhr_metatask_dict = {'task_name': 'gefs_wave_post_grid_#member#', 'task_dict': task_dict, 'var_dict': fhr_var_dict} member_var_dict = {'member': ' '.join([f"{mem:03d}" for mem in range(0, self.nmem + 1)])} - member_metatask_dict = {'task_name': f'gefs_wave_post_grid', + member_metatask_dict = {'task_name': 'gefs_wave_post_grid', 'task_dict': fhr_metatask_dict, 'var_dict': member_var_dict} @@ -357,7 +357,7 @@ def 
wavepostsbs(self): def wavepostbndpnt(self): deps = [] - dep_dict = {'type': 'metatask', 'name': f'gefs_fcst_mem#member#'} + dep_dict = {'type': 'metatask', 'name': 'gefs_fcst_mem#member#'} deps.append(rocoto.add_dependency(dep_dict)) dependencies = rocoto.create_dependency(dep=deps) @@ -369,7 +369,7 @@ def wavepostbndpnt(self): wave_post_bndpnt_envars.append(rocoto.create_envar(name=key, value=str(value))) resources = self.get_resource('wavepostbndpnt') - task_name = f'gefs_wave_post_bndpnt_mem#member#' + task_name = 'gefs_wave_post_bndpnt_mem#member#' task_dict = {'task_name': task_name, 'resources': resources, 'dependency': dependencies, @@ -402,7 +402,7 @@ def wavepostbndpntbll(self): dep_dict = {'type': 'data', 'data': data} deps.append(rocoto.add_dependency(dep_dict)) - dep_dict = {'type': 'metatask', 'name': f'gefs_fcst_mem#member#'} + dep_dict = {'type': 'metatask', 'name': 'gefs_fcst_mem#member#'} deps.append(rocoto.add_dependency(dep_dict)) dependencies = rocoto.create_dependency(dep_condition='or', dep=deps) @@ -414,7 +414,7 @@ def wavepostbndpntbll(self): wave_post_bndpnt_bull_envars.append(rocoto.create_envar(name=key, value=str(value))) resources = self.get_resource('wavepostbndpntbll') - task_name = f'gefs_wave_post_bndpnt_bull_mem#member#' + task_name = 'gefs_wave_post_bndpnt_bull_mem#member#' task_dict = {'task_name': task_name, 'resources': resources, 'dependency': dependencies, @@ -438,10 +438,10 @@ def wavepostbndpntbll(self): def wavepostpnt(self): deps = [] - dep_dict = {'type': 'metatask', 'name': f'gefs_fcst_mem#member#'} + dep_dict = {'type': 'metatask', 'name': 'gefs_fcst_mem#member#'} deps.append(rocoto.add_dependency(dep_dict)) if self.options['do_wave_bnd']: - dep_dict = {'type': 'task', 'name': f'gefs_wave_post_bndpnt_bull_mem#member#'} + dep_dict = {'type': 'task', 'name': 'gefs_wave_post_bndpnt_bull_mem#member#'} deps.append(rocoto.add_dependency(dep_dict)) dependencies = rocoto.create_dependency(dep_condition='and', dep=deps) @@ -453,7 +453,7 @@ def wavepostpnt(self): wave_post_pnt_envars.append(rocoto.create_envar(name=key, value=str(value))) resources = self.get_resource('wavepostpnt') - task_name = f'gefs_wave_post_pnt_mem#member#' + task_name = 'gefs_wave_post_pnt_mem#member#' task_dict = {'task_name': task_name, 'resources': resources, 'dependency': dependencies, @@ -498,7 +498,7 @@ def extractvars(self): extractvars_envars.append(rocoto.create_envar(name=key, value=str(value))) resources = self.get_resource('extractvars') - task_name = f'gefs_extractvars_mem#member#' + task_name = 'gefs_extractvars_mem#member#' task_dict = {'task_name': task_name, 'resources': resources, 'dependency': dependencies, @@ -564,12 +564,38 @@ def arch(self): return task + def globus(self): + deps = [] + dep_dict = {'type': 'task', 'name': 'gefs_arch'} + deps.append(rocoto.add_dependency(dep_dict)) + dependencies = rocoto.create_dependency(dep=deps) + + resources = self.get_resource('globus') + task_name = 'globus' + task_dict = {'task_name': task_name, + 'resources': resources, + 'envars': self.envars, + 'cycledef': 'gefs', + 'dependency': dependencies, + 'command': f'{self.HOMEgfs}/jobs/rocoto/globus.sh', + 'job_name': f'{self.pslot}_{task_name}_@H', + 'log': f'{self.rotdir}/logs/@Y@m@d@H/{task_name}.log', + 'maxtries': '&MAXTRIES;' + } + + task = rocoto.create_task(task_dict) + + return task + def cleanup(self): deps = [] if self.options['do_extractvars']: - dep_dict = {'type': 'task', 'name': 'arch'} + dep_dict = {'type': 'task', 'name': 'gefs_arch'} 
deps.append(rocoto.add_dependency(dep_dict)) - dependencies = rocoto.create_dependency(dep=deps) + if self.options['globusarch']: + dep_dict = {'type': 'task', 'name': 'gefs_globus'} + deps.append(rocoto.add_dependency(dep_dict)) + dependencies = rocoto.create_dependency(dep=deps, dep_condition='and') else: dep_dict = {'type': 'metatask', 'name': 'gefs_atmos_prod'} deps.append(rocoto.add_dependency(dep_dict)) diff --git a/workflow/rocoto/gfs_tasks.py b/workflow/rocoto/gfs_tasks.py index d2a3e43719..05fcd61660 100644 --- a/workflow/rocoto/gfs_tasks.py +++ b/workflow/rocoto/gfs_tasks.py @@ -154,7 +154,7 @@ def aerosol_init(self): offset = timedelta_to_HMS(-interval) # Files from previous cycle - files = [f'@Y@m@d.@H0000.fv_core.res.nc'] + \ + files = ['@Y@m@d.@H0000.fv_core.res.nc'] + \ [f'@Y@m@d.@H0000.fv_core.res.tile{tile}.nc' for tile in range(1, self.n_tiles + 1)] + \ [f'@Y@m@d.@H0000.fv_tracer.res.tile{tile}.nc' for tile in range(1, self.n_tiles + 1)] @@ -512,7 +512,7 @@ def aeroanlvar(self): deps = [] dep_dict = { - 'type': 'task', 'name': f'gdas_aeroanlgenb', + 'type': 'task', 'name': 'gdas_aeroanlgenb', 'offset': f"-{timedelta_to_HMS(self._base['interval_gdas'])}", } deps.append(rocoto.add_dependency(dep_dict)) @@ -641,7 +641,7 @@ def prepoceanobs(self): def marineanlletkf(self): deps = [] - dep_dict = {'type': 'metatask', 'name': f'enkfgdas_fcst', 'offset': f"-{timedelta_to_HMS(self._base['interval_gdas'])}"} + dep_dict = {'type': 'metatask', 'name': 'enkfgdas_fcst', 'offset': f"-{timedelta_to_HMS(self._base['interval_gdas'])}"} deps.append(rocoto.add_dependency(dep_dict)) dep_dict = {'type': 'task', 'name': f'{self.run}_prepoceanobs'} deps.append(rocoto.add_dependency(dep_dict)) @@ -2332,15 +2332,80 @@ def arch(self): return task + # Globus transfer for HPSS archiving + def globus(self): + deps = [] + dep_dict = {'type': 'task', 'name': f'{self.run}_arch'} + deps.append(rocoto.add_dependency(dep_dict)) + dependencies = rocoto.create_dependency(dep=deps) + + resources = self.get_resource('globus') + task_name = f'{self.run}_globus' + task_dict = {'task_name': task_name, + 'resources': resources, + 'dependency': dependencies, + 'envars': self.envars, + 'cycledef': self.run.replace('enkf', ''), + 'command': f'{self.HOMEgfs}/jobs/rocoto/globus.sh', + 'job_name': f'{self.pslot}_{task_name}_@H', + 'log': f'{self.rotdir}/logs/@Y@m@d@H/{task_name}.log', + 'maxtries': '&MAXTRIES;' + } + + task = rocoto.create_task(task_dict) + + return task + + # Ensemble globus transfer for HPSS archiving + def ens_group_globus(self): + deps = [] + dep_dict = {'type': 'metatask', 'name': f'{self.run}_eamn'} + deps.append(rocoto.add_dependency(dep_dict)) + dependencies = rocoto.create_dependency(dep=deps) + + # Integer division is floor division, but we need ceiling division + n_groups = -(self.nmem // -self._configs['earc']['NMEM_EARCGRP']) + groups = ' '.join([f'{grp:02d}' for grp in range(0, n_groups + 1)]) + + resources = self.get_resource('ens_group_globus') + var_dict = {'grp': groups} + + task_name = f'{self.run}_ens_globus' + task_dict = {'task_name': task_name, + 'resources': resources, + 'dependency': dependencies, + 'envars': self.envars, + 'cycledef': self.run.replace('enkf', ''), + 'command': f'{self.HOMEgfs}/jobs/rocoto/globus.sh', + 'job_name': f'{self.pslot}_{task_name}_@H', + 'log': f'{self.rotdir}/logs/@Y@m@d@H/{task_name}.log', + 'maxtries': '&MAXTRIES;' + } + + metatask_dict = {'task_name': f'{self.run}_eglobus', + 'var_dict': var_dict, + 'task_dict': task_dict + } + + task = 
rocoto.create_task(metatask_dict) + + return task + # Cleanup def cleanup(self): deps = [] if 'enkf' in self.run: dep_dict = {'type': 'metatask', 'name': f'{self.run}_eamn'} deps.append(rocoto.add_dependency(dep_dict)) + if self.options['globusarch']: + dep_dict = {'type': 'metatask', 'name': f'{self.run}_ens_globus'} + deps.append(rocoto.add_dependency(dep_dict)) else: dep_dict = {'type': 'task', 'name': f'{self.run}_arch'} deps.append(rocoto.add_dependency(dep_dict)) + if self.options['globusarch']: + dep_dict = {'type': 'task', 'name': f'{self.run}_globus'} + deps.append(rocoto.add_dependency(dep_dict)) if self.options['do_gempak']: if self.run in ['gdas']: From 61066742d1a250de364ed82241ba86b6d587bd25 Mon Sep 17 00:00:00 2001 From: David Huber Date: Tue, 14 Jan 2025 14:33:45 -0600 Subject: [PATCH 05/83] Add path to Sven --- modulefiles/module_base.hercules.lua | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modulefiles/module_base.hercules.lua b/modulefiles/module_base.hercules.lua index 4245b0d6f9..5e672f4a66 100644 --- a/modulefiles/module_base.hercules.lua +++ b/modulefiles/module_base.hercules.lua @@ -49,4 +49,7 @@ load(pathJoin("prepobs", (os.getenv("prepobs_run_ver") or "None"))) prepend_path("MODULEPATH", pathJoin("/work/noaa/global/glopara/git_rocky9/Fit2Obs/v" .. (os.getenv("fit2obs_ver") or "None"), "modulefiles")) load(pathJoin("fit2obs", (os.getenv("fit2obs_ver") or "None"))) +-- Set the path for the globus package handler, Sven +setenv("sven_dir", "/home/gfekete/sven") + whatis("Description: GFS run environment") From 2683013e966c7953ce63da56861e0127adb8b84c Mon Sep 17 00:00:00 2001 From: David Huber Date: Tue, 14 Jan 2025 14:39:58 -0600 Subject: [PATCH 06/83] Replace globusarch with do_globusarch --- workflow/applications/applications.py | 2 +- workflow/applications/gefs.py | 2 +- workflow/applications/gfs_cycled.py | 2 +- workflow/rocoto/gefs_tasks.py | 2 +- workflow/rocoto/gfs_tasks.py | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/workflow/applications/applications.py b/workflow/applications/applications.py index 22e299df20..9ef674c8eb 100644 --- a/workflow/applications/applications.py +++ b/workflow/applications/applications.py @@ -99,7 +99,7 @@ def _get_run_options(self, conf: Configuration) -> Dict[str, Any]: run_options[run]['do_aero_anl'] = run_base.get('DO_AERO_ANL', False) run_options[run]['do_aero_fcst'] = run_base.get('DO_AERO_FCST', False) - run_options[run]['do_hpssarch'] = run_base.get('HPSSARCH', False) + run_options[run]['do_globusarch'] = run_base.get('GLOBUSARCH', False) run_options[run]['fcst_segments'] = run_base.get('FCST_SEGMENTS', None) if not AppConfig.is_monotonic(run_options[run]['fcst_segments']): diff --git a/workflow/applications/gefs.py b/workflow/applications/gefs.py index 023a89ba68..aadf325531 100644 --- a/workflow/applications/gefs.py +++ b/workflow/applications/gefs.py @@ -47,7 +47,7 @@ def _get_app_configs(self, run): if options['do_extractvars']: configs += ['extractvars'] - if options['globusarch']: + if options['do_globusarch']: configs += ['globus'] return configs diff --git a/workflow/applications/gfs_cycled.py b/workflow/applications/gfs_cycled.py index 392082f512..725968241f 100644 --- a/workflow/applications/gfs_cycled.py +++ b/workflow/applications/gfs_cycled.py @@ -146,7 +146,7 @@ def _get_app_configs(self, run): 'mos_stn_prdgen', 'mos_grd_prdgen', 'mos_ext_stn_prdgen', 'mos_ext_grd_prdgen', 'mos_wx_prdgen', 'mos_wx_ext_prdgen'] - if options['globusarch']: + if options['do_globusarch']: configs += 
['globus'] return configs diff --git a/workflow/rocoto/gefs_tasks.py b/workflow/rocoto/gefs_tasks.py index 44467d3e9d..f1723e5cf8 100644 --- a/workflow/rocoto/gefs_tasks.py +++ b/workflow/rocoto/gefs_tasks.py @@ -592,7 +592,7 @@ def cleanup(self): if self.options['do_extractvars']: dep_dict = {'type': 'task', 'name': 'gefs_arch'} deps.append(rocoto.add_dependency(dep_dict)) - if self.options['globusarch']: + if self.options['do_globusarch']: dep_dict = {'type': 'task', 'name': 'gefs_globus'} deps.append(rocoto.add_dependency(dep_dict)) dependencies = rocoto.create_dependency(dep=deps, dep_condition='and') diff --git a/workflow/rocoto/gfs_tasks.py b/workflow/rocoto/gfs_tasks.py index 3ead730e2a..9fa4c53987 100644 --- a/workflow/rocoto/gfs_tasks.py +++ b/workflow/rocoto/gfs_tasks.py @@ -2397,13 +2397,13 @@ def cleanup(self): if 'enkf' in self.run: dep_dict = {'type': 'metatask', 'name': f'{self.run}_eamn'} deps.append(rocoto.add_dependency(dep_dict)) - if self.options['globusarch']: + if self.options['do_globusarch']: dep_dict = {'type': 'metatask', 'name': f'{self.run}_ens_globus'} deps.append(rocoto.add_dependency(dep_dict)) else: dep_dict = {'type': 'task', 'name': f'{self.run}_arch'} deps.append(rocoto.add_dependency(dep_dict)) - if self.options['globusarch']: + if self.options['do_globusarch']: dep_dict = {'type': 'task', 'name': f'{self.run}_globus'} deps.append(rocoto.add_dependency(dep_dict)) From 74f681af89fb440bfa8cc4f6e250a45b4d5a969b Mon Sep 17 00:00:00 2001 From: David Huber Date: Tue, 14 Jan 2025 14:44:05 -0600 Subject: [PATCH 07/83] Update gefs config.globus --- parm/config/gefs/config.globus | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parm/config/gefs/config.globus b/parm/config/gefs/config.globus index 1cbde15d53..acbaffeb72 120000 --- a/parm/config/gefs/config.globus +++ b/parm/config/gefs/config.globus @@ -1 +1 @@ -/work/noaa/global/dhuber/GW/gw_sven/parm/config/gfs/config.globus \ No newline at end of file +../gfs/config.globus \ No newline at end of file From 0984ed176301df8424cf3deb23b664e39ae8e1cd Mon Sep 17 00:00:00 2001 From: David Huber Date: Fri, 17 Jan 2025 11:49:16 -0600 Subject: [PATCH 08/83] Add templates for sven/doorman interfaces --- jobs/JGLOBAL_ARCHIVE | 1 + parm/config/gfs/config.arch | 1 + parm/globus/dm.conf.j2 | 3 + parm/globus/doorman.sh.j2 | 3 + parm/globus/return.j2 | 6 ++ parm/globus/todo.sh.j2 | 68 ++++++++++++++++ parm/globus/verify.sh.j2 | 12 +++ scripts/exglobal_archive.py | 2 +- scripts/exglobal_globus.py | 7 +- ush/python/pygfs/task/archive.py | 31 ++++++- ush/python/pygfs/task/globus_hpss.py | 116 +++++++++++++++++++++++++++ 11 files changed, 243 insertions(+), 7 deletions(-) create mode 100644 parm/globus/dm.conf.j2 create mode 100644 parm/globus/doorman.sh.j2 create mode 100644 parm/globus/return.j2 create mode 100644 parm/globus/todo.sh.j2 create mode 100644 parm/globus/verify.sh.j2 create mode 100644 ush/python/pygfs/task/globus_hpss.py diff --git a/jobs/JGLOBAL_ARCHIVE b/jobs/JGLOBAL_ARCHIVE index a1fc0d8346..748e851182 100755 --- a/jobs/JGLOBAL_ARCHIVE +++ b/jobs/JGLOBAL_ARCHIVE @@ -43,6 +43,7 @@ YMD=${PDY} HH=${cyc} declare_from_tmpl -rx \ COMIN_ATMOS_RADMON:COM_ATMOS_RADMON_TMPL \ COMIN_ATMOS_MINMON:COM_ATMOS_MINMON_TMPL \ COMIN_CONF:COM_CONF_TMPL \ + COMOUT_CONF:COM_CONF_TMPL \ COMOUT_ATMOS_TRACK:COM_ATMOS_TRACK_TMPL for grid in "0p25" "0p50" "1p00"; do diff --git a/parm/config/gfs/config.arch b/parm/config/gfs/config.arch index bcacba8c4b..1f3149f7dc 100644 --- a/parm/config/gfs/config.arch +++ 
b/parm/config/gfs/config.arch @@ -16,6 +16,7 @@ export ARCH_GAUSSIAN_FHINC=${FHOUT_GFS} if [[ "${GLOBUSARCH}" == "YES" ]]; then export ATARDIR="${DATAROOT}/archive_rotdir/${pslot}/${RUN}" export LOCALARCH="YES" + export DATASETS_YAML="backup_tarballs.yaml" fi echo "END: config.arch" diff --git a/parm/globus/dm.conf.j2 b/parm/globus/dm.conf.j2 new file mode 100644 index 0000000000..5cea0258c3 --- /dev/null +++ b/parm/globus/dm.conf.j2 @@ -0,0 +1,3 @@ +#!/usr/bin/env bash + +export dropbox="{{sven_dropbox}}" diff --git a/parm/globus/doorman.sh.j2 b/parm/globus/doorman.sh.j2 new file mode 100644 index 0000000000..325176104f --- /dev/null +++ b/parm/globus/doorman.sh.j2 @@ -0,0 +1,3 @@ +#!/usr/bin/env bash + +# This script executes on Niagara to pull the read the todo, verify, return diff --git a/parm/globus/return.j2 b/parm/globus/return.j2 new file mode 100644 index 0000000000..974f699148 --- /dev/null +++ b/parm/globus/return.j2 @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +# Local globus endpoint UUID +SENDER_GLC="{{host_globus_uuid}}" +# Local dropbox location +SENDER_DRP="{{sven_dropbox}}" diff --git a/parm/globus/todo.sh.j2 b/parm/globus/todo.sh.j2 new file mode 100644 index 0000000000..b4be6df3a2 --- /dev/null +++ b/parm/globus/todo.sh.j2 @@ -0,0 +1,68 @@ +#!/bin/bash +# Jinja-templated variables +hpss_target_dir={{target_dir}} +globus_xfer_id={{globus_xfer_id}} +log_directory={{niagara_log_directory}} + +cwd=$(pwd) +file="${1}" +file_full="${cwd}/${1}" +mkdir -p "${log_directory}" +log_file="${log_directory}/hpss_status_${globus_xfer_id}.log" +touch "${log_file}" + +send_to_hpss() +{ + # Change the group of the file to rstprod + chgrp "rstprod" "${file_full}" > "${log_file}" 2>&1 + if [[ $? != 0 ]]; then + rm -f "${file_full}" + echo "Failed to run chmod on ${file_full}. Deleting and exiting." >> "${log_file}" + return 1 + fi + + # Check that the MD5 checksum matches what was sent + chk=$(md5sum ${file_full}) + if [[ "${chk}" != "${globus_xfer_id} ${file_full}" ]]; then + echo "MD5 checksum of ${file} does not match. Exiting." >> "${log_file}" + return 2 + fi + + # Write a command file to place the file on hpss and protect it + local hpss_target="${hpss_target_dir}/${file}" + local command_file="command_file_${globus_xfer_id}" + cat >"${command_file}" << EOF + mkdir -p "${hpss_dir}" + put "${file_full}" : "${hpss_target}" + chgrp rstprod "${hpss_target}" + chmod 640 "${hpss_target}" +EOF + hsi in "${command_file}" >> "${log_file}" 2>&1 + if [[ $? != 0 ]]; then + echo "Failed to send ${file} to HPSS and/or protect it. Deleting and exiting." >> "${log_file}" + hsi rm "${hpss_target}" + return 3 + fi + + rm -f "${command_file}" + + # Create an index file if the file is a tarball + if [[ ${file} == *.tar ]]; then + htar -Xvf "${hpss_target}" >> "${log_file}" 2>&1 + if [[ $? != 0 ]]; then + echo "Failed to create an index file for ${hpss_target}. Exiting." >> "${log_file}" + return 4 + fi + fi + + return 0 +} + +send_to_hpss +hpss_stat=$? 
+ +if [[ ${hpss_stat} -eq 0 ]]; then + echo "SUCCESS" >> "${log_file}" +else + echo "FAILURE" >> "${log_file}" +fi diff --git a/parm/globus/verify.sh.j2 b/parm/globus/verify.sh.j2 new file mode 100644 index 0000000000..8c7eec9a45 --- /dev/null +++ b/parm/globus/verify.sh.j2 @@ -0,0 +1,12 @@ +#!/bin/bash +# Jinja-templated variables +log_directory={{niagara_log_directory}} +globus_xfer_id={{globus_xfer_id}} + +cwd=$(pwd) +file="${cwd}/${1}" +log_file="${log_directory}/hpss_status_${globus_xfer_id}.log" + +hpss_stat=$(tail -1 ${log_file}) + +echo "${hpss_stat}" diff --git a/scripts/exglobal_archive.py b/scripts/exglobal_archive.py index 1d8083dee1..f9b5daa896 100755 --- a/scripts/exglobal_archive.py +++ b/scripts/exglobal_archive.py @@ -40,7 +40,7 @@ def main(): 'NMEM_ENS', 'DO_JEDIATMVAR', 'DO_VRFY_OCEANDA', 'FHMAX_FITS', 'waveGRD', 'IAUFHRS', 'DO_FIT2OBS', 'NET', 'FHOUT_HF_GFS', 'FHMAX_HF_GFS', 'REPLAY_ICS', 'OFFSET_START_HOUR', 'ARCH_EXPDIR', 'EXPDIR', 'ARCH_EXPDIR_FREQ', 'ARCH_HASHES', - 'ARCH_DIFFS', 'SDATE', 'EDATE', 'HOMEgfs', 'DO_GEMPAK'] + 'ARCH_DIFFS', 'SDATE', 'EDATE', 'HOMEgfs', 'DO_GEMPAK', 'DATASETS_YAML'] archive_dict = AttrDict() for key in keys: diff --git a/scripts/exglobal_globus.py b/scripts/exglobal_globus.py index bfc09f27e3..3fbb890aa4 100644 --- a/scripts/exglobal_globus.py +++ b/scripts/exglobal_globus.py @@ -2,7 +2,7 @@ import os -from pygfs.task.globus import Globus +from pygfs.task.globus_hpss import GlobusHpss from wxflow import AttrDict, Logger, cast_strdict_as_dtypedict, logit # initialize root logger @@ -15,9 +15,10 @@ def main(): config = cast_strdict_as_dtypedict(os.environ) # Instantiate the globus object - globus = Globus(config) + globus = GlobusHpss(config) - keys = ['STAGE_DIR', 'current_cycle', 'RUN', 'PDY', 'NMEM_ENS', 'HOMEgfs', 'sven_dir'] + keys = ['STAGE_DIR', 'current_cycle', 'RUN', 'PDY', 'NMEM_ENS', 'HOMEgfs', 'sven_dir', + 'DATASETS_YAML'] globus_dict = AttrDict() for key in keys: diff --git a/ush/python/pygfs/task/archive.py b/ush/python/pygfs/task/archive.py index ed63a22230..1f1de71aaf 100644 --- a/ush/python/pygfs/task/archive.py +++ b/ush/python/pygfs/task/archive.py @@ -9,7 +9,7 @@ from wxflow import (AttrDict, FileHandler, Hsi, Htar, Task, to_timedelta, chgrp, get_gid, logit, mkdir_p, parse_j2yaml, rm_p, rmdir, - strftime, to_YMDH, which, chdir, ProcessError) + strftime, to_YMDH, which, chdir, ProcessError, save_as_yaml) git_filename = "git_info.log" logger = getLogger(__name__.split('.')[-1]) @@ -22,8 +22,6 @@ class Archive(Task): @logit(logger, name="Archive") def __init__(self, config: Dict[str, Any]) -> None: """Constructor for the Archive task - The constructor is responsible for collecting necessary yamls based on - the runtime options and RUN. Parameters ---------- @@ -138,6 +136,10 @@ def configure(self, arch_dict: Dict[str, Any]) -> (Dict[str, Any], List[Dict[str atardir_sets.append(dataset) + # If we are running globus, save the tarball list as a YAML + if self.task_config.get('GLOBUSARCH', False): + self._create_datasets_yaml(atardir_sets) + return arcdir_set, atardir_sets @logit(logger) @@ -568,6 +570,29 @@ def _pop_git_info(self, arch_dict: Dict[str, Any]) -> Dict[str, Any]: return + @logit(logger) + def _create_datasets_yaml(self, datasets): + """ + Go through the dataset dictionaries, extract the tarball names and has_rstprod + boolean, and write a YAML with the info in COM_CONF. 
+ """ + + if len(datasets) == 0: + logger.warning("WARNING: Skipping dataset YAML creation as no datasets were provided.") + return + + com_conf = self.task_config.COMOUT_CONF + yaml_filename = self.task_config.DATASETS_YAML + yaml_filename = os.path.join(com_conf, yaml_filename) + + output_yaml = {} + + for dataset in datasets: + output_yaml[dataset.name] = {"target": dataset.target, + "has_rstprod": dataset.has_rstprod} + + save_as_yaml(output_yaml, yaml_filename) + @logit(logger) def clean(self): """ diff --git a/ush/python/pygfs/task/globus_hpss.py b/ush/python/pygfs/task/globus_hpss.py new file mode 100644 index 0000000000..5d42c70662 --- /dev/null +++ b/ush/python/pygfs/task/globus_hpss.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python3 + +import os +from logging import getLogger +from typing import Any, Dict, List + +from wxflow import (AttrDict, Task, + logit, parse_yaml) + +logger = getLogger(__name__.split('.')[-1]) + + +class GlobusHpss(Task): + """Task to send tarballs (created by the archive task) to HPSS via Globus + """ + + @logit(logger, name="GlobusHpss") + def __init__(self, config: Dict[str, Any]) -> None: + """Constructor for the GlobusHpss task + + Parameters + ---------- + config : Dict[str, Any] + Incoming configuration for the task from the environment + + Returns + ------- + None + """ + super().__init__(config) + + self.task_config = AttrDict(**self.task_config) + + @logit(logger) + def configure(self, globus_dict: Dict[str, Any]) -> (Dict[str, Any], List[Dict[str, Any]]): + """Collects the list of tarballs created by the arch task and writes + instructions to send them to HPSS via Globus. + + Parameters + ---------- + globus_dict : Dict[str, Any] + Task specific keys, e.g. the name of the input YAML. + + Return + ------ + globus_targets : List[Dict[str, Any]] + List of tarballs and instructions for sending them to HPSS via Globus + """ + + globus_parm = os.path.join(globus_dict.PARMgfs, "globus") + + com_conf = globus_dict.COMIN_CONF + + # Collect the files and properties from the input YAML + backup_yaml = os.path.join(com_conf, globus_dict.DATASETS_YAML) + + backup_set = AttrDict(**parse_yaml(backup_yaml)) + + globus_instructions = [] + for name in backup_set.values(): + + tarball = backup_set[name].target + if backup_set[name].has_rstprod: + globus_instructions.append(self._sven_rstprod_instructions(tarball)) + else: + globus_instructions.append(self._sven_instructions(tarball)) + + return globus_instructions + + @logit(logger) + def execute_transfer_data(self, atardir_set: Dict[str, Any]) -> None: + """Create a backup tarball from a yaml dict. + + Parameters + ---------- + atardir_set: Dict[str, Any] + Dict defining set of files to backup and the target tarball. + + Return + ------ + None + """ + + if atardir_set.has_rstprod: + + try: + self.cvf(atardir_set.target, atardir_set.fileset) + # Regardless of exception type, attempt to remove the target + except Exception: + self.rm_cmd(atardir_set.target) + raise RuntimeError(f"FATAL ERROR: Failed to create restricted archive {atardir_set.target}, deleting!") + + self._protect_rstprod(atardir_set) + + else: + self.cvf(atardir_set.target, atardir_set.fileset) + + @logit(logger) + def _protect_rstprod(self, atardir_set: Dict[str, Any]) -> None: + """ + Changes the group of the target tarball to rstprod and the permissions to + 640. If this fails for any reason, attempt to delete the file before exiting. 
+ + """ + + pass + + @logit(logger) + def clean(self): + """ + Remove the temporary directories/files created by the GlobusHpss task. + Presently, this is only the ROTDIR/expdir directory if EXPDIR archiving + was performed. + """ + + return From ae455b0114c9495907964866ce84665bfb948699 Mon Sep 17 00:00:00 2001 From: David Huber Date: Fri, 17 Jan 2025 15:03:55 -0600 Subject: [PATCH 09/83] Add more templating, begin to unify --- parm/config/gfs/config.arch | 2 +- parm/config/gfs/config.globus | 16 +++++++++ parm/globus/dm.conf.j2 | 1 + parm/globus/places.inc.j2 | 11 ++++++ parm/globus/run_doorman.sh.j2 | 68 +++++++++++++++++++++++++++++++++++ workflow/hosts/hercules.yaml | 1 + 6 files changed, 98 insertions(+), 1 deletion(-) create mode 100644 parm/globus/places.inc.j2 create mode 100644 parm/globus/run_doorman.sh.j2 diff --git a/parm/config/gfs/config.arch b/parm/config/gfs/config.arch index 1f3149f7dc..d5d6f1883d 100644 --- a/parm/config/gfs/config.arch +++ b/parm/config/gfs/config.arch @@ -14,7 +14,7 @@ export ARCH_GAUSSIAN_FHINC=${FHOUT_GFS} # If we are running globus archiving, create tarballs in a temporary location if [[ "${GLOBUSARCH}" == "YES" ]]; then - export ATARDIR="${DATAROOT}/archive_rotdir/${pslot}/${RUN}" + export ATARDIR="${DATAROOT}/archive_rotdir/${RUN}" export LOCALARCH="YES" export DATASETS_YAML="backup_tarballs.yaml" fi diff --git a/parm/config/gfs/config.globus b/parm/config/gfs/config.globus index 89f7d72493..712c5f3d95 100644 --- a/parm/config/gfs/config.globus +++ b/parm/config/gfs/config.globus @@ -11,4 +11,20 @@ echo "BEGIN: config.globus" # Set the globus staging directory populated by the arch jobs export STAGE_DIR="${DATAROOT}/archive_rotdir/${PSLOT}" +# Set variables used by the Sven and Doorman services +# Niagara's globus UUID +export SERVER_GLOBUS_UUID=1bfd8a79-52b2-4589-88b2-0648e0c0b35d +# Client address +export CLIENT_GLOBUS_UUID=@CLIENT_UUID@ + +# General delivery location on Niagara (staging area for data) +# This is tricky because user IDs don't match between Hercules and Niagara. +# This will be a user input at setup_expt runtime. +niagara_uid=@niagara_uid@ +# data_untrusted is a misnomer. This just means the data is kept for 5 days instead of 60. +export GENERAL_DELIVERY_ROOT="/collab1/data_untrusted/${niagara_uid}/GENERAL_DELIVERY" + +# Sven's dropbox +export SVEN_DROPBOX_ROOT="${DATAROOT}/archive_rotdir/${RUN}/SVEN_DROPBOX" + echo "END: config.globus" diff --git a/parm/globus/dm.conf.j2 b/parm/globus/dm.conf.j2 index 5cea0258c3..39e7ec1943 100644 --- a/parm/globus/dm.conf.j2 +++ b/parm/globus/dm.conf.j2 @@ -1,3 +1,4 @@ #!/usr/bin/env bash +# The location on the sending client (e.g. Hercules) of Sven's dropbox. export dropbox="{{sven_dropbox}}" diff --git a/parm/globus/places.inc.j2 b/parm/globus/places.inc.j2 new file mode 100644 index 0000000000..9329edecee --- /dev/null +++ b/parm/globus/places.inc.j2 @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +# This file will be pushed to Niagara and is used to initialize the Doorman server + +# This is the staging area on Niagara. +# This is where tarballs will be received and confirmations are written and sent. +export GENDEL={{general_delivery_dir}} + +# The globus UUID for the sending platform (e.g. 
Hercules) +export CLIENT_ENDPOINT={{HERC_GLC}} +# The location of the dropbox on the sending platform +export CLIENT_DROPBOX={{sven_dropbox}} diff --git a/parm/globus/run_doorman.sh.j2 b/parm/globus/run_doorman.sh.j2 new file mode 100644 index 0000000000..9afacaff80 --- /dev/null +++ b/parm/globus/run_doorman.sh.j2 @@ -0,0 +1,68 @@ +#!/usr/bin/env bash + +# This script runs on Niagara to interact with the Doorman service +{% set cycle_YMDH = current_cycle | to_YMDH %} +subdir="{{RUN}}/{{cycle_YMDH}}" + +# Make the working directory +doorman_dir="${HOME}/.doorman/${subdir}" +mkdir -p "${doorman_dir}" +cd "${doorman_dir}" +rm -f dm.conf +rm -f places.inc +rm -f FLIST + +# Tell the doorman where Sven's dropbox is on the sending client (e.g. Hercules) +echo 'export dropbox="{{sven_dropbox}}"' > dm.conf + +# Tell the doorman where the general delivery space is on Niagara (unique for each RUN/cycle) +# This is where tarballs will be received and confirmations are written and sent. +echo 'export GENDEL={{GENERAL_DELIVERY_ROOT}}/{{RUN}}/{{cycle_YMDH}}' > places.inc +# Tell the doorman what the sender's UUID is +echo 'export CLIENT_ENDPOINT={{CLIENT_GLOBUS_UUID}}' >> places.inc +# Tell the doorman where the sending client's dropbox is (why twice??) +echo 'export CLIENT_DROPBOX={{sven_dropbox}}' >> places.inc + +# Point to the doorman executable scripts +export PATH="${PATH}:{{doorman_root}}/bin" + +# Create the general delivery space if it wasn't already +initialize.sh + +# Transfer the data from the sender and execute the 'todo' script +receive.sh --go + +# If receive didn't produce an FLIST file, then something went wrong +if [[ ! -f FLIST ]]; then + echo "receive.sh failed!" + return 2 +fi + +# Parse the FLIST file created by receive.sh to get the transfer IDs +IDs="" +while IFS= read -r line; do + package_name=$(grep -o "package_location_.*\.tgz") + tmp="${package_name#package_location_}" + ID="${tmp%.tgz}" + IDs="${IDs} ${ID}" +done < FLIST + +# Sleep for a minute to allow time for all globus artifacts to resolve +sleep 1m + +# Validate and generate the acknowledgement for each transfer ID +for ID in ${IDs}; do + ack.sh "${ID}" +done + +# Send the acknowledgement back to the sender +send.sh + +stat=$? + +if [[ ${stat} -ne 0 ]]; then + echo "Failed to send status back to client!" 
+ exit 3 +fi + +exit 0 diff --git a/workflow/hosts/hercules.yaml b/workflow/hosts/hercules.yaml index a2974377dd..37c21043e1 100644 --- a/workflow/hosts/hercules.yaml +++ b/workflow/hosts/hercules.yaml @@ -32,3 +32,4 @@ COMINecmwf: /work/noaa/global/glopara/data/external_gempak/ecmwf COMINnam: /work/noaa/global/glopara/data/external_gempak/nam COMINukmet: /work/noaa/global/glopara/data/external_gempak/ukmet AERO_INPUTS_DIR: /work2/noaa/global/wkolczyn/noscrub/global-workflow/gocart_emissions +CLIENT_GLOBUS_UUID: '869912fe-f6de-46c0-af10-b22efd84a022' From e3a60b3bdef00995f6c59e4b947d2b138cffd933 Mon Sep 17 00:00:00 2001 From: David Huber Date: Tue, 21 Jan 2025 08:17:15 -0600 Subject: [PATCH 10/83] Load the globus modulefile --- modulefiles/module_base.hercules.lua | 5 ++--- modulefiles/module_base.orion.lua | 5 ++--- versions/run.hercules.ver | 1 + versions/run.orion.ver | 2 ++ 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/modulefiles/module_base.hercules.lua b/modulefiles/module_base.hercules.lua index 5e672f4a66..875e42e539 100644 --- a/modulefiles/module_base.hercules.lua +++ b/modulefiles/module_base.hercules.lua @@ -35,12 +35,11 @@ load(pathJoin("py-python-dateutil", (os.getenv("py_python_dateutil_ver") or "Non load(pathJoin("met", (os.getenv("met_ver") or "None"))) load(pathJoin("metplus", (os.getenv("metplus_ver") or "None"))) load(pathJoin("py-xarray", (os.getenv("py_xarray_ver") or "None"))) +load(pathJoin("globus-cli", (os.getenv("globus_cli_ver") or "None"))) +-- TODO remove this once we move to spack-stack 1.8.0+; as WGRIB2 will be assigned by the wgrib2 module setenv("WGRIB2","wgrib2") --- Stop gap fix for wgrib with spack-stack 1.6.0 --- TODO Remove this when spack-stack issue #1097 is resolved -setenv("WGRIB","wgrib") setenv("UTILROOT",(os.getenv("prod_util_ROOT") or "None")) prepend_path("MODULEPATH", pathJoin("/work/noaa/global/glopara/git_rocky9/prepobs/v" .. (os.getenv("prepobs_run_ver") or "None"), "modulefiles")) diff --git a/modulefiles/module_base.orion.lua b/modulefiles/module_base.orion.lua index e7b51ed563..b2d63e7160 100644 --- a/modulefiles/module_base.orion.lua +++ b/modulefiles/module_base.orion.lua @@ -34,12 +34,11 @@ load(pathJoin("py-python-dateutil", (os.getenv("py_python_dateutil_ver") or "Non load(pathJoin("met", (os.getenv("met_ver") or "None"))) load(pathJoin("metplus", (os.getenv("metplus_ver") or "None"))) load(pathJoin("py-xarray", (os.getenv("py_xarray_ver") or "None"))) +load(pathJoin("globus-cli", (os.getenv("globus_cli_ver") or "None"))) +-- TODO remove this once we move to spack-stack 1.8.0+; as WGRIB2 will be assigned by the wgrib2 module setenv("WGRIB2","wgrib2") --- Stop gap fix for wgrib with spack-stack 1.6.0 --- TODO Remove this when spack-stack issue #1097 is resolved -setenv("WGRIB","wgrib") setenv("UTILROOT",(os.getenv("prod_util_ROOT") or "None")) prepend_path("MODULEPATH", pathJoin("/work/noaa/global/glopara/git_rocky9/prepobs/v" .. 
(os.getenv("prepobs_run_ver") or "None"), "modulefiles")) diff --git a/versions/run.hercules.ver b/versions/run.hercules.ver index 2d7185d5e7..244ad51748 100644 --- a/versions/run.hercules.ver +++ b/versions/run.hercules.ver @@ -2,5 +2,6 @@ export stack_intel_ver=2021.9.0 export stack_impi_ver=2021.9.0 export intel_mkl_ver=2023.1.0 export spack_env=gsi-addon-env +export globus_cli_ver=3.27 source "${HOMEgfs:-}/versions/spack.ver" export spack_mod_path="/work/noaa/epic/role-epic/spack-stack/hercules/spack-stack-${spack_stack_ver}/envs/${spack_env}/install/modulefiles/Core" diff --git a/versions/run.orion.ver b/versions/run.orion.ver index 29e02f0873..5c357c73c0 100644 --- a/versions/run.orion.ver +++ b/versions/run.orion.ver @@ -1,5 +1,7 @@ export stack_intel_ver=2021.9.0 export stack_impi_ver=2021.9.0 +export intel_mkl_ver=2023.1.0 export spack_env=gsi-addon-env-rocky9 +export globus_cli_ver=3.27 source "${HOMEgfs:-}/versions/spack.ver" export spack_mod_path="/work/noaa/epic/role-epic/spack-stack/orion/spack-stack-${spack_stack_ver}/envs/${spack_env}/install/modulefiles/Core" From 24596e34ea0a30bfbddaae60425749b57c7129f4 Mon Sep 17 00:00:00 2001 From: David Huber Date: Tue, 21 Jan 2025 08:31:41 -0600 Subject: [PATCH 11/83] Address shellcheck issues --- parm/globus/dm.conf.j2 | 1 + parm/globus/places.inc.j2 | 6 +++--- parm/globus/return.j2 | 4 ++-- parm/globus/run_doorman.sh.j2 | 4 +++- parm/globus/todo.sh.j2 | 15 +++++++-------- parm/globus/verify.sh.j2 | 6 ++---- 6 files changed, 18 insertions(+), 18 deletions(-) diff --git a/parm/globus/dm.conf.j2 b/parm/globus/dm.conf.j2 index 39e7ec1943..0954c047d7 100644 --- a/parm/globus/dm.conf.j2 +++ b/parm/globus/dm.conf.j2 @@ -1,4 +1,5 @@ #!/usr/bin/env bash +#shellcheck disable=all # The location on the sending client (e.g. Hercules) of Sven's dropbox. export dropbox="{{sven_dropbox}}" diff --git a/parm/globus/places.inc.j2 b/parm/globus/places.inc.j2 index 9329edecee..12e5a4166f 100644 --- a/parm/globus/places.inc.j2 +++ b/parm/globus/places.inc.j2 @@ -3,9 +3,9 @@ # This is the staging area on Niagara. # This is where tarballs will be received and confirmations are written and sent. -export GENDEL={{general_delivery_dir}} +export GENDEL="{{general_delivery_dir}}" # The globus UUID for the sending platform (e.g. 
Hercules) -export CLIENT_ENDPOINT={{HERC_GLC}} +export CLIENT_ENDPOINT="{{HERC_GLC}}" # The location of the dropbox on the sending platform -export CLIENT_DROPBOX={{sven_dropbox}} +export CLIENT_DROPBOX="{{sven_dropbox}}" diff --git a/parm/globus/return.j2 b/parm/globus/return.j2 index 974f699148..c5eed26d0a 100644 --- a/parm/globus/return.j2 +++ b/parm/globus/return.j2 @@ -1,6 +1,6 @@ #!/usr/bin/env bash # Local globus endpoint UUID -SENDER_GLC="{{host_globus_uuid}}" +export SENDER_GLC="{{host_globus_uuid}}" # Local dropbox location -SENDER_DRP="{{sven_dropbox}}" +export SENDER_DRP="{{sven_dropbox}}" diff --git a/parm/globus/run_doorman.sh.j2 b/parm/globus/run_doorman.sh.j2 index 9afacaff80..c4e76c37c2 100644 --- a/parm/globus/run_doorman.sh.j2 +++ b/parm/globus/run_doorman.sh.j2 @@ -1,7 +1,9 @@ #!/usr/bin/env bash +# shellcheck disable=all +# Shellcheck won't be able to parse this file # This script runs on Niagara to interact with the Doorman service -{% set cycle_YMDH = current_cycle | to_YMDH %} +cycle_YMDH="{{cycle_YMDH}}" subdir="{{RUN}}/{{cycle_YMDH}}" # Make the working directory diff --git a/parm/globus/todo.sh.j2 b/parm/globus/todo.sh.j2 index b4be6df3a2..af802bc92b 100644 --- a/parm/globus/todo.sh.j2 +++ b/parm/globus/todo.sh.j2 @@ -1,8 +1,8 @@ #!/bin/bash # Jinja-templated variables -hpss_target_dir={{target_dir}} -globus_xfer_id={{globus_xfer_id}} -log_directory={{niagara_log_directory}} +hpss_target_dir="{{target_dir}}" +globus_xfer_id="{{globus_xfer_id}}" +log_directory="{{niagara_log_directory}}" cwd=$(pwd) file="${1}" @@ -14,15 +14,14 @@ touch "${log_file}" send_to_hpss() { # Change the group of the file to rstprod - chgrp "rstprod" "${file_full}" > "${log_file}" 2>&1 - if [[ $? != 0 ]]; then + if ! chgrp "rstprod" "${file_full}" > "${log_file}" 2>&1; then rm -f "${file_full}" echo "Failed to run chmod on ${file_full}. Deleting and exiting." >> "${log_file}" return 1 fi # Check that the MD5 checksum matches what was sent - chk=$(md5sum ${file_full}) + chk=$(md5sum "${file_full}") if [[ "${chk}" != "${globus_xfer_id} ${file_full}" ]]; then echo "MD5 checksum of ${file} does not match. Exiting." >> "${log_file}" return 2 @@ -38,6 +37,7 @@ send_to_hpss() chmod 640 "${hpss_target}" EOF hsi in "${command_file}" >> "${log_file}" 2>&1 + # shellcheck disable=SC2181 if [[ $? != 0 ]]; then echo "Failed to send ${file} to HPSS and/or protect it. Deleting and exiting." >> "${log_file}" hsi rm "${hpss_target}" @@ -48,8 +48,7 @@ EOF # Create an index file if the file is a tarball if [[ ${file} == *.tar ]]; then - htar -Xvf "${hpss_target}" >> "${log_file}" 2>&1 - if [[ $? != 0 ]]; then + if ! htar -Xvf "${hpss_target}" >> "${log_file}" 2>&1; then echo "Failed to create an index file for ${hpss_target}. Exiting." 
>> "${log_file}" return 4 fi diff --git a/parm/globus/verify.sh.j2 b/parm/globus/verify.sh.j2 index 8c7eec9a45..69b6f263a0 100644 --- a/parm/globus/verify.sh.j2 +++ b/parm/globus/verify.sh.j2 @@ -1,10 +1,8 @@ #!/bin/bash # Jinja-templated variables -log_directory={{niagara_log_directory}} -globus_xfer_id={{globus_xfer_id}} +log_directory="{{niagara_log_directory}}" +globus_xfer_id="{{globus_xfer_id}}" -cwd=$(pwd) -file="${cwd}/${1}" log_file="${log_directory}/hpss_status_${globus_xfer_id}.log" hpss_stat=$(tail -1 ${log_file}) From 92c5e276569be38054e6043bf510d94b8673817b Mon Sep 17 00:00:00 2001 From: David Huber Date: Tue, 21 Jan 2025 08:35:38 -0600 Subject: [PATCH 12/83] Add double quotes --- parm/globus/verify.sh.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parm/globus/verify.sh.j2 b/parm/globus/verify.sh.j2 index 69b6f263a0..82f64b0caf 100644 --- a/parm/globus/verify.sh.j2 +++ b/parm/globus/verify.sh.j2 @@ -5,6 +5,6 @@ globus_xfer_id="{{globus_xfer_id}}" log_file="${log_directory}/hpss_status_${globus_xfer_id}.log" -hpss_stat=$(tail -1 ${log_file}) +hpss_stat=$(tail -1 "${log_file}") echo "${hpss_stat}" From df5ef0fd0b035e912ed38fa1c329ad45e37a3aa9 Mon Sep 17 00:00:00 2001 From: David Huber Date: Thu, 23 Jan 2025 08:18:41 -0600 Subject: [PATCH 13/83] Update cleanup dependencies depending on whether globus is running --- workflow/rocoto/gefs_tasks.py | 8 ++++---- workflow/rocoto/gfs_tasks.py | 5 +++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/workflow/rocoto/gefs_tasks.py b/workflow/rocoto/gefs_tasks.py index d89b6de1d8..e53d2250b8 100644 --- a/workflow/rocoto/gefs_tasks.py +++ b/workflow/rocoto/gefs_tasks.py @@ -610,13 +610,13 @@ def globus(self): def cleanup(self): deps = [] - dep_dict = {'type': 'task', 'name': 'gefs_arch'} - deps.append(rocoto.add_dependency(dep_dict)) - dependencies = rocoto.create_dependency(dep=deps) if self.options['do_globusarch']: dep_dict = {'type': 'task', 'name': 'gefs_globus'} deps.append(rocoto.add_dependency(dep_dict)) - dependencies = rocoto.create_dependency(dep=deps, dep_condition='and') + else: + dep_dict = {'type': 'task', 'name': 'gefs_arch'} + deps.append(rocoto.add_dependency(dep_dict)) + dependencies = rocoto.create_dependency(dep=deps) resources = self.get_resource('cleanup') task_name = 'gefs_cleanup' diff --git a/workflow/rocoto/gfs_tasks.py b/workflow/rocoto/gfs_tasks.py index 3b90c48cdf..701e03f6d6 100644 --- a/workflow/rocoto/gfs_tasks.py +++ b/workflow/rocoto/gfs_tasks.py @@ -2421,11 +2421,12 @@ def cleanup(self): dep_dict = {'type': 'metatask', 'name': f'{self.run}_ens_globus'} deps.append(rocoto.add_dependency(dep_dict)) else: - dep_dict = {'type': 'task', 'name': f'{self.run}_arch'} - deps.append(rocoto.add_dependency(dep_dict)) if self.options['do_globusarch']: dep_dict = {'type': 'task', 'name': f'{self.run}_globus'} deps.append(rocoto.add_dependency(dep_dict)) + else: + dep_dict = {'type': 'task', 'name': f'{self.run}_arch'} + deps.append(rocoto.add_dependency(dep_dict)) if self.options['do_gempak']: if self.run in ['gdas']: From 30acb6bc13907f20cc991352d870fe7cb658ae46 Mon Sep 17 00:00:00 2001 From: David Huber Date: Thu, 23 Jan 2025 08:25:32 -0600 Subject: [PATCH 14/83] Add the globus task to forecast-only mode --- workflow/applications/gfs_forecast_only.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/workflow/applications/gfs_forecast_only.py b/workflow/applications/gfs_forecast_only.py index de1c8cef27..89b5604cd4 100644 --- 
a/workflow/applications/gfs_forecast_only.py +++ b/workflow/applications/gfs_forecast_only.py @@ -81,6 +81,9 @@ def _get_app_configs(self, run): 'mos_stn_prdgen', 'mos_grd_prdgen', 'mos_ext_stn_prdgen', 'mos_ext_grd_prdgen', 'mos_wx_prdgen', 'mos_wx_ext_prdgen'] + if options['do_globusarch']: + configs += ['globus'] + return configs @staticmethod @@ -162,6 +165,12 @@ def get_task_names(self): 'mos_stn_prdgen', 'mos_grd_prdgen', 'mos_ext_stn_prdgen', 'mos_ext_grd_prdgen', 'mos_wx_prdgen', 'mos_wx_ext_prdgen'] - tasks += ['arch', 'cleanup'] # arch and cleanup **must** be the last tasks + tasks += ['arch'] + + if options['do_globusarch']: + tasks += ['globus'] + + # cleanup **must** be the last task + tasks += ['cleanup'] return {f"{self.run}": tasks} From 39d73a319eac7044c3e406bc1adc37f82c533de9 Mon Sep 17 00:00:00 2001 From: David Huber Date: Thu, 23 Jan 2025 08:29:34 -0600 Subject: [PATCH 15/83] Add globus task to gfs_cycled and gefs --- workflow/applications/gefs.py | 6 +++++- workflow/applications/gfs_cycled.py | 8 ++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/workflow/applications/gefs.py b/workflow/applications/gefs.py index cf9e6ccf4c..1716b68500 100644 --- a/workflow/applications/gefs.py +++ b/workflow/applications/gefs.py @@ -96,6 +96,10 @@ def get_task_names(self): if options['do_extractvars']: tasks += ['extractvars'] - tasks += ['arch', 'cleanup'] + tasks += ['arch'] + if options['do_globusarch']: + tasks += ['globus'] + + tasks += ['cleanup'] return {f"{self.run}": tasks} diff --git a/workflow/applications/gfs_cycled.py b/workflow/applications/gfs_cycled.py index 02f206fe61..d384f1ed98 100644 --- a/workflow/applications/gfs_cycled.py +++ b/workflow/applications/gfs_cycled.py @@ -297,8 +297,12 @@ def get_task_names(self): 'mos_stn_prdgen', 'mos_grd_prdgen', 'mos_ext_stn_prdgen', 'mos_ext_grd_prdgen', 'mos_wx_prdgen', 'mos_wx_ext_prdgen'] - # Last two items - task_names[run] += ['arch', 'cleanup'] + task_names[run] += ['arch'] + if options['do_globusarch']: + task_names[run] += ['globus'] + + # Cleanup is always last + task_names[run] += ['cleanup'] # Ensemble tasks elif 'enkf' in run: From 20190a8a4af3e7d15330983622750a61f96a81a1 Mon Sep 17 00:00:00 2001 From: David Huber Date: Thu, 23 Jan 2025 08:49:04 -0600 Subject: [PATCH 16/83] Disable GLOBUSARCH by default --- parm/config/gfs/yaml/defaults.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/parm/config/gfs/yaml/defaults.yaml b/parm/config/gfs/yaml/defaults.yaml index 55f4b03f50..b63c0ea3d6 100644 --- a/parm/config/gfs/yaml/defaults.yaml +++ b/parm/config/gfs/yaml/defaults.yaml @@ -24,6 +24,7 @@ base: FHMAX_ENKF_GFS: 12 DOHYBVAR_OCN: "NO" DO_TEST_MODE: "NO" + GLOBUSARCH: "NO" atmanl: JCB_ALGO_YAML_VAR: "${PARMgfs}/gdas/atm/jcb-prototype_3dvar.yaml.j2" From 32052601e3a5a1307ea5ec6251649b6248725f8b Mon Sep 17 00:00:00 2001 From: David Huber Date: Thu, 23 Jan 2025 14:57:34 -0600 Subject: [PATCH 17/83] Improve globus job templating --- jobs/JGDAS_ENKF_GLOBUS | 4 +-- jobs/JGLOBAL_GLOBUS | 7 +++- jobs/rocoto/globus.sh | 24 +++++++++++++ parm/config/gfs/config.globus | 3 ++ parm/globus/dm.conf.j2 | 1 - parm/globus/places.inc.j2 | 4 +-- parm/globus/return.j2 | 2 +- parm/globus/run_doorman.sh.j2 | 21 +++++++----- scripts/exglobal_globus.py | 2 +- ush/python/pygfs/task/globus_hpss.py | 51 +++++++++++----------------- 10 files changed, 70 insertions(+), 49 deletions(-) mode change 100644 => 100755 jobs/JGDAS_ENKF_GLOBUS mode change 100644 => 100755 jobs/JGLOBAL_GLOBUS create mode 100755 jobs/rocoto/globus.sh 
mode change 100644 => 100755 scripts/exglobal_globus.py diff --git a/jobs/JGDAS_ENKF_GLOBUS b/jobs/JGDAS_ENKF_GLOBUS old mode 100644 new mode 100755 index f6bb40ec9e..c5b8616f63 --- a/jobs/JGDAS_ENKF_GLOBUS +++ b/jobs/JGDAS_ENKF_GLOBUS @@ -5,10 +5,10 @@ source "${HOMEgfs}/ush/jjob_header.sh" -e "globus" -c "base earc ens_group_globu ############################################################### -# Run archive script +# Run globus transfer script ############################################################### -"${SCRgfs}/exgdas_enkf_globus.sh" +"${SCRgfs}/exgdas_enkf_globus.py" status=$? [[ ${status} -ne 0 ]] && exit "${status}" diff --git a/jobs/JGLOBAL_GLOBUS b/jobs/JGLOBAL_GLOBUS old mode 100644 new mode 100755 index 2caf5df8cf..04a1b02c33 --- a/jobs/JGLOBAL_GLOBUS +++ b/jobs/JGLOBAL_GLOBUS @@ -1,8 +1,13 @@ #! /usr/bin/env bash source "${HOMEgfs}/ush/preamble.sh" -source "${HOMEgfs}/ush/jjob_header.sh" -e "arch" -c "base arch globus" +source "${HOMEgfs}/ush/jjob_header.sh" -e "globus" -c "base arch globus" +############################################## +# Set variables used in the script +############################################## +YMD=${PDY} HH=${cyc} declare_from_tmpl -rx \ + COMIN_CONF:COM_CONF_TMPL ############################################################### # Run globus script diff --git a/jobs/rocoto/globus.sh b/jobs/rocoto/globus.sh new file mode 100755 index 0000000000..bcfbc584cb --- /dev/null +++ b/jobs/rocoto/globus.sh @@ -0,0 +1,24 @@ +#! /usr/bin/env bash + +source "${HOMEgfs}/ush/preamble.sh" + +############################################################### +# Source FV3GFS workflow modules +. "${HOMEgfs}"/ush/load_fv3gfs_modules.sh +status=$? +[[ ${status} -ne 0 ]] && exit "${status}" + +############################################################### +# setup python path for workflow utilities and tasks +PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}${HOMEgfs}/ush/python" +export PYTHONPATH + +export job="globus" +export jobid="${job}.$$" + +############################################################### +# Execute the JJOB +"${HOMEgfs}"/jobs/JGLOBAL_GLOBUS +status=$? + +exit "${status}" diff --git a/parm/config/gfs/config.globus b/parm/config/gfs/config.globus index 712c5f3d95..60dc8176ff 100644 --- a/parm/config/gfs/config.globus +++ b/parm/config/gfs/config.globus @@ -27,4 +27,7 @@ export GENERAL_DELIVERY_ROOT="/collab1/data_untrusted/${niagara_uid}/GENERAL_DEL # Sven's dropbox export SVEN_DROPBOX_ROOT="${DATAROOT}/archive_rotdir/${RUN}/SVEN_DROPBOX" +# Location of the doorman package on Niagara +export DOORMAN_ROOT="/home/Georgy.Fekete/sven" + echo "END: config.globus" diff --git a/parm/globus/dm.conf.j2 b/parm/globus/dm.conf.j2 index 0954c047d7..39e7ec1943 100644 --- a/parm/globus/dm.conf.j2 +++ b/parm/globus/dm.conf.j2 @@ -1,5 +1,4 @@ #!/usr/bin/env bash -#shellcheck disable=all # The location on the sending client (e.g. Hercules) of Sven's dropbox. export dropbox="{{sven_dropbox}}" diff --git a/parm/globus/places.inc.j2 b/parm/globus/places.inc.j2 index 12e5a4166f..c4c670ba32 100644 --- a/parm/globus/places.inc.j2 +++ b/parm/globus/places.inc.j2 @@ -3,9 +3,9 @@ # This is the staging area on Niagara. # This is where tarballs will be received and confirmations are written and sent. -export GENDEL="{{general_delivery_dir}}" +export GENDEL="{{doorman_gendel}}" # The globus UUID for the sending platform (e.g. 
Hercules) -export CLIENT_ENDPOINT="{{HERC_GLC}}" +export CLIENT_ENDPOINT="{{CLIENT_GLOBUS_UUID}}" # The location of the dropbox on the sending platform export CLIENT_DROPBOX="{{sven_dropbox}}" diff --git a/parm/globus/return.j2 b/parm/globus/return.j2 index c5eed26d0a..06ed38e225 100644 --- a/parm/globus/return.j2 +++ b/parm/globus/return.j2 @@ -1,6 +1,6 @@ #!/usr/bin/env bash # Local globus endpoint UUID -export SENDER_GLC="{{host_globus_uuid}}" +export SENDER_GLC="{{CLIENT_GLOBUS_UUID}}" # Local dropbox location export SENDER_DRP="{{sven_dropbox}}" diff --git a/parm/globus/run_doorman.sh.j2 b/parm/globus/run_doorman.sh.j2 index c4e76c37c2..db4b3d9d42 100644 --- a/parm/globus/run_doorman.sh.j2 +++ b/parm/globus/run_doorman.sh.j2 @@ -1,10 +1,7 @@ #!/usr/bin/env bash -# shellcheck disable=all -# Shellcheck won't be able to parse this file # This script runs on Niagara to interact with the Doorman service -cycle_YMDH="{{cycle_YMDH}}" -subdir="{{RUN}}/{{cycle_YMDH}}" +subdir="{{PSLOT}}/{{RUN}}/{{jobid}}" # Make the working directory doorman_dir="${HOME}/.doorman/${subdir}" @@ -15,18 +12,18 @@ rm -f places.inc rm -f FLIST # Tell the doorman where Sven's dropbox is on the sending client (e.g. Hercules) -echo 'export dropbox="{{sven_dropbox}}"' > dm.conf +echo 'export dropbox="{{sven_dropbox}}' > dm.conf # Tell the doorman where the general delivery space is on Niagara (unique for each RUN/cycle) # This is where tarballs will be received and confirmations are written and sent. -echo 'export GENDEL={{GENERAL_DELIVERY_ROOT}}/{{RUN}}/{{cycle_YMDH}}' > places.inc +echo 'export GENDEL="{{doorman_gendel}}"' > places.inc # Tell the doorman what the sender's UUID is -echo 'export CLIENT_ENDPOINT={{CLIENT_GLOBUS_UUID}}' >> places.inc +echo 'export CLIENT_ENDPOINT="{{CLIENT_GLOBUS_UUID}}"' >> places.inc # Tell the doorman where the sending client's dropbox is (why twice??) 
-echo 'export CLIENT_DROPBOX={{sven_dropbox}}' >> places.inc +echo 'export CLIENT_DROPBOX="{{sven_dropbox}}"' >> places.inc # Point to the doorman executable scripts -export PATH="${PATH}:{{doorman_root}}/bin" +export PATH="${PATH}:{{DOORMAN_ROOT}}/bin" # Create the general delivery space if it wasn't already initialize.sh @@ -67,4 +64,10 @@ if [[ ${stat} -ne 0 ]]; then exit 3 fi +# Remove the working directory +if [[ "{{KEEPDATA}}" == "NO" ]]; then + cd "${HOME}" + rm -rf "${doorman_dir}" +fi + exit 0 diff --git a/scripts/exglobal_globus.py b/scripts/exglobal_globus.py old mode 100644 new mode 100755 index 3fbb890aa4..5b16ad4d2a --- a/scripts/exglobal_globus.py +++ b/scripts/exglobal_globus.py @@ -18,7 +18,7 @@ def main(): globus = GlobusHpss(config) keys = ['STAGE_DIR', 'current_cycle', 'RUN', 'PDY', 'NMEM_ENS', 'HOMEgfs', 'sven_dir', - 'DATASETS_YAML'] + 'DATASETS_YAML', 'PARMgfs', 'COMIN_CONF'] globus_dict = AttrDict() for key in keys: diff --git a/ush/python/pygfs/task/globus_hpss.py b/ush/python/pygfs/task/globus_hpss.py index 5d42c70662..42768a544a 100644 --- a/ush/python/pygfs/task/globus_hpss.py +++ b/ush/python/pygfs/task/globus_hpss.py @@ -4,8 +4,7 @@ from logging import getLogger from typing import Any, Dict, List -from wxflow import (AttrDict, Task, - logit, parse_yaml) +from wxflow import (AttrDict, Task, to_YMD, strftime, logit, parse_yaml) logger = getLogger(__name__.split('.')[-1]) @@ -29,7 +28,18 @@ def __init__(self, config: Dict[str, Any]) -> None: """ super().__init__(config) - self.task_config = AttrDict(**self.task_config) + # Declare these here so the jinja-templated scripts can be shellchecked + cycle_YMD = to_YMD(self.task_config.current_cycle), + cycle_HH = strftime(self.task_config.current_cycle, '%H') + + local_dict = AttrDict({ + 'sven_dropbox': (f"{self.task_config.SVEN_DROPBOX_ROOT}/" + f"{self.task_config.PSLOT}/{self.task_config.RUN}.{cycle_YMD}/{cycle_HH}"), + 'doorman_gendel': (f"{self.task_config.GENERAL_DELIVERY_ROOT}/" + f"{self.task_config.PSLOT}/{self.task_config.RUN}.{cycle_YMD}/{cycle_HH}") + }) + + self.task_config = AttrDict(**self.task_config, **local_dict) @logit(logger) def configure(self, globus_dict: Dict[str, Any]) -> (Dict[str, Any], List[Dict[str, Any]]): @@ -48,6 +58,7 @@ def configure(self, globus_dict: Dict[str, Any]) -> (Dict[str, Any], List[Dict[s """ globus_parm = os.path.join(globus_dict.PARMgfs, "globus") + print(globus_parm) com_conf = globus_dict.COMIN_CONF @@ -68,49 +79,25 @@ def configure(self, globus_dict: Dict[str, Any]) -> (Dict[str, Any], List[Dict[s return globus_instructions @logit(logger) - def execute_transfer_data(self, atardir_set: Dict[str, Any]) -> None: - """Create a backup tarball from a yaml dict. + def execute_transfer_data(self, tarball_set: Dict[str, Any]) -> None: + """Interface function with Sven to send tarballs to HPSS via Niagara. Parameters ---------- - atardir_set: Dict[str, Any] - Dict defining set of files to backup and the target tarball. + tarball_set: Dict[str, Any] + Set of tarballs and properties to applicable to their transfer. 
Return ------ None """ - if atardir_set.has_rstprod: - - try: - self.cvf(atardir_set.target, atardir_set.fileset) - # Regardless of exception type, attempt to remove the target - except Exception: - self.rm_cmd(atardir_set.target) - raise RuntimeError(f"FATAL ERROR: Failed to create restricted archive {atardir_set.target}, deleting!") - - self._protect_rstprod(atardir_set) - - else: - self.cvf(atardir_set.target, atardir_set.fileset) - - @logit(logger) - def _protect_rstprod(self, atardir_set: Dict[str, Any]) -> None: - """ - Changes the group of the target tarball to rstprod and the permissions to - 640. If this fails for any reason, attempt to delete the file before exiting. - - """ - - pass + pass @logit(logger) def clean(self): """ Remove the temporary directories/files created by the GlobusHpss task. - Presently, this is only the ROTDIR/expdir directory if EXPDIR archiving - was performed. """ return From 3078ace8536c9b1320d9262cab30208a4ace807a Mon Sep 17 00:00:00 2001 From: David Huber Date: Fri, 24 Jan 2025 09:15:29 -0600 Subject: [PATCH 18/83] Address shellcheck issues --- parm/globus/run_doorman.sh.j2 | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/parm/globus/run_doorman.sh.j2 b/parm/globus/run_doorman.sh.j2 index db4b3d9d42..41e09c42b7 100644 --- a/parm/globus/run_doorman.sh.j2 +++ b/parm/globus/run_doorman.sh.j2 @@ -1,4 +1,5 @@ #!/usr/bin/env bash +set -eu # This script runs on Niagara to interact with the Doorman service subdir="{{PSLOT}}/{{RUN}}/{{jobid}}" @@ -40,7 +41,7 @@ fi # Parse the FLIST file created by receive.sh to get the transfer IDs IDs="" while IFS= read -r line; do - package_name=$(grep -o "package_location_.*\.tgz") + package_name=$(echo "${line}" | grep -o "package_location_.*\.tgz") tmp="${package_name#package_location_}" ID="${tmp%.tgz}" IDs="${IDs} ${ID}" @@ -55,6 +56,7 @@ for ID in ${IDs}; do done # Send the acknowledgement back to the sender +set +e send.sh stat=$? @@ -63,10 +65,12 @@ if [[ ${stat} -ne 0 ]]; then echo "Failed to send status back to client!" exit 3 fi +set -e # Remove the working directory +#shellcheck ignore=SC2050 if [[ "{{KEEPDATA}}" == "NO" ]]; then - cd "${HOME}" + cd "${HOME}" || echo "Failed to navigate to ${HOME}!" && exit 4 rm -rf "${doorman_dir}" fi From 77f88800f93091273577a21fdcfa8bb0cedd09c6 Mon Sep 17 00:00:00 2001 From: David Huber Date: Fri, 24 Jan 2025 09:17:22 -0600 Subject: [PATCH 19/83] Switch ignore for disable --- parm/globus/run_doorman.sh.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parm/globus/run_doorman.sh.j2 b/parm/globus/run_doorman.sh.j2 index 41e09c42b7..6d6332605b 100644 --- a/parm/globus/run_doorman.sh.j2 +++ b/parm/globus/run_doorman.sh.j2 @@ -68,7 +68,7 @@ fi set -e # Remove the working directory -#shellcheck ignore=SC2050 +#shellcheck disable=SC2050 if [[ "{{KEEPDATA}}" == "NO" ]]; then cd "${HOME}" || echo "Failed to navigate to ${HOME}!" 
&& exit 4 rm -rf "${doorman_dir}" From e3f6a50090db87314d078281b4021a41959f742b Mon Sep 17 00:00:00 2001 From: David Huber Date: Fri, 24 Jan 2025 12:41:32 -0600 Subject: [PATCH 20/83] Remove unused scripts --- parm/globus/dm.conf.j2 | 4 ---- parm/globus/doorman.sh.j2 | 3 --- parm/globus/places.inc.j2 | 11 ----------- 3 files changed, 18 deletions(-) delete mode 100644 parm/globus/dm.conf.j2 delete mode 100644 parm/globus/doorman.sh.j2 delete mode 100644 parm/globus/places.inc.j2 diff --git a/parm/globus/dm.conf.j2 b/parm/globus/dm.conf.j2 deleted file mode 100644 index 39e7ec1943..0000000000 --- a/parm/globus/dm.conf.j2 +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env bash - -# The location on the sending client (e.g. Hercules) of Sven's dropbox. -export dropbox="{{sven_dropbox}}" diff --git a/parm/globus/doorman.sh.j2 b/parm/globus/doorman.sh.j2 deleted file mode 100644 index 325176104f..0000000000 --- a/parm/globus/doorman.sh.j2 +++ /dev/null @@ -1,3 +0,0 @@ -#!/usr/bin/env bash - -# This script executes on Niagara to pull the read the todo, verify, return diff --git a/parm/globus/places.inc.j2 b/parm/globus/places.inc.j2 deleted file mode 100644 index c4c670ba32..0000000000 --- a/parm/globus/places.inc.j2 +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/env bash -# This file will be pushed to Niagara and is used to initialize the Doorman server - -# This is the staging area on Niagara. -# This is where tarballs will be received and confirmations are written and sent. -export GENDEL="{{doorman_gendel}}" - -# The globus UUID for the sending platform (e.g. Hercules) -export CLIENT_ENDPOINT="{{CLIENT_GLOBUS_UUID}}" -# The location of the dropbox on the sending platform -export CLIENT_DROPBOX="{{sven_dropbox}}" From 638724da48956c930fad39f8af02e5238400ad39 Mon Sep 17 00:00:00 2001 From: David Huber Date: Fri, 24 Jan 2025 12:47:53 -0600 Subject: [PATCH 21/83] Let GENDEL be determined on Niagara --- parm/config/gfs/config.globus | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/parm/config/gfs/config.globus b/parm/config/gfs/config.globus index 60dc8176ff..1c7b85b4a2 100644 --- a/parm/config/gfs/config.globus +++ b/parm/config/gfs/config.globus @@ -18,11 +18,8 @@ export SERVER_GLOBUS_UUID=1bfd8a79-52b2-4589-88b2-0648e0c0b35d export CLIENT_GLOBUS_UUID=@CLIENT_UUID@ # General delivery location on Niagara (staging area for data) -# This is tricky because user IDs don't match between Hercules and Niagara. -# This will be a user input at setup_expt runtime. -niagara_uid=@niagara_uid@ -# data_untrusted is a misnomer. This just means the data is kept for 5 days instead of 60. -export GENERAL_DELIVERY_ROOT="/collab1/data_untrusted/${niagara_uid}/GENERAL_DELIVERY" +export GENERAL_DELIVERY_ROOT='/collab1/data_untrusted/${LOGNAME}/GENERAL_DELIVERY' +# Side note: data_untrusted is a misnomer. This just means the data is kept for 5 days instead of 60. 
# Sven's dropbox export SVEN_DROPBOX_ROOT="${DATAROOT}/archive_rotdir/${RUN}/SVEN_DROPBOX" From 169feab8177a99aabf1ce43efcd21e37b0323f15 Mon Sep 17 00:00:00 2001 From: David Huber Date: Fri, 24 Jan 2025 12:48:16 -0600 Subject: [PATCH 22/83] Reorganize --- parm/globus/run_doorman.sh.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parm/globus/run_doorman.sh.j2 b/parm/globus/run_doorman.sh.j2 index 6d6332605b..cfee6dd173 100644 --- a/parm/globus/run_doorman.sh.j2 +++ b/parm/globus/run_doorman.sh.j2 @@ -2,9 +2,9 @@ set -eu # This script runs on Niagara to interact with the Doorman service -subdir="{{PSLOT}}/{{RUN}}/{{jobid}}" # Make the working directory +subdir="{{PSLOT}}/{{RUN}}/{{jobid}}" doorman_dir="${HOME}/.doorman/${subdir}" mkdir -p "${doorman_dir}" cd "${doorman_dir}" From 45cca4859f4b446ba23f662fe440228cc14c8622 Mon Sep 17 00:00:00 2001 From: David Huber Date: Fri, 24 Jan 2025 12:49:07 -0600 Subject: [PATCH 23/83] Let todo script determine if rstprod should be applied --- parm/globus/todo.sh.j2 | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/parm/globus/todo.sh.j2 b/parm/globus/todo.sh.j2 index af802bc92b..3c5de47386 100644 --- a/parm/globus/todo.sh.j2 +++ b/parm/globus/todo.sh.j2 @@ -10,14 +10,17 @@ file_full="${cwd}/${1}" mkdir -p "${log_directory}" log_file="${log_directory}/hpss_status_${globus_xfer_id}.log" touch "${log_file}" +has_rstprod="{{has_rstprod}}" send_to_hpss() { - # Change the group of the file to rstprod - if ! chgrp "rstprod" "${file_full}" > "${log_file}" 2>&1; then - rm -f "${file_full}" - echo "Failed to run chmod on ${file_full}. Deleting and exiting." >> "${log_file}" - return 1 + if [[ "${has_rstprod}" == "true" ]]; then + # Change the group of the file to rstprod + if ! chgrp "rstprod" "${file_full}" > "${log_file}" 2>&1; then + rm -f "${file_full}" + echo "Failed to run chmod on ${file_full}. Deleting and exiting." >> "${log_file}" + return 1 + fi fi # Check that the MD5 checksum matches what was sent @@ -30,17 +33,20 @@ send_to_hpss() # Write a command file to place the file on hpss and protect it local hpss_target="${hpss_target_dir}/${file}" local command_file="command_file_${globus_xfer_id}" - cat >"${command_file}" << EOF - mkdir -p "${hpss_dir}" - put "${file_full}" : "${hpss_target}" - chgrp rstprod "${hpss_target}" - chmod 640 "${hpss_target}" -EOF + echo "mkdir -p ${hpss_dir}" >> "${command_file}" + echo "put ${file_full} : ${hpss_target}" >> "${command_file}" + if [[ "${has_rstprod}" == "true" ]]; then + echo "chgrp rstprod ${hpss_target}" >> "${command_file}" + echo "chmod 640 ${hpss_target}" >> "${command_file}" + fi hsi in "${command_file}" >> "${log_file}" 2>&1 # shellcheck disable=SC2181 if [[ $? != 0 ]]; then - echo "Failed to send ${file} to HPSS and/or protect it. Deleting and exiting." >> "${log_file}" - hsi rm "${hpss_target}" + echo "Failed to send ${file} to HPSS and/or protect it." 
>> "${log_file}" + if [[ "${has_rstprod}" == "true" ]]; then + echo "Deleting from hpss" >> "${log_file}" + hsi rm "${hpss_target}" + fi return 3 fi From e270fa4b5a4b4a1573ab4f1cf5402bf303ff6a99 Mon Sep 17 00:00:00 2001 From: David Huber Date: Tue, 28 Jan 2025 10:40:00 -0600 Subject: [PATCH 24/83] Split todo into standard and rstprod versions --- parm/globus/todo.sh.j2 | 37 ++++++------------ parm/globus/todo_rstprod.sh.j2 | 71 ++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 25 deletions(-) create mode 100644 parm/globus/todo_rstprod.sh.j2 diff --git a/parm/globus/todo.sh.j2 b/parm/globus/todo.sh.j2 index 3c5de47386..2322ca1500 100644 --- a/parm/globus/todo.sh.j2 +++ b/parm/globus/todo.sh.j2 @@ -1,33 +1,27 @@ #!/bin/bash # Jinja-templated variables -hpss_target_dir="{{target_dir}}" -globus_xfer_id="{{globus_xfer_id}}" +hpss_target_dir="{{hpss_target_dir}}" log_directory="{{niagara_log_directory}}" cwd=$(pwd) file="${1}" -file_full="${cwd}/${1}" +# In lieu of an actual ID, construct from the filename +# TODO when the doorman can provide an xfer ID, use that instead +globus_xfer_id="${file}" + +file_full="${cwd}/${file}" mkdir -p "${log_directory}" + log_file="${log_directory}/hpss_status_${globus_xfer_id}.log" touch "${log_file}" -has_rstprod="{{has_rstprod}}" send_to_hpss() { - if [[ "${has_rstprod}" == "true" ]]; then - # Change the group of the file to rstprod - if ! chgrp "rstprod" "${file_full}" > "${log_file}" 2>&1; then - rm -f "${file_full}" - echo "Failed to run chmod on ${file_full}. Deleting and exiting." >> "${log_file}" - return 1 - fi - fi - # Check that the MD5 checksum matches what was sent chk=$(md5sum "${file_full}") if [[ "${chk}" != "${globus_xfer_id} ${file_full}" ]]; then echo "MD5 checksum of ${file} does not match. Exiting." >> "${log_file}" - return 2 + return 1 fi # Write a command file to place the file on hpss and protect it @@ -35,19 +29,12 @@ send_to_hpss() local command_file="command_file_${globus_xfer_id}" echo "mkdir -p ${hpss_dir}" >> "${command_file}" echo "put ${file_full} : ${hpss_target}" >> "${command_file}" - if [[ "${has_rstprod}" == "true" ]]; then - echo "chgrp rstprod ${hpss_target}" >> "${command_file}" - echo "chmod 640 ${hpss_target}" >> "${command_file}" - fi + hsi in "${command_file}" >> "${log_file}" 2>&1 # shellcheck disable=SC2181 if [[ $? != 0 ]]; then - echo "Failed to send ${file} to HPSS and/or protect it." >> "${log_file}" - if [[ "${has_rstprod}" == "true" ]]; then - echo "Deleting from hpss" >> "${log_file}" - hsi rm "${hpss_target}" - fi - return 3 + echo "Failed to send ${file} to HPSS." >> "${log_file}" + return 2 fi rm -f "${command_file}" @@ -56,7 +43,7 @@ send_to_hpss() if [[ ${file} == *.tar ]]; then if ! htar -Xvf "${hpss_target}" >> "${log_file}" 2>&1; then echo "Failed to create an index file for ${hpss_target}. Exiting." 
>> "${log_file}" - return 4 + return 3 fi fi diff --git a/parm/globus/todo_rstprod.sh.j2 b/parm/globus/todo_rstprod.sh.j2 new file mode 100644 index 0000000000..30a245768b --- /dev/null +++ b/parm/globus/todo_rstprod.sh.j2 @@ -0,0 +1,71 @@ +#!/bin/bash +# Jinja-templated variables +hpss_target_dir="{{hpss_target_dir}}" +log_directory="{{niagara_log_directory}}" + +cwd=$(pwd) +file="${1}" +# In lieu of an actual ID, construct from the filename +# TODO when the doorman can provide an xfer ID, use that instead +globus_xfer_id="${file}" + +file_full="${cwd}/${file}" +mkdir -p "${log_directory}" + +log_file="${log_directory}/hpss_status_${globus_xfer_id}.log" +touch "${log_file}" + +send_to_hpss() +{ + # Change the group of the file to rstprod + if ! chgrp "rstprod" "${file_full}" > "${log_file}" 2>&1; then + rm -f "${file_full}" + echo "Failed to run chmod on ${file_full}. Deleting and exiting." >> "${log_file}" + return 1 + fi + + # Check that the MD5 checksum matches what was sent + chk=$(md5sum "${file_full}") + if [[ "${chk}" != "${globus_xfer_id} ${file_full}" ]]; then + echo "MD5 checksum of ${file} does not match. Exiting." >> "${log_file}" + return 2 + fi + + # Write a command file to place the file on hpss and protect it + local hpss_target="${hpss_target_dir}/${file}" + local command_file="command_file_${globus_xfer_id}" + echo "mkdir -p ${hpss_dir}" >> "${command_file}" + echo "put ${file_full} : ${hpss_target}" >> "${command_file}" + echo "chgrp rstprod ${hpss_target}" >> "${command_file}" + echo "chmod 640 ${hpss_target}" >> "${command_file}" + + hsi in "${command_file}" >> "${log_file}" 2>&1 + # shellcheck disable=SC2181 + if [[ $? != 0 ]]; then + echo "Failed to send ${file} to HPSS and/or protect it." >> "${log_file}" + echo "Deleting from hpss. Please verify it was deleted!!" >> "${log_file}" + hsi rm "${hpss_target}" + return 3 + fi + + rm -f "${command_file}" + + # Create an index file if the file is a tarball + if [[ ${file} == *.tar ]]; then + if ! htar -Xvf "${hpss_target}" >> "${log_file}" 2>&1; then + echo "Failed to create an index file for ${hpss_target}. Exiting." >> "${log_file}" + return 4 + fi + fi + + return 0 +} + +send_to_hpss +hpss_stat=$? + +if [[ ${hpss_stat} -eq 0 ]]; then + echo "SUCCESS" >> "${log_file}" +else + echo "FAILURE" >> "${log_file}" +fi From 3dc7377f3748f68a8f31a9af07821d2a19251da6 Mon Sep 17 00:00:00 2001 From: David Huber Date: Tue, 28 Jan 2025 10:40:54 -0600 Subject: [PATCH 25/83] Simplify Jinja templating of globus scripts --- parm/globus/run_doorman.sh.j2 | 8 ++------ parm/globus/verify.sh.j2 | 4 +++- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/parm/globus/run_doorman.sh.j2 b/parm/globus/run_doorman.sh.j2 index cfee6dd173..066aa0f379 100644 --- a/parm/globus/run_doorman.sh.j2 +++ b/parm/globus/run_doorman.sh.j2 @@ -1,20 +1,16 @@ #!/usr/bin/env bash -set -eu +set -eux # This script runs on Niagara to interact with the Doorman service # Make the working directory -subdir="{{PSLOT}}/{{RUN}}/{{jobid}}" -doorman_dir="${HOME}/.doorman/${subdir}" +doorman_dir='{{homedir}}/doorman_wd' mkdir -p "${doorman_dir}" cd "${doorman_dir}" rm -f dm.conf rm -f places.inc rm -f FLIST -# Tell the doorman where Sven's dropbox is on the sending client (e.g. Hercules) -echo 'export dropbox="{{sven_dropbox}}' > dm.conf - # Tell the doorman where the general delivery space is on Niagara (unique for each RUN/cycle) # This is where tarballs will be received and confirmations are written and sent. 
echo 'export GENDEL="{{doorman_gendel}}"' > places.inc diff --git a/parm/globus/verify.sh.j2 b/parm/globus/verify.sh.j2 index 82f64b0caf..7ea37e209f 100644 --- a/parm/globus/verify.sh.j2 +++ b/parm/globus/verify.sh.j2 @@ -1,7 +1,9 @@ #!/bin/bash # Jinja-templated variables log_directory="{{niagara_log_directory}}" -globus_xfer_id="{{globus_xfer_id}}" +# In lieu of an actual globus xfer ID, use the filename +# TODO when the Doorman is capable of providing the xfer ID, use that instead +globus_xfer_id="{{target_filename}}" log_file="${log_directory}/hpss_status_${globus_xfer_id}.log" From 49be54b7c11f7e92d03dc0c8fe984d7564734b93 Mon Sep 17 00:00:00 2001 From: David Huber Date: Tue, 28 Jan 2025 10:41:33 -0600 Subject: [PATCH 26/83] Add information on Niagara --- parm/config/gfs/config.globus | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/parm/config/gfs/config.globus b/parm/config/gfs/config.globus index 1c7b85b4a2..92e7b0cb46 100644 --- a/parm/config/gfs/config.globus +++ b/parm/config/gfs/config.globus @@ -15,11 +15,10 @@ export STAGE_DIR="${DATAROOT}/archive_rotdir/${PSLOT}" # Niagara's globus UUID export SERVER_GLOBUS_UUID=1bfd8a79-52b2-4589-88b2-0648e0c0b35d # Client address -export CLIENT_GLOBUS_UUID=@CLIENT_UUID@ +export CLIENT_GLOBUS_UUID=@CLIENT_GLOBUS_UUID@ # General delivery location on Niagara (staging area for data) -export GENERAL_DELIVERY_ROOT='/collab1/data_untrusted/${LOGNAME}/GENERAL_DELIVERY' -# Side note: data_untrusted is a misnomer. This just means the data is kept for 5 days instead of 60. +export SERVER_HOME='/collab1/data/{{LOGNAME}}' # Sven's dropbox export SVEN_DROPBOX_ROOT="${DATAROOT}/archive_rotdir/${RUN}/SVEN_DROPBOX" @@ -27,4 +26,7 @@ export SVEN_DROPBOX_ROOT="${DATAROOT}/archive_rotdir/${RUN}/SVEN_DROPBOX" # Location of the doorman package on Niagara export DOORMAN_ROOT="/home/Georgy.Fekete/sven" +# Server name (should match ~/.ssh/config) +export SERVER_NAME="niagara" + echo "END: config.globus" From 7cde813105813bab71ff35af3a8dbd8c7c64e77c Mon Sep 17 00:00:00 2001 From: David Huber Date: Tue, 28 Jan 2025 10:42:26 -0600 Subject: [PATCH 27/83] Add path to sven --- modulefiles/module_base.hercules.lua | 4 +++- versions/run.hercules.ver | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/modulefiles/module_base.hercules.lua b/modulefiles/module_base.hercules.lua index 875e42e539..2137f1cbeb 100644 --- a/modulefiles/module_base.hercules.lua +++ b/modulefiles/module_base.hercules.lua @@ -35,13 +35,15 @@ load(pathJoin("py-python-dateutil", (os.getenv("py_python_dateutil_ver") or "Non load(pathJoin("met", (os.getenv("met_ver") or "None"))) load(pathJoin("metplus", (os.getenv("metplus_ver") or "None"))) load(pathJoin("py-xarray", (os.getenv("py_xarray_ver") or "None"))) -load(pathJoin("globus-cli", (os.getenv("globus_cli_ver") or "None"))) -- TODO remove this once we move to spack-stack 1.8.0+; as WGRIB2 will be assigned by the wgrib2 module setenv("WGRIB2","wgrib2") setenv("UTILROOT",(os.getenv("prod_util_ROOT") or "None")) +-- Set the path for the Sven executables +append_path("PATH", pathJoin((os.getenv("sven_root_path") or "None"), "bin")) + prepend_path("MODULEPATH", pathJoin("/work/noaa/global/glopara/git_rocky9/prepobs/v" .. 
(os.getenv("prepobs_run_ver") or "None"), "modulefiles")) load(pathJoin("prepobs", (os.getenv("prepobs_run_ver") or "None"))) diff --git a/versions/run.hercules.ver b/versions/run.hercules.ver index 244ad51748..b3e2d747ba 100644 --- a/versions/run.hercules.ver +++ b/versions/run.hercules.ver @@ -5,3 +5,4 @@ export spack_env=gsi-addon-env export globus_cli_ver=3.27 source "${HOMEgfs:-}/versions/spack.ver" export spack_mod_path="/work/noaa/epic/role-epic/spack-stack/hercules/spack-stack-${spack_stack_ver}/envs/${spack_env}/install/modulefiles/Core" +export sven_root_path="/home/gfekete/sven" From aabbb2053e5f340700bf668b9c148dc7735654cc Mon Sep 17 00:00:00 2001 From: David Huber Date: Tue, 28 Jan 2025 10:43:35 -0600 Subject: [PATCH 28/83] Flesh out globus_hpss.py --- ush/python/pygfs/task/globus_hpss.py | 205 ++++++++++++++++++++++++--- 1 file changed, 184 insertions(+), 21 deletions(-) diff --git a/ush/python/pygfs/task/globus_hpss.py b/ush/python/pygfs/task/globus_hpss.py index 42768a544a..58ce6f1deb 100644 --- a/ush/python/pygfs/task/globus_hpss.py +++ b/ush/python/pygfs/task/globus_hpss.py @@ -4,13 +4,16 @@ from logging import getLogger from typing import Any, Dict, List -from wxflow import (AttrDict, Task, to_YMD, strftime, logit, parse_yaml) +from wxflow import AttrDict, Task, to_YMD, to_YMDH, strftime, logit, parse_yaml, Jinja, which, ProcessError logger = getLogger(__name__.split('.')[-1]) class GlobusHpss(Task): """Task to send tarballs (created by the archive task) to HPSS via Globus + NOTE: For this to work, an entry in ~/.ssh/config titled "niagara" must + be present. If it is not, then see the wiki on how to set it up. + TODO: Add link to the wiki. """ @logit(logger, name="GlobusHpss") @@ -30,21 +33,81 @@ def __init__(self, config: Dict[str, Any]) -> None: # Declare these here so the jinja-templated scripts can be shellchecked cycle_YMD = to_YMD(self.task_config.current_cycle), + cycle_YMDH = to_YMDH(self.task_config.current_cycle), cycle_HH = strftime(self.task_config.current_cycle, '%H') + # Instantiate all of the executables we will need to run + self.forsven = which("forsven") + self.scp = which("scp") + self.ssh = which("ssh") + + if self.forsven is None: + raise FileNotFoundError("FATAL ERROR Could not find the forsven executable!") + if self.scp is None: + raise FileNotFoundError("FATAL ERROR Could not find scp!") + if self.ssh is None: + raise FileNotFoundError("FATAL ERROR Could not find ssh!") + + # Get the user's server username from their ~/.ssh/config file + + if self.ssh is None: + raise FileNotFoundError("FATAL ERROR Failed to locate ssh!") + + server_name = self.task_config.SERVER_NAME + + try: + ssh_output = self.ssh("-G", "{server_name}", output=str) + except ProcessError as pe: + raise ProcessError("FATAL ERROR No host information on niagara!\n" + f"Please add an entry for {server_name} into ~/.ssh/config!") from pe + + # Parse the ssh output to find the user's Niagara username + ssh_output = ssh_output.split("\n") + for line in ssh_output: + if line.startswith("user "): + server_username = line.split()[1] + + # Update the home directory on the server with the username + server_home = self.task_config.SERVER_HOME.replace( + "{{LOGNAME}}", server_username + ) + local_dict = AttrDict({ 'sven_dropbox': (f"{self.task_config.SVEN_DROPBOX_ROOT}/" f"{self.task_config.PSLOT}/{self.task_config.RUN}.{cycle_YMD}/{cycle_HH}"), - 'doorman_gendel': (f"{self.task_config.GENERAL_DELIVERY_ROOT}/" - f"{self.task_config.PSLOT}/{self.task_config.RUN}.{cycle_YMD}/{cycle_HH}") + 
'doorman_gendel': (f"{server_home}/GENERAL_DELIVERY/" + f"{self.task_config.PSLOT}/{self.task_config.RUN}.{cycle_YMD}/{cycle_HH}"), + 'hpss_target_dir': f"{self.task_config.ATARDIR}/{cycle_YMDH}", + 'server_home': server_home }) self.task_config = AttrDict(**self.task_config, **local_dict) @logit(logger) def configure(self, globus_dict: Dict[str, Any]) -> (Dict[str, Any], List[Dict[str, Any]]): - """Collects the list of tarballs created by the arch task and writes - instructions to send them to HPSS via Globus. + """Collects the list of tarballs created by the arch task and writes instructions to + send them to HPSS via Globus and verify success. + + There are two services running that handle passing and running scripts. + On the client (e.g. Hercules), there is Sven. On the server (i.e. Niagara), there is + the Doorman. Sven packages up the file list and scripts that need to run on the server + and the Doorman executes the scripts on each of the files. The six files involved are + + dm.conf - One line indicating the location of the the scripts on the client. + location - The locations of files on the client to send to the server. + todo - A bash script that executes on each file once they are transferred to the server. + For our purposes, this is mainly pushing to HPSS and writing a log file with + either "SUCCESS" or "FAILURE" as the last line. + verify - A bash script that reads the log file to verify success. + return - Where to send the output of verify (globus address and folder location). + run_doorman.sh - A bash script to actually run the server-side service. This will be + automated by GDIT at some point, but for now must be executed on the + client via a pseudo terminal (ssh -t). + + The configuration method separates the file list into rstprod and non-rstprod (standard) + files, then constructs the dm.conf, todo, verify, return, and run_doorman.sh scripts in + memory from Jinja templates. The todo is different for rstprod so the data can be + protected. Parameters ---------- @@ -53,46 +116,146 @@ def configure(self, globus_dict: Dict[str, Any]) -> (Dict[str, Any], List[Dict[s Return ------ - globus_targets : List[Dict[str, Any]] - List of tarballs and instructions for sending them to HPSS via Globus + transfer_sets : Dict[str, Any] + Sets of tarballs and instructions for sending them to HPSS via Globus """ globus_parm = os.path.join(globus_dict.PARMgfs, "globus") - print(globus_parm) com_conf = globus_dict.COMIN_CONF # Collect the files and properties from the input YAML backup_yaml = os.path.join(com_conf, globus_dict.DATASETS_YAML) - backup_set = AttrDict(**parse_yaml(backup_yaml)) - - globus_instructions = [] - for name in backup_set.values(): + # Parse the list of tarballs to archive + if os.path.isfile(backup_yaml): + backup_set = AttrDict(**parse_yaml(backup_yaml)) + else: + raise FileNotFoundError("Backup tarball YAML is missing! 
({backup_yaml})") - tarball = backup_set[name].target - if backup_set[name].has_rstprod: - globus_instructions.append(self._sven_rstprod_instructions(tarball)) + # Create a standard and rstprod backup sets for any restricted tarballs + standard_backup_set = [] + rstprod_backup_set = [] + for archive_name in backup_set: + if backup_set[archive_name]["has_rstprod"]: + rstprod_backup_set.append(backup_set[archive_name]['target']) else: - globus_instructions.append(self._sven_instructions(tarball)) + standard_backup_set.append(backup_set[archive_name]['target']) + + # Start parsing scripts and storing in the output dictionary + transfer_sets = { + "standard": {"locations": backup_set}, + "rstprod": {"locations": rstprod_backup_set} + } + + # Parse the doorman setup script + doorman_jinja = os.path.join(globus_parm, "run_doorman.sh.j2") + doorman_script = Jinja(doorman_jinja, data=globus_dict, allow_missing=False).render() + + # Write a script with the location of the dropbox on the client + dm_conf = f'export dropbox="{globus_dict.sven_dropbox}' - return globus_instructions + # Parse the return script + return_jinja = os.path.join(globus_parm, "return.sh.j2") + return_script = Jinja(return_jinja, data=globus_dict, allow_missing=False).render() + + # Create a todo script for rstprod and non-rstprod tarballs + todo_jinja = os.path.join(globus_parm, "todo.sh.j2") + todo_script = Jinja(todo_jinja, data=globus_dict, allow_missing=False).render() + transfer_sets["standard"]["todo"] = todo_script + + rstprod_todo_jinja = os.path.join(globus_parm, "rstprod_todo.sh.j2") + rstprod_todo_script = Jinja(rstprod_todo_jinja, data=globus_dict, allow_missing=False).render() + transfer_sets["rstprod"]["todo"] = rstprod_todo_script + + # Create a common verify script for all tarballs + vrfy_jinja = os.path.join(globus_parm, "verify.sh.j2") + vrfy_script = Jinja(vrfy_jinja, data=globus_dict, allow_missing=False).render() + + # Add common scripts to both standard and rstprod + for transfer_set in transfer_sets: + transfer_sets[transfer_set]["run_doorman.sh"] = doorman_script + transfer_sets[transfer_set]["dm.conf"] = dm_conf + transfer_sets[transfer_set]["return"] = return_script + transfer_sets[transfer_set]["verify"] = vrfy_script + transfer_sets[transfer_set]["server_name"] = globus_dict.server_name + transfer_sets[transfer_set]["homedir"] = ( + f"{globus_dict.server_home}/doorman/{globus_dict.jobID}/" + f"{transfer_set}" + ) + + return transfer_sets @logit(logger) - def execute_transfer_data(self, tarball_set: Dict[str, Any]) -> None: + def execute_transfer_data(self, transfer_set: Dict[str, Any]) -> None: """Interface function with Sven to send tarballs to HPSS via Niagara. Parameters ---------- - tarball_set: Dict[str, Any] - Set of tarballs and properties to applicable to their transfer. + transfer_set: Dict[str, Any] + Set of tarballs and properties applicable to their transfer. 
Return ------ None """ - pass + with open("dm.conf", "w") as conf_f: + conf_f.write(transfer_set["dm.conf"]) + with open("location", "w") as location_f: + location_f.write('\n'.join(location for location in transfer_set["locations"])) + with open("todo", "w") as todo_f: + todo_f.write(transfer_set["todo"]) + with open("verify", "w") as verify_f: + verify_f.write(transfer_set["verify"]) + with open("return", "w") as return_f: + return_f.write(transfer_set["return"]) + with open("run_doorman.sh", "w") as doorman_f: + doorman_f.write(transfer_set["run_doorman.sh"]) + + # Make run_doorman.sh executable + os.chmod("run_doorman.sh", 0o740) + + server_homedir = transfer_set["homedir"] + + # Tell Sven we have a package to send + try: + output = self.forsven() + except ProcessError as pe: + raise ProcessError("FATAL ERROR Sven failed to package the request" + f"with the output\n{output}") from pe + + # Transfer the doorman script to Niagara. + # Note, this assumes we have unattended transfer capability. + try: + # Start by making the directory it will run in + self.ssh("-t", "mkdir", "-p", f"{server_homedir}/doorman_rundir", output=str.split, error=str.split) + except ProcessError as pe: + raise ProcessError("FATAL ERROR Failed to create temporary working directoryon Niagara") from pe + + try: + # Now transfer and rename the script + server_run_script = f"{server_homedir}/doorman_rundir/run_doorman.sh" + self.scp( + "run_doorman.sh", f"{transfer_set['server_name']}:{server_run_script}", + output=str.split, error=str.split + ) + except ProcessError as pe: + raise ProcessError("FATAL ERROR Failed to send doorman run script to Niagara") from pe + + # Now actually run the doorman script + try: + self.ssh( + "-t", "{server_run_script}", + output=str.split, error=str.split + ) + except ProcessError as pe: + raise ProcessError("FATAL ERROR Failed to run the Doorman service on Niagara") from pe + + # Lastly, check the response from the doorman in Sven's dropbox + # TODO + + return @logit(logger) def clean(self): From 516ac499024c70b6a08c86d5839a08141e1ca1f7 Mon Sep 17 00:00:00 2001 From: David Huber Date: Tue, 28 Jan 2025 10:43:51 -0600 Subject: [PATCH 29/83] Add variables to globus config dict --- scripts/exglobal_globus.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/scripts/exglobal_globus.py b/scripts/exglobal_globus.py index 5b16ad4d2a..498ff75260 100755 --- a/scripts/exglobal_globus.py +++ b/scripts/exglobal_globus.py @@ -17,8 +17,10 @@ def main(): # Instantiate the globus object globus = GlobusHpss(config) - keys = ['STAGE_DIR', 'current_cycle', 'RUN', 'PDY', 'NMEM_ENS', 'HOMEgfs', 'sven_dir', - 'DATASETS_YAML', 'PARMgfs', 'COMIN_CONF'] + keys = ['STAGE_DIR', 'current_cycle', 'RUN', 'PDY', 'HOMEgfs', 'sven_dropbox', + 'doorman_gendel', 'DATASETS_YAML', 'PARMgfs', 'COMIN_CONF', 'KEEPDATA', + 'jobID', 'hpss_target_dir', 'server_home', 'SERVER_NAME', 'DOORMAN_ROOT', + 'CLIENT_GLOBUS_UUID'] globus_dict = AttrDict() for key in keys: @@ -28,10 +30,10 @@ def main(): logger.warning(f"WARNING: key ({key}) not found in globus.task_config!") # Determine which tarballs to send - transfer_set = globus.configure(globus_dict) + transfer_sets = globus.configure(globus_dict) # Send the tarballs to HPSS via Niagara - globus.execute_transfer_data(transfer_set) + globus.execute_transfer_data(transfer_sets) # Clean up any temporary files globus.clean() From 895af47eac113c2161ffda2de943b30fba107c1f Mon Sep 17 00:00:00 2001 From: David Huber Date: Thu, 30 Jan 2025 14:53:30 -0600 Subject: 
[PATCH 30/83] Add globus to service tasks --- workflow/rocoto/tasks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/rocoto/tasks.py b/workflow/rocoto/tasks.py index 3c215414b5..d1bc78aa59 100644 --- a/workflow/rocoto/tasks.py +++ b/workflow/rocoto/tasks.py @@ -11,8 +11,8 @@ class Tasks: - SERVICE_TASKS = ['arch', 'earc', 'stage_ic', 'cleanup'] - VALID_TASKS = ['aerosol_init', 'stage_ic', + SERVICE_TASKS = ['arch', 'earc', 'stage_ic', 'cleanup', 'globus'] + VALID_TASKS = ['aerosol_init', 'stage_ic', 'globus', 'prep', 'anal', 'sfcanl', 'analcalc', 'analdiag', 'arch', "cleanup", 'prepatmiodaobs', 'atmanlinit', 'atmanlvar', 'atmanlfv3inc', 'atmanlfinal', 'prepoceanobs', From 1d085770b9410f545c879b29fce923d599eb8056 Mon Sep 17 00:00:00 2001 From: David Huber Date: Thu, 30 Jan 2025 14:54:45 -0600 Subject: [PATCH 31/83] Make doorman script a function --- parm/globus/run_doorman.sh.j2 | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/parm/globus/run_doorman.sh.j2 b/parm/globus/run_doorman.sh.j2 index 066aa0f379..1775397466 100644 --- a/parm/globus/run_doorman.sh.j2 +++ b/parm/globus/run_doorman.sh.j2 @@ -1,10 +1,15 @@ #!/usr/bin/env bash +# Redirect all output to a local file -- it cannot be sent back directly with tty/pty +script_relpath="$(dirname "${BASH_SOURCE[0]}")" +log_file="${script_relpath}/run_doorman.log" + +run_doorman(){ set -eux # This script runs on Niagara to interact with the Doorman service # Make the working directory -doorman_dir='{{homedir}}/doorman_wd' +doorman_dir='{{server_home}}/doorman_wd' mkdir -p "${doorman_dir}" cd "${doorman_dir}" rm -f dm.conf @@ -69,5 +74,8 @@ if [[ "{{KEEPDATA}}" == "NO" ]]; then cd "${HOME}" || echo "Failed to navigate to ${HOME}!" && exit 4 rm -rf "${doorman_dir}" fi +} + +run_doorman >& "${log_file}" exit 0 From 00ca49781beade173bb9d461b10338626542c125 Mon Sep 17 00:00:00 2001 From: David Huber Date: Thu, 30 Jan 2025 14:56:35 -0600 Subject: [PATCH 32/83] Move sven's dropbox to DATA --- parm/config/gfs/config.globus | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parm/config/gfs/config.globus b/parm/config/gfs/config.globus index 92e7b0cb46..d52a5440ef 100644 --- a/parm/config/gfs/config.globus +++ b/parm/config/gfs/config.globus @@ -21,7 +21,7 @@ export CLIENT_GLOBUS_UUID=@CLIENT_GLOBUS_UUID@ export SERVER_HOME='/collab1/data/{{LOGNAME}}' # Sven's dropbox -export SVEN_DROPBOX_ROOT="${DATAROOT}/archive_rotdir/${RUN}/SVEN_DROPBOX" +export SVEN_DROPBOX_ROOT="${DATA}/SVEN_DROPBOX" # Location of the doorman package on Niagara export DOORMAN_ROOT="/home/Georgy.Fekete/sven" From 0d5ed3b6c50daa5ed502e3187fc877e4e52efeb4 Mon Sep 17 00:00:00 2001 From: David Huber Date: Thu, 30 Jan 2025 14:57:38 -0600 Subject: [PATCH 33/83] Clean up templating --- parm/globus/todo.sh.j2 | 2 +- parm/globus/todo_rstprod.sh.j2 | 2 +- parm/globus/verify.sh.j2 | 15 +++++++++------ 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/parm/globus/todo.sh.j2 b/parm/globus/todo.sh.j2 index 2322ca1500..b6559a0c8c 100644 --- a/parm/globus/todo.sh.j2 +++ b/parm/globus/todo.sh.j2 @@ -1,7 +1,7 @@ #!/bin/bash # Jinja-templated variables hpss_target_dir="{{hpss_target_dir}}" -log_directory="{{niagara_log_directory}}" +log_directory="{{server_home}}/doorman/{{jobid}}/logs/" cwd=$(pwd) file="${1}" diff --git a/parm/globus/todo_rstprod.sh.j2 b/parm/globus/todo_rstprod.sh.j2 index 30a245768b..8d7d2d3322 100644 --- a/parm/globus/todo_rstprod.sh.j2 +++ b/parm/globus/todo_rstprod.sh.j2 @@ -1,7 +1,7 @@ 
#!/bin/bash # Jinja-templated variables hpss_target_dir="{{hpss_target_dir}}" -log_directory="{{niagara_log_directory}}" +log_directory="{{server_home}}/doorman/{{jobid}}/logs/" cwd=$(pwd) file="${1}" diff --git a/parm/globus/verify.sh.j2 b/parm/globus/verify.sh.j2 index 7ea37e209f..ea24241e54 100644 --- a/parm/globus/verify.sh.j2 +++ b/parm/globus/verify.sh.j2 @@ -1,12 +1,15 @@ #!/bin/bash # Jinja-templated variables -log_directory="{{niagara_log_directory}}" -# In lieu of an actual globus xfer ID, use the filename +# For now, there is no easy way to determine which log file belongs to which +# tarball. So, the verify script simply checks all log files for "SUCCESS". # TODO when the Doorman is capable of providing the xfer ID, use that instead -globus_xfer_id="{{target_filename}}" +# globus_xfer_id="" +log_directory="{{server_home}}/doorman/{{jobid}}/logs/" -log_file="${log_directory}/hpss_status_${globus_xfer_id}.log" - -hpss_stat=$(tail -1 "${log_file}") +hpss_stat="" +for log in ${log_directory}/*.log; do + hpss_stat_tmp=$(tail -1 "${log}") + hpss_stat="${log}: ${hpss_stat}\n${hpss_stat_tmp}" +done echo "${hpss_stat}" From f629cb9fe38e15cd01b528c32de9c024d595f405 Mon Sep 17 00:00:00 2001 From: David Huber Date: Thu, 30 Jan 2025 14:58:27 -0600 Subject: [PATCH 34/83] Fix jobid variable name --- scripts/exglobal_globus.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/exglobal_globus.py b/scripts/exglobal_globus.py index 498ff75260..63cbc3aaf4 100755 --- a/scripts/exglobal_globus.py +++ b/scripts/exglobal_globus.py @@ -19,7 +19,7 @@ def main(): keys = ['STAGE_DIR', 'current_cycle', 'RUN', 'PDY', 'HOMEgfs', 'sven_dropbox', 'doorman_gendel', 'DATASETS_YAML', 'PARMgfs', 'COMIN_CONF', 'KEEPDATA', - 'jobID', 'hpss_target_dir', 'server_home', 'SERVER_NAME', 'DOORMAN_ROOT', + 'jobid', 'hpss_target_dir', 'server_home', 'SERVER_NAME', 'DOORMAN_ROOT', 'CLIENT_GLOBUS_UUID'] globus_dict = AttrDict() @@ -32,8 +32,9 @@ def main(): # Determine which tarballs to send transfer_sets = globus.configure(globus_dict) - # Send the tarballs to HPSS via Niagara - globus.execute_transfer_data(transfer_sets) + # Send the tarballs to HPSS via Niagara. 
Start with non-rstprod (standard) data + for transfer_set in ["standard", "rstprod"]: + globus.execute_transfer_data(transfer_sets[transfer_set]) # Clean up any temporary files globus.clean() From 9b041051f0eb27a31453a56a2b53a427268f9199 Mon Sep 17 00:00:00 2001 From: David Huber Date: Thu, 30 Jan 2025 14:59:16 -0600 Subject: [PATCH 35/83] Fix multiple issues with Globus class --- ush/python/pygfs/task/globus_hpss.py | 83 +++++++++++++++++++--------- 1 file changed, 57 insertions(+), 26 deletions(-) diff --git a/ush/python/pygfs/task/globus_hpss.py b/ush/python/pygfs/task/globus_hpss.py index 58ce6f1deb..fe243c26e4 100644 --- a/ush/python/pygfs/task/globus_hpss.py +++ b/ush/python/pygfs/task/globus_hpss.py @@ -2,6 +2,7 @@ import os from logging import getLogger +from pathlib import Path from typing import Any, Dict, List from wxflow import AttrDict, Task, to_YMD, to_YMDH, strftime, logit, parse_yaml, Jinja, which, ProcessError @@ -56,7 +57,7 @@ def __init__(self, config: Dict[str, Any]) -> None: server_name = self.task_config.SERVER_NAME try: - ssh_output = self.ssh("-G", "{server_name}", output=str) + ssh_output = self.ssh("-G", f"{server_name}", output=str) except ProcessError as pe: raise ProcessError("FATAL ERROR No host information on niagara!\n" f"Please add an entry for {server_name} into ~/.ssh/config!") from pe @@ -72,9 +73,10 @@ def __init__(self, config: Dict[str, Any]) -> None: "{{LOGNAME}}", server_username ) + logger.debug(f"Server username detected as {server_username}") + local_dict = AttrDict({ - 'sven_dropbox': (f"{self.task_config.SVEN_DROPBOX_ROOT}/" - f"{self.task_config.PSLOT}/{self.task_config.RUN}.{cycle_YMD}/{cycle_HH}"), + 'sven_dropbox': (f"{self.task_config.SVEN_DROPBOX_ROOT}"), 'doorman_gendel': (f"{server_home}/GENERAL_DELIVERY/" f"{self.task_config.PSLOT}/{self.task_config.RUN}.{cycle_YMD}/{cycle_HH}"), 'hpss_target_dir': f"{self.task_config.ATARDIR}/{cycle_YMDH}", @@ -94,7 +96,7 @@ def configure(self, globus_dict: Dict[str, Any]) -> (Dict[str, Any], List[Dict[s and the Doorman executes the scripts on each of the files. The six files involved are dm.conf - One line indicating the location of the the scripts on the client. - location - The locations of files on the client to send to the server. + location - The location of the file on the client to send to the server. todo - A bash script that executes on each file once they are transferred to the server. For our purposes, this is mainly pushing to HPSS and writing a log file with either "SUCCESS" or "FAILURE" as the last line. 
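The `-G` fix above is worth spelling out: `ssh -G <host>` does not open a connection, it only prints the resolved client configuration for that host alias, one keyword/value pair per line (including a `user` line), which is what `__init__` scans to discover the Niagara username. A quick standalone check of the alias outside the workflow, assuming the `Host niagara` stanza in ~/.ssh/config that the class docstring requires (illustrative only, not workflow code):

    # Mirrors the username lookup in GlobusHpss.__init__; safe to run because
    # "ssh -G" only resolves configuration and never contacts the host.
    import subprocess

    proc = subprocess.run(["ssh", "-G", "niagara"],
                          capture_output=True, text=True, check=True)
    user = next((line.split()[1] for line in proc.stdout.splitlines()
                 if line.startswith("user ")), None)
    print(f"'niagara' resolves to remote user: {user}")

If no `user` line is found, the alias is misconfigured and the transfer job would fail the same way.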
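For reference while reading the hunk that follows: configure() consumes the ${DATASETS_YAML} that the archive job leaves under COMIN_CONF and regroups the listed tarballs into a "standard" and an "rstprod" transfer set before attaching the rendered doorman scripts. A minimal sketch of those two shapes, with hypothetical archive names and paths; only the "target" and "has_rstprod" fields are taken from the code:

    # Illustrative shapes only; all archive names and paths are placeholders.
    backup_set = {
        "gfs_netcdfb":  {"target": "/stage/ATARDIR/2025011400/gfs_netcdfb.tar",  "has_rstprod": False},
        "gfs_restarta": {"target": "/stage/ATARDIR/2025011400/gfs_restarta.tar", "has_rstprod": True},
    }

    transfer_sets = {
        "standard": {"locations": [v["target"] for v in backup_set.values() if not v["has_rstprod"]]},
        "rstprod":  {"locations": [v["target"] for v in backup_set.values() if v["has_rstprod"]]},
    }
    # configure() then adds the rendered "todo", "verify", "return", "dm.conf" and
    # "run_doorman.sh" text plus "server_name" and "homedir" to each set, and
    # execute_transfer_data() hands the files to Sven one location at a time.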
@@ -144,33 +146,36 @@ def configure(self, globus_dict: Dict[str, Any]) -> (Dict[str, Any], List[Dict[s # Start parsing scripts and storing in the output dictionary transfer_sets = { - "standard": {"locations": backup_set}, + "standard": {"locations": standard_backup_set}, "rstprod": {"locations": rstprod_backup_set} } # Parse the doorman setup script doorman_jinja = os.path.join(globus_parm, "run_doorman.sh.j2") - doorman_script = Jinja(doorman_jinja, data=globus_dict, allow_missing=False).render() + doorman_script = Jinja(doorman_jinja, data=globus_dict, allow_missing=False).render # Write a script with the location of the dropbox on the client - dm_conf = f'export dropbox="{globus_dict.sven_dropbox}' + dm_conf = f'export dropbox="{globus_dict.sven_dropbox}"' + + # Make the dropbox and clean it out + Path(globus_dict.sven_dropbox).mkdir(exist_ok=True) # Parse the return script return_jinja = os.path.join(globus_parm, "return.sh.j2") - return_script = Jinja(return_jinja, data=globus_dict, allow_missing=False).render() + return_script = Jinja(return_jinja, data=globus_dict, allow_missing=False).render # Create a todo script for rstprod and non-rstprod tarballs todo_jinja = os.path.join(globus_parm, "todo.sh.j2") - todo_script = Jinja(todo_jinja, data=globus_dict, allow_missing=False).render() + todo_script = Jinja(todo_jinja, data=globus_dict, allow_missing=False).render transfer_sets["standard"]["todo"] = todo_script rstprod_todo_jinja = os.path.join(globus_parm, "rstprod_todo.sh.j2") - rstprod_todo_script = Jinja(rstprod_todo_jinja, data=globus_dict, allow_missing=False).render() + rstprod_todo_script = Jinja(rstprod_todo_jinja, data=globus_dict, allow_missing=False).render transfer_sets["rstprod"]["todo"] = rstprod_todo_script # Create a common verify script for all tarballs vrfy_jinja = os.path.join(globus_parm, "verify.sh.j2") - vrfy_script = Jinja(vrfy_jinja, data=globus_dict, allow_missing=False).render() + vrfy_script = Jinja(vrfy_jinja, data=globus_dict, allow_missing=False).render # Add common scripts to both standard and rstprod for transfer_set in transfer_sets: @@ -178,9 +183,9 @@ def configure(self, globus_dict: Dict[str, Any]) -> (Dict[str, Any], List[Dict[s transfer_sets[transfer_set]["dm.conf"] = dm_conf transfer_sets[transfer_set]["return"] = return_script transfer_sets[transfer_set]["verify"] = vrfy_script - transfer_sets[transfer_set]["server_name"] = globus_dict.server_name + transfer_sets[transfer_set]["server_name"] = globus_dict.SERVER_NAME transfer_sets[transfer_set]["homedir"] = ( - f"{globus_dict.server_home}/doorman/{globus_dict.jobID}/" + f"{globus_dict.server_home}/doorman/{globus_dict.jobid}/" f"{transfer_set}" ) @@ -202,8 +207,6 @@ def execute_transfer_data(self, transfer_set: Dict[str, Any]) -> None: with open("dm.conf", "w") as conf_f: conf_f.write(transfer_set["dm.conf"]) - with open("location", "w") as location_f: - location_f.write('\n'.join(location for location in transfer_set["locations"])) with open("todo", "w") as todo_f: todo_f.write(transfer_set["todo"]) with open("verify", "w") as verify_f: @@ -217,27 +220,35 @@ def execute_transfer_data(self, transfer_set: Dict[str, Any]) -> None: os.chmod("run_doorman.sh", 0o740) server_homedir = transfer_set["homedir"] - - # Tell Sven we have a package to send - try: - output = self.forsven() - except ProcessError as pe: - raise ProcessError("FATAL ERROR Sven failed to package the request" - f"with the output\n{output}") from pe + server_name = transfer_set["server_name"] + + # Tell Sven we have files to send, 
one at a time + for location in transfer_set["locations"]: + print(location) + with open("location", "w") as location_f: + location_f.write(location+"\n") + try: + logger.info(f"Preparing package for {location}") + self.forsven(output=str.split) + except ProcessError as pe: + raise ProcessError("FATAL ERROR Sven failed to package the request" + f"for {location}") from pe # Transfer the doorman script to Niagara. # Note, this assumes we have unattended transfer capability. try: # Start by making the directory it will run in - self.ssh("-t", "mkdir", "-p", f"{server_homedir}/doorman_rundir", output=str.split, error=str.split) + logger.debug(f"Making the run directory {server_homedir}/doorman_rundir on {server_name}") + self.ssh("-tt", server_name, f"mkdir -p {server_homedir}/doorman_rundir", output=str.split, error=str.split) except ProcessError as pe: raise ProcessError("FATAL ERROR Failed to create temporary working directoryon Niagara") from pe try: # Now transfer and rename the script server_run_script = f"{server_homedir}/doorman_rundir/run_doorman.sh" + logger.debug(f"Transfer run_doorman.sh to {server_name}:{server_run_script}") self.scp( - "run_doorman.sh", f"{transfer_set['server_name']}:{server_run_script}", + "run_doorman.sh", f"{server_name}:{server_run_script}", output=str.split, error=str.split ) except ProcessError as pe: @@ -245,12 +256,32 @@ def execute_transfer_data(self, transfer_set: Dict[str, Any]) -> None: # Now actually run the doorman script try: + logger.debug(f"Run {server_run_script} remotely") self.ssh( - "-t", "{server_run_script}", + "-tt", server_name, f"{server_run_script}", output=str.split, error=str.split ) except ProcessError as pe: - raise ProcessError("FATAL ERROR Failed to run the Doorman service on Niagara") from pe + # Try and retrieve the log file + try: + self.scp(f"{server_name}:{server_homedir}/run_doorman.log", ".") + except ProcessError: + logger.warning("WARNING unable to transfer the doorman log back after failure") + else: + logger.info("The doorman failed to run. 
Printing output of the log:") + with open('run_doorman.log', 'r') as doorman_log: + print(doorman_log.read()) + + raise ProcessError(f"FATAL ERROR Failed to run the Doorman service on {server_name}") from pe + + # Retrieve and print the Doorman log file from the server + try: + self.scp(f"{server_name}:{server_homedir}/run_doorman.log", '.') + with open('run_doorman.log', 'r') as doorman_log: + print(doorman_log.read()) + + except ProcessError as pe: + raise ProcessError("FATAL ERROR Failed to retrieve the doorman log file from {server_name}") from pe # Lastly, check the response from the doorman in Sven's dropbox # TODO From 8c516905d5a28e5ec61d2667b445f17ecb06599f Mon Sep 17 00:00:00 2001 From: David Huber Date: Tue, 4 Feb 2025 07:40:54 -0600 Subject: [PATCH 36/83] Run the doorman in the same directory as the script --- parm/globus/run_doorman.sh.j2 | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/parm/globus/run_doorman.sh.j2 b/parm/globus/run_doorman.sh.j2 index 1775397466..379ecd4d25 100644 --- a/parm/globus/run_doorman.sh.j2 +++ b/parm/globus/run_doorman.sh.j2 @@ -1,20 +1,17 @@ #!/usr/bin/env bash +# This script runs on Niagara to interact with the Doorman service # Redirect all output to a local file -- it cannot be sent back directly with tty/pty script_relpath="$(dirname "${BASH_SOURCE[0]}")" log_file="${script_relpath}/run_doorman.log" +cd "${script_relpath}" run_doorman(){ set -eux -# This script runs on Niagara to interact with the Doorman service - -# Make the working directory -doorman_dir='{{server_home}}/doorman_wd' -mkdir -p "${doorman_dir}" -cd "${doorman_dir}" rm -f dm.conf rm -f places.inc rm -f FLIST +rm -rf "{{doorman_gendel}}" # Tell the doorman where the general delivery space is on Niagara (unique for each RUN/cycle) # This is where tarballs will be received and confirmations are written and sent. @@ -23,6 +20,8 @@ echo 'export GENDEL="{{doorman_gendel}}"' > places.inc echo 'export CLIENT_ENDPOINT="{{CLIENT_GLOBUS_UUID}}"' >> places.inc # Tell the doorman where the sending client's dropbox is (why twice??) 
echo 'export CLIENT_DROPBOX="{{sven_dropbox}}"' >> places.inc +# Tell the doorman what it's own UUID is +echo 'export NIAG_GLC="{{SERVER_GLOBUS_UUID}}"' >> places.inc # Point to the doorman executable scripts export PATH="${PATH}:{{DOORMAN_ROOT}}/bin" From a60848686814aa9d9809648f39d2ae25530e0f38 Mon Sep 17 00:00:00 2001 From: David Huber Date: Tue, 4 Feb 2025 07:41:48 -0600 Subject: [PATCH 37/83] Add Niagara UUID --- parm/config/gfs/config.globus | 6 +++++- scripts/exglobal_globus.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/parm/config/gfs/config.globus b/parm/config/gfs/config.globus index d52a5440ef..21aef480b8 100644 --- a/parm/config/gfs/config.globus +++ b/parm/config/gfs/config.globus @@ -24,9 +24,13 @@ export SERVER_HOME='/collab1/data/{{LOGNAME}}' export SVEN_DROPBOX_ROOT="${DATA}/SVEN_DROPBOX" # Location of the doorman package on Niagara -export DOORMAN_ROOT="/home/Georgy.Fekete/sven" +export DOORMAN_ROOT="/home/Gyorgy.Fekete/doorman" # Server name (should match ~/.ssh/config) export SERVER_NAME="niagara" +# Server globus UUID +niagara_UUID="1bfd8a79-52b2-4589-88b2-0648e0c0b35d" +export SERVER_GLOBUS_UUID="${niagara_UUID}" + echo "END: config.globus" diff --git a/scripts/exglobal_globus.py b/scripts/exglobal_globus.py index 63cbc3aaf4..023bcad2d0 100755 --- a/scripts/exglobal_globus.py +++ b/scripts/exglobal_globus.py @@ -20,7 +20,7 @@ def main(): keys = ['STAGE_DIR', 'current_cycle', 'RUN', 'PDY', 'HOMEgfs', 'sven_dropbox', 'doorman_gendel', 'DATASETS_YAML', 'PARMgfs', 'COMIN_CONF', 'KEEPDATA', 'jobid', 'hpss_target_dir', 'server_home', 'SERVER_NAME', 'DOORMAN_ROOT', - 'CLIENT_GLOBUS_UUID'] + 'CLIENT_GLOBUS_UUID', 'SERVER_GLOBUS_UUID'] globus_dict = AttrDict() for key in keys: From 37f0a22601c9cdd4efc181909cd22226b823725c Mon Sep 17 00:00:00 2001 From: David Huber Date: Tue, 4 Feb 2025 07:42:18 -0600 Subject: [PATCH 38/83] Add robustness to transfer script --- ush/python/pygfs/task/globus_hpss.py | 120 ++++++++++++++++++--------- 1 file changed, 79 insertions(+), 41 deletions(-) diff --git a/ush/python/pygfs/task/globus_hpss.py b/ush/python/pygfs/task/globus_hpss.py index fe243c26e4..11e5839f54 100644 --- a/ush/python/pygfs/task/globus_hpss.py +++ b/ush/python/pygfs/task/globus_hpss.py @@ -3,7 +3,9 @@ import os from logging import getLogger from pathlib import Path +from time import sleep from typing import Any, Dict, List +import re from wxflow import AttrDict, Task, to_YMD, to_YMDH, strftime, logit, parse_yaml, Jinja, which, ProcessError @@ -33,8 +35,8 @@ def __init__(self, config: Dict[str, Any]) -> None: super().__init__(config) # Declare these here so the jinja-templated scripts can be shellchecked - cycle_YMD = to_YMD(self.task_config.current_cycle), - cycle_YMDH = to_YMDH(self.task_config.current_cycle), + cycle_YMD = to_YMD(self.task_config.current_cycle) + cycle_YMDH = to_YMDH(self.task_config.current_cycle) cycle_HH = strftime(self.task_config.current_cycle, '%H') # Instantiate all of the executables we will need to run @@ -49,18 +51,20 @@ def __init__(self, config: Dict[str, Any]) -> None: if self.ssh is None: raise FileNotFoundError("FATAL ERROR Could not find ssh!") - # Get the user's server username from their ~/.ssh/config file - - if self.ssh is None: - raise FileNotFoundError("FATAL ERROR Failed to locate ssh!") + # Disable strict host key checking by default + # This auto-accepts changes to keys + self.scp.add_default_arg("-oStrictHostKeyChecking=no") + # Get the user's server username from their ~/.ssh/config file server_name = 
self.task_config.SERVER_NAME - try: ssh_output = self.ssh("-G", f"{server_name}", output=str) except ProcessError as pe: - raise ProcessError("FATAL ERROR No host information on niagara!\n" - f"Please add an entry for {server_name} into ~/.ssh/config!") from pe + raise ProcessError( + f"FATAL ERROR No host information on {server_name}!" + "\n" + f"Please add an entry for {server_name} into ~/.ssh/config!" + ) from pe # Parse the ssh output to find the user's Niagara username ssh_output = ssh_output.split("\n") @@ -183,8 +187,9 @@ def configure(self, globus_dict: Dict[str, Any]) -> (Dict[str, Any], List[Dict[s transfer_sets[transfer_set]["dm.conf"] = dm_conf transfer_sets[transfer_set]["return"] = return_script transfer_sets[transfer_set]["verify"] = vrfy_script + transfer_sets[transfer_set]["sven_dropbox"] = globus_dict.sven_dropbox transfer_sets[transfer_set]["server_name"] = globus_dict.SERVER_NAME - transfer_sets[transfer_set]["homedir"] = ( + transfer_sets[transfer_set]["server_homedir"] = ( f"{globus_dict.server_home}/doorman/{globus_dict.jobid}/" f"{transfer_set}" ) @@ -192,7 +197,7 @@ def configure(self, globus_dict: Dict[str, Any]) -> (Dict[str, Any], List[Dict[s return transfer_sets @logit(logger) - def execute_transfer_data(self, transfer_set: Dict[str, Any]) -> None: + def execute_transfer_data(self, transfer_set: Dict[str, Any], has_rstprod: bool) -> None: """Interface function with Sven to send tarballs to HPSS via Niagara. Parameters @@ -219,9 +224,13 @@ def execute_transfer_data(self, transfer_set: Dict[str, Any]) -> None: # Make run_doorman.sh executable os.chmod("run_doorman.sh", 0o740) - server_homedir = transfer_set["homedir"] + server_homedir = transfer_set["server_homedir"] server_name = transfer_set["server_name"] + # Initialize a list of status files. + transfer_set["statuses"] = [] + transfer_set["completed"] = [] + # Tell Sven we have files to send, one at a time for location in transfer_set["locations"]: print(location) @@ -229,20 +238,20 @@ def execute_transfer_data(self, transfer_set: Dict[str, Any]) -> None: location_f.write(location+"\n") try: logger.info(f"Preparing package for {location}") - self.forsven(output=str.split) + sven_output = self.forsven(output=str.split) except ProcessError as pe: raise ProcessError("FATAL ERROR Sven failed to package the request" f"for {location}") from pe + # Parse Sven's output to get the name of the return status file + match = re.search("\"(status_.*)\" in your dropbox", sven_output) + transfer_set["status_files"].append(os.path.join(transfer_set["sven_dropbox"], match.group(1))) + + # Initialize 'completed' to false for each file + transfer_set["completed"].append(False) + # Transfer the doorman script to Niagara. # Note, this assumes we have unattended transfer capability. 
- try: - # Start by making the directory it will run in - logger.debug(f"Making the run directory {server_homedir}/doorman_rundir on {server_name}") - self.ssh("-tt", server_name, f"mkdir -p {server_homedir}/doorman_rundir", output=str.split, error=str.split) - except ProcessError as pe: - raise ProcessError("FATAL ERROR Failed to create temporary working directoryon Niagara") from pe - try: # Now transfer and rename the script server_run_script = f"{server_homedir}/doorman_rundir/run_doorman.sh" @@ -254,37 +263,66 @@ def execute_transfer_data(self, transfer_set: Dict[str, Any]) -> None: except ProcessError as pe: raise ProcessError("FATAL ERROR Failed to send doorman run script to Niagara") from pe - # Now actually run the doorman script - try: - logger.debug(f"Run {server_run_script} remotely") - self.ssh( - "-tt", server_name, f"{server_run_script}", - output=str.split, error=str.split - ) - except ProcessError as pe: - # Try and retrieve the log file - try: - self.scp(f"{server_name}:{server_homedir}/run_doorman.log", ".") - except ProcessError: - logger.warning("WARNING unable to transfer the doorman log back after failure") - else: - logger.info("The doorman failed to run. Printing output of the log:") - with open('run_doorman.log', 'r') as doorman_log: - print(doorman_log.read()) + # Now wait for the doorman script to run via cron on Niagara. + # Once complete, Sven's dropbox should fill up with status files. + wait_count = 0 + sleep_time = 300 # s + timeout_time = 5.75 * 3600 # s + max_wait_count = int(timeout_time / sleep_time) + + # Initialize transfer status + transfer_failed = False + while not all(transfer_set["completed"]) and wait_count < max_wait_count: + sleep(sleep_time) + for i in range(len(transfer_set["status_files"])): + status_file = transfer_set["status_files"][i] + if os.path.exists(status_file): + # If this is a new status file, check if the transfer was successful + if not transfer_set["completed"][i]: + transfer_set["completed"][i] = True + with open(status_file) as status_handle: + transfer_set["successes"][i] = status_handle.readlines()[-1] == "SUCCESS" + + if transfer_set["successes"][i]: + logger.info(f"Successfully archived {transfer_set['locations'][i]} to HPSS!") + else: + # Exit the loop immediately, but allow the log file to be downloaded. + if has_rstprod: + logger.error( + f"FATAL ERROR HPSS archiving of restricted file {transfer_set['locations'][i]} failed!" + "\nPlease verify that the file has been deleted from HPSS!" 
+ ) + transfer_failed = True + break + else: + logger.error(f"FATAL ERROR HPSS archiving failed for {transfer_set['locations'][i]}.") + transfer_failed = True + + if transfer_failed: + break + + wait_count += 1 + wait_time = wait_count * sleep_time + + complete_count = sum(transfer_set["completed"]) + + logger.debug(f"{complete_count} files transferred in {wait_time} seconds.") - raise ProcessError(f"FATAL ERROR Failed to run the Doorman service on {server_name}") from pe + # Sleep a couple more seconds to ensure all status files finish transferring + sleep(2) # Retrieve and print the Doorman log file from the server try: self.scp(f"{server_name}:{server_homedir}/run_doorman.log", '.') with open('run_doorman.log', 'r') as doorman_log: - print(doorman_log.read()) + logger.info(doorman_log.read()) except ProcessError as pe: raise ProcessError("FATAL ERROR Failed to retrieve the doorman log file from {server_name}") from pe - # Lastly, check the response from the doorman in Sven's dropbox - # TODO + # Check for a failed transfer and/or timeouts + if transfer_failed or not all(transfer_set["successes"]): + raise ProcessError("FATAL ERROR Some/all files failed to archive to HPSS") return From ff3bd75df5833d7858558f29643fb50d4d556c67 Mon Sep 17 00:00:00 2001 From: David Huber Date: Tue, 4 Feb 2025 07:43:37 -0600 Subject: [PATCH 39/83] Initial globus checks in workflow --- workflow/applications/applications.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/workflow/applications/applications.py b/workflow/applications/applications.py index 9ef674c8eb..68af87307e 100644 --- a/workflow/applications/applications.py +++ b/workflow/applications/applications.py @@ -3,6 +3,7 @@ from typing import Dict, List, Any from hosts import Host from wxflow import Configuration +import importlib.util from abc import ABC, ABCMeta, abstractmethod __all__ = ['AppConfig'] @@ -70,6 +71,7 @@ def _get_run_options(self, conf: Configuration) -> Dict[str, Any]: ''' run_options = {run: {} for run in dict.fromkeys(self.runs)} + globus_checked = False for run in self.runs: # Read config.base with RUN specified run_base = conf.parse_config('config.base', RUN=run) @@ -105,6 +107,10 @@ def _get_run_options(self, conf: Configuration) -> Dict[str, Any]: if not AppConfig.is_monotonic(run_options[run]['fcst_segments']): raise ValueError(f'Forecast segments do not increase monotonically: {",".join(self.fcst_segments)}') + if run_options[run]['do_globusarch'] and not globus_checked: + globus_checked = self.check_globus(conf) + self.generate_globus_cron(conf) + # Return the dictionary of run options return run_options @@ -206,3 +212,15 @@ def is_monotonic(test_list: List, check_decreasing: bool = False) -> bool: return all(x > y for x, y in zip(test_list, test_list[1:])) else: return all(x < y for x, y in zip(test_list, test_list[1:])) + + def check_globus(self, conf): + # Test that globus can be imported + spec = importlib.util.find_spec("globus_cli") + if spec is None: + raise ImportError("Globus-cli module not found! 
Check that the module is loaded!") + + from globus_cli import main as globus + + globus_conf = conf.parse_config('config.globus') + # Check that a globus session is active + globus_cli = which("globus_cli") From 9e6637f26b2491654e2eb795ebf5eca919b9c088 Mon Sep 17 00:00:00 2001 From: David Huber Date: Tue, 4 Feb 2025 08:26:57 -0600 Subject: [PATCH 40/83] Address shellcheck issues --- parm/globus/run_doorman.sh.j2 | 18 ++++++++++-------- parm/globus/todo_rstprod.sh.j2 | 11 +++++++---- parm/globus/verify.sh.j2 | 2 +- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/parm/globus/run_doorman.sh.j2 b/parm/globus/run_doorman.sh.j2 index 379ecd4d25..a104fed3b1 100644 --- a/parm/globus/run_doorman.sh.j2 +++ b/parm/globus/run_doorman.sh.j2 @@ -14,14 +14,16 @@ rm -f FLIST rm -rf "{{doorman_gendel}}" # Tell the doorman where the general delivery space is on Niagara (unique for each RUN/cycle) -# This is where tarballs will be received and confirmations are written and sent. -echo 'export GENDEL="{{doorman_gendel}}"' > places.inc -# Tell the doorman what the sender's UUID is -echo 'export CLIENT_ENDPOINT="{{CLIENT_GLOBUS_UUID}}"' >> places.inc -# Tell the doorman where the sending client's dropbox is (why twice??) -echo 'export CLIENT_DROPBOX="{{sven_dropbox}}"' >> places.inc -# Tell the doorman what it's own UUID is -echo 'export NIAG_GLC="{{SERVER_GLOBUS_UUID}}"' >> places.inc +{ + # This is where tarballs will be received and confirmations are written and sent. + echo 'export GENDEL="{{doorman_gendel}}"' + # Tell the doorman what the sender's UUID is + echo 'export CLIENT_ENDPOINT="{{CLIENT_GLOBUS_UUID}}"' + # Tell the doorman where the sending client's dropbox is (why twice??) + echo 'export CLIENT_DROPBOX="{{sven_dropbox}}"' + # Tell the doorman what it's own UUID is + echo 'export NIAG_GLC="{{SERVER_GLOBUS_UUID}}"' +} > places.inc # Point to the doorman executable scripts export PATH="${PATH}:{{DOORMAN_ROOT}}/bin" diff --git a/parm/globus/todo_rstprod.sh.j2 b/parm/globus/todo_rstprod.sh.j2 index 8d7d2d3322..1f5260a69d 100644 --- a/parm/globus/todo_rstprod.sh.j2 +++ b/parm/globus/todo_rstprod.sh.j2 @@ -13,6 +13,7 @@ file_full="${cwd}/${file}" mkdir -p "${log_directory}" log_file="${log_directory}/hpss_status_${globus_xfer_id}.log" +rm -f "${log_file}" touch "${log_file}" send_to_hpss() @@ -34,10 +35,12 @@ send_to_hpss() # Write a command file to place the file on hpss and protect it local hpss_target="${hpss_target_dir}/${file}" local command_file="command_file_${globus_xfer_id}" - echo "mkdir -p ${hpss_dir}" >> "${command_file}" - echo "put ${file_full} : ${hpss_target}" >> "${command_file}" - echo "chgrp rstprod ${hpss_target}" >> "${command_file}" - echo "chmod 640 ${hpss_target}" >> "${command_file}" + { + echo "mkdir -p ${hpss_dir}" + echo "put ${file_full} : ${hpss_target}" + echo "chgrp rstprod ${hpss_target}" + echo "chmod 640 ${hpss_target}" + } > "${command_file}" hsi in "${command_file}" >> "${log_file}" 2>&1 # shellcheck disable=SC2181 diff --git a/parm/globus/verify.sh.j2 b/parm/globus/verify.sh.j2 index ea24241e54..9c9db193d5 100644 --- a/parm/globus/verify.sh.j2 +++ b/parm/globus/verify.sh.j2 @@ -7,7 +7,7 @@ log_directory="{{server_home}}/doorman/{{jobid}}/logs/" hpss_stat="" -for log in ${log_directory}/*.log; do +for log in "${log_directory}"/*.log; do hpss_stat_tmp=$(tail -1 "${log}") hpss_stat="${log}: ${hpss_stat}\n${hpss_stat_tmp}" done From 40d3e28edab1efeb539e899471223c7b4cb77d3b Mon Sep 17 00:00:00 2001 From: David Huber Date: Tue, 4 Feb 2025 08:31:07 -0600 
Subject: [PATCH 41/83] Address pycodestyle issues --- ush/python/pygfs/task/globus_hpss.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ush/python/pygfs/task/globus_hpss.py b/ush/python/pygfs/task/globus_hpss.py index 11e5839f54..995d8043e2 100644 --- a/ush/python/pygfs/task/globus_hpss.py +++ b/ush/python/pygfs/task/globus_hpss.py @@ -61,9 +61,9 @@ def __init__(self, config: Dict[str, Any]) -> None: ssh_output = self.ssh("-G", f"{server_name}", output=str) except ProcessError as pe: raise ProcessError( - f"FATAL ERROR No host information on {server_name}!" - "\n" - f"Please add an entry for {server_name} into ~/.ssh/config!" + f"FATAL ERROR No host information on {server_name}!" + "\n" + f"Please add an entry for {server_name} into ~/.ssh/config!" ) from pe # Parse the ssh output to find the user's Niagara username From 14f45c6aac2e0992700b54439f47d76221f546c7 Mon Sep 17 00:00:00 2001 From: David Huber Date: Tue, 4 Feb 2025 08:39:52 -0600 Subject: [PATCH 42/83] Address more linter issues --- ush/python/pygfs/task/globus_hpss.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/ush/python/pygfs/task/globus_hpss.py b/ush/python/pygfs/task/globus_hpss.py index 995d8043e2..e009c76460 100644 --- a/ush/python/pygfs/task/globus_hpss.py +++ b/ush/python/pygfs/task/globus_hpss.py @@ -61,10 +61,10 @@ def __init__(self, config: Dict[str, Any]) -> None: ssh_output = self.ssh("-G", f"{server_name}", output=str) except ProcessError as pe: raise ProcessError( - f"FATAL ERROR No host information on {server_name}!" - "\n" - f"Please add an entry for {server_name} into ~/.ssh/config!" - ) from pe + f"FATAL ERROR No host information on {server_name}!" + "\n" + f"Please add an entry for {server_name} into ~/.ssh/config!" + ) from pe # Parse the ssh output to find the user's Niagara username ssh_output = ssh_output.split("\n") @@ -74,8 +74,8 @@ def __init__(self, config: Dict[str, Any]) -> None: # Update the home directory on the server with the username server_home = self.task_config.SERVER_HOME.replace( - "{{LOGNAME}}", server_username - ) + "{{LOGNAME}}", server_username + ) logger.debug(f"Server username detected as {server_username}") @@ -150,9 +150,9 @@ def configure(self, globus_dict: Dict[str, Any]) -> (Dict[str, Any], List[Dict[s # Start parsing scripts and storing in the output dictionary transfer_sets = { - "standard": {"locations": standard_backup_set}, - "rstprod": {"locations": rstprod_backup_set} - } + "standard": {"locations": standard_backup_set}, + "rstprod": {"locations": rstprod_backup_set} + } # Parse the doorman setup script doorman_jinja = os.path.join(globus_parm, "run_doorman.sh.j2") @@ -235,7 +235,7 @@ def execute_transfer_data(self, transfer_set: Dict[str, Any], has_rstprod: bool) for location in transfer_set["locations"]: print(location) with open("location", "w") as location_f: - location_f.write(location+"\n") + location_f.write(location + "\n") try: logger.info(f"Preparing package for {location}") sven_output = self.forsven(output=str.split) @@ -289,8 +289,8 @@ def execute_transfer_data(self, transfer_set: Dict[str, Any], has_rstprod: bool) # Exit the loop immediately, but allow the log file to be downloaded. if has_rstprod: logger.error( - f"FATAL ERROR HPSS archiving of restricted file {transfer_set['locations'][i]} failed!" - "\nPlease verify that the file has been deleted from HPSS!" + f"FATAL ERROR HPSS archiving of restricted file {transfer_set['locations'][i]} failed!" 
+ "\nPlease verify that the file has been deleted from HPSS!" ) transfer_failed = True break From 599d43b034238153500d0adf7d0c86a91cf18024 Mon Sep 17 00:00:00 2001 From: David Huber Date: Tue, 4 Feb 2025 08:42:17 -0600 Subject: [PATCH 43/83] Adjust closing brackets --- ush/python/pygfs/task/globus_hpss.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ush/python/pygfs/task/globus_hpss.py b/ush/python/pygfs/task/globus_hpss.py index e009c76460..54d19dc1a9 100644 --- a/ush/python/pygfs/task/globus_hpss.py +++ b/ush/python/pygfs/task/globus_hpss.py @@ -64,7 +64,7 @@ def __init__(self, config: Dict[str, Any]) -> None: f"FATAL ERROR No host information on {server_name}!" "\n" f"Please add an entry for {server_name} into ~/.ssh/config!" - ) from pe + ) from pe # Parse the ssh output to find the user's Niagara username ssh_output = ssh_output.split("\n") @@ -75,7 +75,7 @@ def __init__(self, config: Dict[str, Any]) -> None: # Update the home directory on the server with the username server_home = self.task_config.SERVER_HOME.replace( "{{LOGNAME}}", server_username - ) + ) logger.debug(f"Server username detected as {server_username}") @@ -152,7 +152,7 @@ def configure(self, globus_dict: Dict[str, Any]) -> (Dict[str, Any], List[Dict[s transfer_sets = { "standard": {"locations": standard_backup_set}, "rstprod": {"locations": rstprod_backup_set} - } + } # Parse the doorman setup script doorman_jinja = os.path.join(globus_parm, "run_doorman.sh.j2") From 7527fe6cc36781b24883193fe80b5aa4e640c955 Mon Sep 17 00:00:00 2001 From: David Huber Date: Wed, 5 Feb 2025 12:33:31 -0600 Subject: [PATCH 44/83] Check that globus and Niagara are configured properly --- workflow/applications/applications.py | 68 ++++++++++++++++++++++++--- workflow/rocoto/workflow_xml.py | 34 ++++++++++++++ 2 files changed, 95 insertions(+), 7 deletions(-) diff --git a/workflow/applications/applications.py b/workflow/applications/applications.py index 0fc2075734..64113c6d31 100644 --- a/workflow/applications/applications.py +++ b/workflow/applications/applications.py @@ -2,9 +2,10 @@ from typing import Dict, List, Any from hosts import Host -from wxflow import Configuration +from wxflow import Configuration, which import importlib.util from abc import ABC, ABCMeta, abstractmethod +import os __all__ = ['AppConfig'] @@ -112,8 +113,10 @@ def _get_run_options(self, conf: Configuration) -> Dict[str, Any]: raise ValueError(f'Forecast segments do not increase monotonically: {",".join(self.fcst_segments)}') if run_options[run]['do_globusarch'] and not globus_checked: - globus_checked = self.check_globus(conf) - self.generate_globus_cron(conf) + status = self.check_globus(conf) + if not status: + raise ConnectionError("The globus server is not configured properly!") + globus_checked = True # Return the dictionary of run options return run_options @@ -218,13 +221,64 @@ def is_monotonic(test_list: List, check_decreasing: bool = False) -> bool: return all(x < y for x, y in zip(test_list, test_list[1:])) def check_globus(self, conf): + # This method checks that globus can be used on this platform + # and is configured properly. + # Test that globus can be imported spec = importlib.util.find_spec("globus_cli") if spec is None: raise ImportError("Globus-cli module not found! 
Check that the module is loaded!") - from globus_cli import main as globus + globus_conf = conf.parse_config(['config.base', 'config.globus']) + + # Initialize globus + globus = which("globus") + + if globus is None: + raise FileNotFoundError("Could not find the globus command!") + + # Check that a globus connection to the server is open + globus_output = globus("session", "show", output=str).splitlines()[2:] + + local_uid_found = False + rdhpcs_uid_found = False + + # There should be two sessions (MSU and RDHPCS), but if someone is running + # this elsewhere (e.g. NOAA cloud), it may be just one (RDHPCS). + local_uid = os.environ['LOGNAME'].lower() + for line in globus_output: + uid = line.split("|")[0].split("@")[0].lower() + domain = line.split("|")[0].split("@")[1].lower() + + if uid == local_uid: + local_uid_found = True + if "rdhpcs" in domain: + rdhpcs_uid_found = True + + if not local_uid_found or not rdhpcs_uid_found: + print(f"ERROR a globus session is not yet established on {globus_conf.SERVER_NAME}") + print(f" Please establish a globus connection!") + + # Check that there is an entry in the user's ssh config file for the globus server + server_scp_capable = False + scp = which("scp") + sshconfig = os.path.expanduser("~") + "/.ssh/config" + if scp is None: + print(f"ERROR Unable to find the scp command!") + + elif os.path.exists(sshconfig): + with open(sshconfig, "r") as config_f: + ssh_config_lines = config_f.readlines() + + for line in ssh_config_lines: + if globus_conf.SERVER_NAME in line: + server_scp_capable = True + break + else: + print("ERROR Unable to find a configuration file in ~/.ssh/config") + + if not server_scp_capable: + print(f"ERROR an alias for {globus_conf.SERVER_NAME} does not exist yet!") + print(f" Please add a configuration to ~/.ssh/config!") - globus_conf = conf.parse_config('config.globus') - # Check that a globus session is active - globus_cli = which("globus_cli") + return server_scp_capable and rdhpcs_uid_found and local_uid_found diff --git a/workflow/rocoto/workflow_xml.py b/workflow/rocoto/workflow_xml.py index bed19ad5ee..cab008bb30 100644 --- a/workflow/rocoto/workflow_xml.py +++ b/workflow/rocoto/workflow_xml.py @@ -125,6 +125,8 @@ def _assemble_xml(self) -> str: def write(self, xml_file: str = None, crontab_file: str = None): self._write_xml(xml_file=xml_file) self._write_crontab(crontab_file=crontab_file) + if self._base["GLOBUSARCH"]: + self._write_server_crontab() def _write_xml(self, xml_file: str = None) -> None: @@ -178,3 +180,35 @@ def _write_crontab(self, crontab_file: str = None, cronint: int = 5) -> None: fh.write('\n'.join(strings)) return + + def _write_server_crontab(self, cronint: int = 1): + # This method generates a script and a cron entry to run it. + # It is the user's responsibility to add the cron entry to the server's crontab. 
+ + globus_conf = self._app_config.configs[next(iter(self._app_config.configs))]['globus'] + + expdir = globus_conf["EXPDIR"] + pslot = globus_conf["PSLOT"] + server = globus_conf["SERVER_NAME"] + server_home = globus_conf["SERVER_HOME"] + + try: + replyto = os.environ['REPLYTO'] + except KeyError: + replyto = '' + + crontab_file = f"{expdir}/{pslot}.{server}.crontab" + + init_script = f"{server_home}/init_xfer_{pslot}.sh" + strings = ['', + f'#################### {pslot} ####################', + f'MAILTO="{replyto}"' + f'*/{cronint} * * * * [[ -f {init_script} ]] && {init_script} || true' + ] + + with open(crontab_file, 'w') as fh: + fh.write('\n'.join(strings)) + + print("*******************************************************") + print(f"Please add the contents of \n{crontab_file}\nto your {server} crontab.") + print("*******************************************************") From 077c1b8961c97fbe9a689f80d9d4af1cbec11a8d Mon Sep 17 00:00:00 2001 From: David Huber Date: Wed, 5 Feb 2025 12:37:21 -0600 Subject: [PATCH 45/83] Clean up archiving options --- parm/config/gefs/config.base | 19 +++++++++++++------ parm/config/gfs/config.base | 4 ++-- parm/globus/init_xfer.sh.j2 | 23 +++++++++++++++++++++++ 3 files changed, 38 insertions(+), 8 deletions(-) create mode 100644 parm/globus/init_xfer.sh.j2 diff --git a/parm/config/gefs/config.base b/parm/config/gefs/config.base index e1fb6e587a..cdd3a5e3f2 100644 --- a/parm/config/gefs/config.base +++ b/parm/config/gefs/config.base @@ -334,17 +334,24 @@ export DO_METP="NO" # Run METPLUS jobs - set METPLUS settings in config. export DO_FIT2OBS="NO" # Run fit to observations package # Archiving options -export VRFYARCH="@VRFYARCH@" # save verification data locally export HPSSARCH="@HPSSARCH@" # save data to HPSS archive -export LOCALARCH="@LOCALARCH@" # save data to local archive -if [[ ${HPSSARCH} = "YES" ]] && [[ ${LOCALARCH} = "YES" ]]; then - echo "Both HPSS and local archiving selected. Please choose one or the other." - exit 3 -elif [[ ${HPSSARCH} = "YES" ]] || [[ ${LOCALARCH} = "YES" ]]; then +export LOCALARCH="@LOCALARCH@" # save data to local archive +export GLOBUSARCH="NO" # send data to HPSS via globus xfers to Niagara (intended for MSU) +count_arch_opts=0 +for arch_opt in "${HPSSARCH}" "${LOCALARCH}" "${GLOBUSARCH}"; do + if [[ "${arch_opt}" == "YES" ]]; then + (( count_arch_opts += 1 )); + fi +done +if [[ ${count_arch_opts} -gt 1 ]]; then + echo "FATAL ERROR: More than one archiving option selected. Please choose no more than one." 
+ exit 4 +elif [[ ${HPSSARCH} = "YES" ]] || [[ ${LOCALARCH} = "YES" ]] || [[ ${GLOBUSARCH} = "YES" ]]; then export DO_ARCHTAR="YES" else export DO_ARCHTAR="NO" fi + export ARCH_CYC=00 # Archive data at this cycle for warm start and/or forecast-only capabilities export ARCH_WARMICFREQ=4 # Archive frequency in days for warm start capability export ARCH_FCSTICFREQ=1 # Archive frequency in days for gdas and gfs forecast-only capability diff --git a/parm/config/gfs/config.base b/parm/config/gfs/config.base index 93c09dfd1a..29c84dab44 100644 --- a/parm/config/gfs/config.base +++ b/parm/config/gfs/config.base @@ -482,7 +482,7 @@ export DO_FETCH_LOCAL="NO" # Copy from local disk onto COM # Archiving options export HPSSARCH="@HPSSARCH@" # save data to HPSS archive export LOCALARCH="@LOCALARCH@" # save data to local archive -export GLOBUSARCH="@GLOBUSARCH@" # send data to HPSS via globus xfers to Niagara +export GLOBUSARCH="NO" # send data to HPSS via globus xfers to Niagara (intended for MSU) count_arch_opts=0 for arch_opt in "${HPSSARCH}" "${LOCALARCH}" "${GLOBUSARCH}"; do if [[ "${arch_opt}" == "YES" ]]; then @@ -492,7 +492,7 @@ done if [[ ${count_arch_opts} -gt 1 ]]; then echo "FATAL ERROR: More than one archiving option selected. Please choose no more than one." exit 4 -elif [[ ${HPSSARCH} = "YES" ]] || [[ ${LOCALARCH} = "YES" ]]; then +elif [[ ${HPSSARCH} = "YES" ]] || [[ ${LOCALARCH} = "YES" ]] || [[ ${GLOBUSARCH} = "YES" ]]; then export DO_ARCHTAR="YES" else export DO_ARCHTAR="NO" diff --git a/parm/globus/init_xfer.sh.j2 b/parm/globus/init_xfer.sh.j2 new file mode 100644 index 0000000000..344d4589a4 --- /dev/null +++ b/parm/globus/init_xfer.sh.j2 @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +# This script prepares the server directory and launches the doorman scripts for {{pslot}} on {{server}} +mkdir -p "{{server_home}}/doorman" || exit 2 +date > "{{server_home}}/{{pslot}}_crontab_active.log" + +# Look for mkdir requests +for mkdir_req_fl in "{{server_home}}"/req_mkdir.*; do + dir=$(cat "${mkdir_req_fl}") + mkdir -p "${dir}" || exit 2 + rm -f "${mkdir_req_fl}" +done + +# Look for executable scripts +for dir in "{{server_home}}"/doorman/globus.*; do + flist=$(find "${dir}" -executable -name "run_doorman.sh") + for script in ${flist}; do + # Check if the corresponding log has already been written + log="${script/.sh/.log}" + if [[ ! 
-f "${log}" ]]; then + "${script}" + fi + done +done From 0ec0c529f4c57e448cdf62eef50e4d44afd88832 Mon Sep 17 00:00:00 2001 From: David Huber Date: Wed, 5 Feb 2025 12:38:12 -0600 Subject: [PATCH 46/83] Clarify server username --- parm/config/gfs/config.globus | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parm/config/gfs/config.globus b/parm/config/gfs/config.globus index 21aef480b8..1aaabae0c1 100644 --- a/parm/config/gfs/config.globus +++ b/parm/config/gfs/config.globus @@ -18,7 +18,7 @@ export SERVER_GLOBUS_UUID=1bfd8a79-52b2-4589-88b2-0648e0c0b35d export CLIENT_GLOBUS_UUID=@CLIENT_GLOBUS_UUID@ # General delivery location on Niagara (staging area for data) -export SERVER_HOME='/collab1/data/{{LOGNAME}}' +export SERVER_HOME='/collab1/data/{{SERVER_USERNAME}}' # Sven's dropbox export SVEN_DROPBOX_ROOT="${DATA}/SVEN_DROPBOX" From 3682cedaf4c21174ce380c3dba3ee17dd50f47fe Mon Sep 17 00:00:00 2001 From: David Huber Date: Wed, 5 Feb 2025 12:38:41 -0600 Subject: [PATCH 47/83] Add script to initialize Niagara --- ush/python/pygfs/task/globus_hpss.py | 54 ++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 14 deletions(-) diff --git a/ush/python/pygfs/task/globus_hpss.py b/ush/python/pygfs/task/globus_hpss.py index 54d19dc1a9..991b849943 100644 --- a/ush/python/pygfs/task/globus_hpss.py +++ b/ush/python/pygfs/task/globus_hpss.py @@ -2,7 +2,7 @@ import os from logging import getLogger -from pathlib import Path +import shutil from time import sleep from typing import Any, Dict, List import re @@ -54,6 +54,8 @@ def __init__(self, config: Dict[str, Any]) -> None: # Disable strict host key checking by default # This auto-accepts changes to keys self.scp.add_default_arg("-oStrictHostKeyChecking=no") + # Force using publickey login + self.scp.add_default_arg("-oPreferredAuthentication=publickey") # Get the user's server username from their ~/.ssh/config file server_name = self.task_config.SERVER_NAME @@ -74,7 +76,7 @@ def __init__(self, config: Dict[str, Any]) -> None: # Update the home directory on the server with the username server_home = self.task_config.SERVER_HOME.replace( - "{{LOGNAME}}", server_username + "{{SERVER_USERNAME}}", server_username ) logger.debug(f"Server username detected as {server_username}") @@ -162,7 +164,10 @@ def configure(self, globus_dict: Dict[str, Any]) -> (Dict[str, Any], List[Dict[s dm_conf = f'export dropbox="{globus_dict.sven_dropbox}"' # Make the dropbox and clean it out - Path(globus_dict.sven_dropbox).mkdir(exist_ok=True) + if os.path.exists(globus_dict.sven_dropbox): + shutil.rmtree(globus_dict.sven_dropbox) + + os.mkdir(globus_dict.sven_dropbox) # Parse the return script return_jinja = os.path.join(globus_parm, "return.sh.j2") @@ -187,11 +192,8 @@ def configure(self, globus_dict: Dict[str, Any]) -> (Dict[str, Any], List[Dict[s transfer_sets[transfer_set]["dm.conf"] = dm_conf transfer_sets[transfer_set]["return"] = return_script transfer_sets[transfer_set]["verify"] = vrfy_script - transfer_sets[transfer_set]["sven_dropbox"] = globus_dict.sven_dropbox - transfer_sets[transfer_set]["server_name"] = globus_dict.SERVER_NAME - transfer_sets[transfer_set]["server_homedir"] = ( - f"{globus_dict.server_home}/doorman/{globus_dict.jobid}/" - f"{transfer_set}" + transfer_sets[transfer_set]["server_job_dir"] = ( + f"{globus_dict.server_home}/doorman/globus.{globus_dict.jobid}/{transfer_set}" ) return transfer_sets @@ -220,12 +222,18 @@ def execute_transfer_data(self, transfer_set: Dict[str, Any], has_rstprod: bool) 
return_f.write(transfer_set["return"]) with open("run_doorman.sh", "w") as doorman_f: doorman_f.write(transfer_set["run_doorman.sh"]) + with open("init_xfer.sh", "w") as init_f: + init_f.write(transfer_set["init_xfer.sh"]) + + server_job_dir = transfer_set["server_job_dir"] + + # Initialize the server + self._init_server(server_job_dir) # Make run_doorman.sh executable os.chmod("run_doorman.sh", 0o740) - server_homedir = transfer_set["server_homedir"] - server_name = transfer_set["server_name"] + server_name = self.task_config.SERVER_NAME # Initialize a list of status files. transfer_set["statuses"] = [] @@ -233,7 +241,6 @@ def execute_transfer_data(self, transfer_set: Dict[str, Any], has_rstprod: bool) # Tell Sven we have files to send, one at a time for location in transfer_set["locations"]: - print(location) with open("location", "w") as location_f: location_f.write(location + "\n") try: @@ -245,7 +252,7 @@ def execute_transfer_data(self, transfer_set: Dict[str, Any], has_rstprod: bool) # Parse Sven's output to get the name of the return status file match = re.search("\"(status_.*)\" in your dropbox", sven_output) - transfer_set["status_files"].append(os.path.join(transfer_set["sven_dropbox"], match.group(1))) + transfer_set["status_files"].append(os.path.join(self.task_config.sven_dropbox, match.group(1))) # Initialize 'completed' to false for each file transfer_set["completed"].append(False) @@ -254,7 +261,7 @@ def execute_transfer_data(self, transfer_set: Dict[str, Any], has_rstprod: bool) # Note, this assumes we have unattended transfer capability. try: # Now transfer and rename the script - server_run_script = f"{server_homedir}/doorman_rundir/run_doorman.sh" + server_run_script = f"{server_job_dir}/run_doorman.sh" logger.debug(f"Transfer run_doorman.sh to {server_name}:{server_run_script}") self.scp( "run_doorman.sh", f"{server_name}:{server_run_script}", @@ -313,7 +320,7 @@ def execute_transfer_data(self, transfer_set: Dict[str, Any], has_rstprod: bool) # Retrieve and print the Doorman log file from the server try: - self.scp(f"{server_name}:{server_homedir}/run_doorman.log", '.') + self.scp(f"{server_name}:{server_job_dir}/run_doorman.log", '.') with open('run_doorman.log', 'r') as doorman_log: logger.info(doorman_log.read()) @@ -326,6 +333,25 @@ def execute_transfer_data(self, transfer_set: Dict[str, Any], has_rstprod: bool) return + @logit(logger) + def _init_server(job_dir: str): + # This method sends a request to create a working directory and transfers + # the initialization script. 
+ + req_file = f"req_mkdir.{self.task_config.jobid}" + with open(f"req_mkdir.{self.task_config.jobid}") as mkdir_f: + mkdir_f.write(f"{job_dir}") + + self.scp(req_file, f"{self.task_config.SERVER_NAME}:{self.task_config.server_home}/{req_file}") + + self.scp( + "init_xfer.sh", + f"{self.task_config.SERVER_NAME}:{self.task_config.server_home}/init_xfer_{self.task_config.PSLOT}.sh" + ) + + logger.info("Sleeping 1 minute to let the server initialize") + sleep(300) + @logit(logger) def clean(self): """ From 14123b13f7083abd337fa11830eac1cb9469dfa1 Mon Sep 17 00:00:00 2001 From: David Huber Date: Wed, 5 Feb 2025 12:39:45 -0600 Subject: [PATCH 48/83] Add globus-cli module to setup modulefile --- modulefiles/module_gwsetup.hercules.lua | 1 + 1 file changed, 1 insertion(+) diff --git a/modulefiles/module_gwsetup.hercules.lua b/modulefiles/module_gwsetup.hercules.lua index e7735e4aa1..c0bc7760a3 100644 --- a/modulefiles/module_gwsetup.hercules.lua +++ b/modulefiles/module_gwsetup.hercules.lua @@ -16,5 +16,6 @@ load(pathJoin("python", python_ver)) load("py-jinja2") load("py-pyyaml") load("py-numpy") +try_load("globus-cli") whatis("Description: GFS run setup environment") From a2cf1a4e4bfed1a8a26027d0430dae497a71af26 Mon Sep 17 00:00:00 2001 From: David Huber Date: Wed, 5 Feb 2025 12:44:55 -0600 Subject: [PATCH 49/83] Make server script executable --- ush/python/pygfs/task/globus_hpss.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ush/python/pygfs/task/globus_hpss.py b/ush/python/pygfs/task/globus_hpss.py index 991b849943..bb865423ea 100644 --- a/ush/python/pygfs/task/globus_hpss.py +++ b/ush/python/pygfs/task/globus_hpss.py @@ -230,8 +230,9 @@ def execute_transfer_data(self, transfer_set: Dict[str, Any], has_rstprod: bool) # Initialize the server self._init_server(server_job_dir) - # Make run_doorman.sh executable + # Make run_doorman.sh and init_xfer.sh executable os.chmod("run_doorman.sh", 0o740) + os.chmod("init_xfer.sh", 0o740) server_name = self.task_config.SERVER_NAME From beac17dc259dcb2122e359dee261815f467c0f66 Mon Sep 17 00:00:00 2001 From: David Huber Date: Thu, 6 Feb 2025 09:01:31 -0600 Subject: [PATCH 50/83] Update arch_tars config name in globus j-job --- jobs/JGLOBAL_GLOBUS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jobs/JGLOBAL_GLOBUS b/jobs/JGLOBAL_GLOBUS index 04a1b02c33..2e0f16f701 100755 --- a/jobs/JGLOBAL_GLOBUS +++ b/jobs/JGLOBAL_GLOBUS @@ -1,7 +1,7 @@ #! /usr/bin/env bash source "${HOMEgfs}/ush/preamble.sh" -source "${HOMEgfs}/ush/jjob_header.sh" -e "globus" -c "base arch globus" +source "${HOMEgfs}/ush/jjob_header.sh" -e "globus" -c "base arch_tars globus" ############################################## # Set variables used in the script From 1a77a6bf7f5685f7773e301f31e55c50c1d032a1 Mon Sep 17 00:00:00 2001 From: David Huber Date: Thu, 6 Feb 2025 09:02:39 -0600 Subject: [PATCH 51/83] Add localarch option to config.arch_tars when running globusarch --- parm/config/gefs/config.arch_tars | 2 +- parm/config/gfs/config.arch_tars | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/parm/config/gefs/config.arch_tars b/parm/config/gefs/config.arch_tars index 6b052ee05a..d2258be1dd 100644 --- a/parm/config/gefs/config.arch_tars +++ b/parm/config/gefs/config.arch_tars @@ -6,7 +6,7 @@ echo "BEGIN: config.arch_tars" # Get task specific resources -. "${EXPDIR}/config.resources" arch_tars +. 
"${EXPDIR}/config.resources" "arch_tars" export ARCH_GAUSSIAN="YES" export ARCH_GAUSSIAN_FHMAX=${FHMAX_GFS} diff --git a/parm/config/gfs/config.arch_tars b/parm/config/gfs/config.arch_tars index f46ff45a3a..d2258be1dd 100644 --- a/parm/config/gfs/config.arch_tars +++ b/parm/config/gfs/config.arch_tars @@ -12,4 +12,11 @@ export ARCH_GAUSSIAN="YES" export ARCH_GAUSSIAN_FHMAX=${FHMAX_GFS} export ARCH_GAUSSIAN_FHINC=${FHOUT_GFS} +# If we are running globus archiving, create tarballs in a temporary location +if [[ "${GLOBUSARCH}" == "YES" ]]; then + export ATARDIR="${DATAROOT}/archive_rotdir/${RUN}" + export LOCALARCH="YES" + export DATASETS_YAML="backup_tarballs.yaml" +fi + echo "END: config.arch_tars" From 2543063c046bb8639fc950a5545e9f39c44d11cc Mon Sep 17 00:00:00 2001 From: David Huber Date: Thu, 6 Feb 2025 09:03:09 -0600 Subject: [PATCH 52/83] Correct PSLOT and SERVER_NAME variables in init script --- parm/globus/init_xfer.sh.j2 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parm/globus/init_xfer.sh.j2 b/parm/globus/init_xfer.sh.j2 index 344d4589a4..1157373a36 100644 --- a/parm/globus/init_xfer.sh.j2 +++ b/parm/globus/init_xfer.sh.j2 @@ -1,7 +1,7 @@ #!/usr/bin/env bash -# This script prepares the server directory and launches the doorman scripts for {{pslot}} on {{server}} +# This script prepares the server directory and launches the doorman scripts for {{PSLOT}} on {{SERVER_NAME}} mkdir -p "{{server_home}}/doorman" || exit 2 -date > "{{server_home}}/{{pslot}}_crontab_active.log" +date -u '+%Y-%m-%dT%H:%M:%S' > "{{server_home}}/{{PSLOT}}_crontab_active.log" # Look for mkdir requests for mkdir_req_fl in "{{server_home}}"/req_mkdir.*; do From 8b4abe882b5b1f6e7926e68a0913138ff4dbfa6b Mon Sep 17 00:00:00 2001 From: David Huber Date: Thu, 6 Feb 2025 09:05:09 -0600 Subject: [PATCH 53/83] Add input boolean for rstprod --- scripts/exglobal_globus.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/exglobal_globus.py b/scripts/exglobal_globus.py index 023bcad2d0..da55464986 100755 --- a/scripts/exglobal_globus.py +++ b/scripts/exglobal_globus.py @@ -20,7 +20,7 @@ def main(): keys = ['STAGE_DIR', 'current_cycle', 'RUN', 'PDY', 'HOMEgfs', 'sven_dropbox', 'doorman_gendel', 'DATASETS_YAML', 'PARMgfs', 'COMIN_CONF', 'KEEPDATA', 'jobid', 'hpss_target_dir', 'server_home', 'SERVER_NAME', 'DOORMAN_ROOT', - 'CLIENT_GLOBUS_UUID', 'SERVER_GLOBUS_UUID'] + 'CLIENT_GLOBUS_UUID', 'SERVER_GLOBUS_UUID', 'PSLOT'] globus_dict = AttrDict() for key in keys: @@ -34,7 +34,8 @@ def main(): # Send the tarballs to HPSS via Niagara. 
Start with non-rstprod (standard) data for transfer_set in ["standard", "rstprod"]: - globus.execute_transfer_data(transfer_sets[transfer_set]) + has_rstprod = transfer_set == "rstprod" + globus.execute_transfer_data(transfer_sets[transfer_set], has_rstprod) # Clean up any temporary files globus.clean() From 9a0720052cda33d27cbaee1b2604bf7cfacf20e1 Mon Sep 17 00:00:00 2001 From: David Huber Date: Thu, 6 Feb 2025 09:06:07 -0600 Subject: [PATCH 54/83] Debug/improve interaction with the server --- ush/python/pygfs/task/globus_hpss.py | 44 +++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/ush/python/pygfs/task/globus_hpss.py b/ush/python/pygfs/task/globus_hpss.py index bb865423ea..4eaf894af3 100644 --- a/ush/python/pygfs/task/globus_hpss.py +++ b/ush/python/pygfs/task/globus_hpss.py @@ -6,8 +6,9 @@ from time import sleep from typing import Any, Dict, List import re +from datetime import datetime, timezone -from wxflow import AttrDict, Task, to_YMD, to_YMDH, strftime, logit, parse_yaml, Jinja, which, ProcessError +from wxflow import AttrDict, Task, to_YMD, to_YMDH, strftime, logit, parse_yaml, Jinja, which, ProcessError, to_datetime logger = getLogger(__name__.split('.')[-1]) @@ -55,7 +56,7 @@ def __init__(self, config: Dict[str, Any]) -> None: # This auto-accepts changes to keys self.scp.add_default_arg("-oStrictHostKeyChecking=no") # Force using publickey login - self.scp.add_default_arg("-oPreferredAuthentication=publickey") + self.scp.add_default_arg("-oPreferredAuthentications=publickey") # Get the user's server username from their ~/.ssh/config file server_name = self.task_config.SERVER_NAME @@ -186,12 +187,17 @@ def configure(self, globus_dict: Dict[str, Any]) -> (Dict[str, Any], List[Dict[s vrfy_jinja = os.path.join(globus_parm, "verify.sh.j2") vrfy_script = Jinja(vrfy_jinja, data=globus_dict, allow_missing=False).render + # Create the server initialization script + init_xfer_jinja = os.path.join(globus_parm, "init_xfer.sh.j2") + init_xfer_script = Jinja(init_xfer_jinja, data=globus_dict, allow_missing=False).render + # Add common scripts to both standard and rstprod for transfer_set in transfer_sets: transfer_sets[transfer_set]["run_doorman.sh"] = doorman_script transfer_sets[transfer_set]["dm.conf"] = dm_conf transfer_sets[transfer_set]["return"] = return_script transfer_sets[transfer_set]["verify"] = vrfy_script + transfer_sets[transfer_set]["init_xfer.sh"] = init_xfer_script transfer_sets[transfer_set]["server_job_dir"] = ( f"{globus_dict.server_home}/doorman/globus.{globus_dict.jobid}/{transfer_set}" ) @@ -335,23 +341,47 @@ def execute_transfer_data(self, transfer_set: Dict[str, Any], has_rstprod: bool) return @logit(logger) - def _init_server(job_dir: str): + def _init_server(self, job_dir: str): # This method sends a request to create a working directory and transfers # the initialization script. 
req_file = f"req_mkdir.{self.task_config.jobid}" - with open(f"req_mkdir.{self.task_config.jobid}") as mkdir_f: + with open(f"req_mkdir.{self.task_config.jobid}", "w") as mkdir_f: mkdir_f.write(f"{job_dir}") - self.scp(req_file, f"{self.task_config.SERVER_NAME}:{self.task_config.server_home}/{req_file}") + server_name = self.task_config.SERVER_NAME + server_home = self.task_config.server_home + pslot = self.task_config.PSLOT + + self.scp(req_file, f"{server_name}:{server_home}/{req_file}") self.scp( "init_xfer.sh", - f"{self.task_config.SERVER_NAME}:{self.task_config.server_home}/init_xfer_{self.task_config.PSLOT}.sh" + f"{server_name}:{server_home}/init_xfer_{self.task_config.PSLOT}.sh" ) logger.info("Sleeping 1 minute to let the server initialize") - sleep(300) + sleep(60) + + # Check that the server initialized successfully + try: + self.scp(f"{server_name}:{server_home}/{pslot}_crontab_active.log", "crontab.log") + except ProcessError as pe: + raise ProcessError( + "FATAL ERROR failed to retrieve the server log file!\n" + f"Check that the crontab is active on {server_name}." + ) from pe + + # Check the date in the log + with open("crontab.log", "r") as crontab_f: + cron_date = crontab_f.read() + + cron_datetime = to_datetime(cron_date) + cron_td = datetime.now(timezone.utc) - cron_datetime + + if cron_td.total_seconds() > 600: + # The log file is too old (from another test case) + raise ProcessError("FATAL ERROR The server failed to initialize!") @logit(logger) def clean(self): From 43ef61251ca179a505e6df0a8924f96fb4650e3f Mon Sep 17 00:00:00 2001 From: David Huber Date: Thu, 6 Feb 2025 09:07:12 -0600 Subject: [PATCH 55/83] Determine server username at setup time --- workflow/rocoto/workflow_xml.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/workflow/rocoto/workflow_xml.py b/workflow/rocoto/workflow_xml.py index cab008bb30..01530c2f64 100644 --- a/workflow/rocoto/workflow_xml.py +++ b/workflow/rocoto/workflow_xml.py @@ -7,7 +7,7 @@ from typing import Dict from applications.applications import AppConfig from rocoto.workflow_tasks import get_wf_tasks -from wxflow import to_timedelta +from wxflow import to_timedelta, which, ProcessError import rocoto.rocoto as rocoto from abc import ABC, abstractmethod @@ -192,6 +192,25 @@ def _write_server_crontab(self, cronint: int = 1): server = globus_conf["SERVER_NAME"] server_home = globus_conf["SERVER_HOME"] + # Get the server username from ~/.ssh/config + # TODO move this to an earlier point and actually amend config.globus with the username + ssh = which("ssh") + if ssh is None: + raise ProcessError("Failed to locate the ssh command!") + + try: + ssh_output = ssh("-G", server, output=str).split("\n") + except ProcessError: + raise ProcessError(f"Failed to run ssh -G {server} to identify the server username!") + + for line in ssh_output: + if line.startswith("user "): + server_username = line.split()[1] + + server_home = server_home.replace( + "{{SERVER_USERNAME}}", server_username + ) + try: replyto = os.environ['REPLYTO'] except KeyError: @@ -202,8 +221,9 @@ def _write_server_crontab(self, cronint: int = 1): init_script = f"{server_home}/init_xfer_{pslot}.sh" strings = ['', f'#################### {pslot} ####################', - f'MAILTO="{replyto}"' - f'*/{cronint} * * * * [[ -f {init_script} ]] && {init_script} || true' + f'MAILTO="{replyto}"', + f'*/{cronint} * * * * [[ -f {init_script} ]] && {init_script} || true', + "" ] with open(crontab_file, 'w') as fh: From 
313512a4376e1a47e3ce7b31d3cd98588d69a1ff Mon Sep 17 00:00:00 2001 From: David Huber Date: Thu, 6 Feb 2025 09:56:32 -0600 Subject: [PATCH 56/83] Test for existence before running init for loops --- parm/globus/init_xfer.sh.j2 | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/parm/globus/init_xfer.sh.j2 b/parm/globus/init_xfer.sh.j2 index 1157373a36..569da6207b 100644 --- a/parm/globus/init_xfer.sh.j2 +++ b/parm/globus/init_xfer.sh.j2 @@ -4,20 +4,24 @@ mkdir -p "{{server_home}}/doorman" || exit 2 date -u '+%Y-%m-%dT%H:%M:%S' > "{{server_home}}/{{PSLOT}}_crontab_active.log" # Look for mkdir requests -for mkdir_req_fl in "{{server_home}}"/req_mkdir.*; do - dir=$(cat "${mkdir_req_fl}") - mkdir -p "${dir}" || exit 2 - rm -f "${mkdir_req_fl}" -done +if compgen -G "/collab1/data/David.Huber"/req_mkdir.*; then + for mkdir_req_fl in "{{server_home}}"/req_mkdir.*; do + dir=$(cat "${mkdir_req_fl}") + mkdir -p "${dir}" || exit 2 + rm -f "${mkdir_req_fl}" + done +fi # Look for executable scripts -for dir in "{{server_home}}"/doorman/globus.*; do - flist=$(find "${dir}" -executable -name "run_doorman.sh") - for script in ${flist}; do - # Check if the corresponding log has already been written - log="${script/.sh/.log}" - if [[ ! -f "${log}" ]]; then - "${script}" - fi +if compgen -G "{{server_home}}"/doorman/globus.*; then + for dir in "{{server_home}}"/doorman/globus.*; do + flist=$(find "${dir}" -executable -name "run_doorman.sh") + for script in ${flist}; do + # Check if the corresponding log has already been written + log="${script/.sh/.log}" + if [[ ! -f "${log}" ]]; then + "${script}" + fi + done done -done +fi From 385d5587279c443ba01303fada8d6479899ef748 Mon Sep 17 00:00:00 2001 From: David Huber Date: Thu, 6 Feb 2025 11:41:39 -0600 Subject: [PATCH 57/83] Make init script more robust --- parm/globus/init_xfer.sh.j2 | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/parm/globus/init_xfer.sh.j2 b/parm/globus/init_xfer.sh.j2 index 569da6207b..5a6084f835 100644 --- a/parm/globus/init_xfer.sh.j2 +++ b/parm/globus/init_xfer.sh.j2 @@ -1,10 +1,17 @@ #!/usr/bin/env bash # This script prepares the server directory and launches the doorman scripts for {{PSLOT}} on {{SERVER_NAME}} mkdir -p "{{server_home}}/doorman" || exit 2 -date -u '+%Y-%m-%dT%H:%M:%S' > "{{server_home}}/{{PSLOT}}_crontab_active.log" +run_time=$(date -u '+%Y-%m-%dT%H:%M:%S') +echo "${run_time}" > "{{server_home}}/{{PSLOT}}_crontab_active.log" + +runtime_log="{{server_home}}/{{PSLOT}}_last_runtime.log" +if [[ ! 
-f "${runtime_log}" ]]; then + echo "${run_time}" > "${runtime_log}" +fi # Look for mkdir requests -if compgen -G "/collab1/data/David.Huber"/req_mkdir.*; then +if compgen -G "/collab1/data/David.Huber"/req_mkdir.* > /dev/null 2>&1 ; then + echo "${run_time}" > "${runtime_log}" for mkdir_req_fl in "{{server_home}}"/req_mkdir.*; do dir=$(cat "${mkdir_req_fl}") mkdir -p "${dir}" || exit 2 @@ -13,7 +20,8 @@ if compgen -G "/collab1/data/David.Huber"/req_mkdir.*; then fi # Look for executable scripts -if compgen -G "{{server_home}}"/doorman/globus.*; then +if compgen -G "{{server_home}}"/doorman/globus.* > /dev/null 2>&1 ; then + echo "${run_time}" > "${runtime_log}" for dir in "{{server_home}}"/doorman/globus.*; do flist=$(find "${dir}" -executable -name "run_doorman.sh") for script in ${flist}; do @@ -25,3 +33,21 @@ if compgen -G "{{server_home}}"/doorman/globus.*; then done done fi + +# Check if it has been a while since this script had anything to do +last_runtime=$(cat "${runtime_log}") +last_runtime_s=$(date -d "${last_runtime}" '+%s') +current_time_s=$(date -d "${run_time}" '+%s') + +diff_d=$(( (current_time_s - last_runtime_s) / 86400 )) + +if [[ ${diff_d} -gt 3 ]]; then + # If the user added their email to REPLYTO, they will get this message + echo "WARNING the {{SERVER_NAME}} service for {{PSLOT}} has not run in > 3 days!" + echo "Turning off the crontab for {{PSLOT}}!!" + scriptpath="$( realpath "${0}" )" + entry=$(crontab -l | grep -i "${scriptpath}") + echo "Deleting crontab entry" + echo "${entry}" + crontab -l | grep -v "${scriptpath}" | crontab - +fi From b9d4abac8793ae16d2e22a07538a651eb55d4190 Mon Sep 17 00:00:00 2001 From: David Huber Date: Thu, 6 Feb 2025 11:44:15 -0600 Subject: [PATCH 58/83] Add logging --- ush/python/pygfs/task/globus_hpss.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/ush/python/pygfs/task/globus_hpss.py b/ush/python/pygfs/task/globus_hpss.py index 4eaf894af3..c1ee494de3 100644 --- a/ush/python/pygfs/task/globus_hpss.py +++ b/ush/python/pygfs/task/globus_hpss.py @@ -231,15 +231,15 @@ def execute_transfer_data(self, transfer_set: Dict[str, Any], has_rstprod: bool) with open("init_xfer.sh", "w") as init_f: init_f.write(transfer_set["init_xfer.sh"]) + # Make run_doorman.sh and init_xfer.sh executable + os.chmod("run_doorman.sh", 0o740) + os.chmod("init_xfer.sh", 0o740) + server_job_dir = transfer_set["server_job_dir"] # Initialize the server self._init_server(server_job_dir) - # Make run_doorman.sh and init_xfer.sh executable - os.chmod("run_doorman.sh", 0o740) - os.chmod("init_xfer.sh", 0o740) - server_name = self.task_config.SERVER_NAME # Initialize a list of status files. 
@@ -377,12 +377,16 @@ def _init_server(self, job_dir: str): cron_date = crontab_f.read() cron_datetime = to_datetime(cron_date) + # Establish the timezone + cron_datetime = cron_datetime.replace(tzinfo=timezone.utc) cron_td = datetime.now(timezone.utc) - cron_datetime if cron_td.total_seconds() > 600: - # The log file is too old (from another test case) + # The log file is too old (perhaps from another test case) raise ProcessError("FATAL ERROR The server failed to initialize!") + logger.info("Server initialized successfully!") + @logit(logger) def clean(self): """ From 5139f472f7bfeb8fff462bfa01f564c01b21d867 Mon Sep 17 00:00:00 2001 From: David Huber Date: Fri, 7 Feb 2025 07:18:13 -0600 Subject: [PATCH 59/83] Move client UUID to config.base so it can be filled by setup_expt --- parm/config/gefs/config.base | 3 +++ parm/config/gfs/config.base | 3 +++ parm/config/gfs/config.globus | 2 -- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/parm/config/gefs/config.base b/parm/config/gefs/config.base index cdd3a5e3f2..acd5eb9175 100644 --- a/parm/config/gefs/config.base +++ b/parm/config/gefs/config.base @@ -352,6 +352,9 @@ else export DO_ARCHTAR="NO" fi +# Globus UUID for this machine +export CLIENT_GLOBUS_UUID='@CLIENT_GLOBUS_UUID@' + export ARCH_CYC=00 # Archive data at this cycle for warm start and/or forecast-only capabilities export ARCH_WARMICFREQ=4 # Archive frequency in days for warm start capability export ARCH_FCSTICFREQ=1 # Archive frequency in days for gdas and gfs forecast-only capability diff --git a/parm/config/gfs/config.base b/parm/config/gfs/config.base index 29c84dab44..8a4faf6eb7 100644 --- a/parm/config/gfs/config.base +++ b/parm/config/gfs/config.base @@ -498,6 +498,9 @@ else export DO_ARCHTAR="NO" fi +# Globus UUID for this machine +export CLIENT_GLOBUS_UUID='@CLIENT_GLOBUS_UUID@' + export ARCH_CYC=00 # Archive data at this cycle for warm start and/or forecast-only capabilities export ARCH_WARMICFREQ=4 # Archive frequency in days for warm start capability export ARCH_FCSTICFREQ=1 # Archive frequency in days for gdas and gfs forecast-only capability diff --git a/parm/config/gfs/config.globus b/parm/config/gfs/config.globus index 1aaabae0c1..b07cde797b 100644 --- a/parm/config/gfs/config.globus +++ b/parm/config/gfs/config.globus @@ -14,8 +14,6 @@ export STAGE_DIR="${DATAROOT}/archive_rotdir/${PSLOT}" # Set variables used by the Sven and Doorman services # Niagara's globus UUID export SERVER_GLOBUS_UUID=1bfd8a79-52b2-4589-88b2-0648e0c0b35d -# Client address -export CLIENT_GLOBUS_UUID=@CLIENT_GLOBUS_UUID@ # General delivery location on Niagara (staging area for data) export SERVER_HOME='/collab1/data/{{SERVER_USERNAME}}' From 718a1cac684eb4c0d234e1902cf143960394276f Mon Sep 17 00:00:00 2001 From: David Huber Date: Fri, 7 Feb 2025 07:19:01 -0600 Subject: [PATCH 60/83] Error check run_doorman --- parm/globus/run_doorman.sh.j2 | 54 +++++++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/parm/globus/run_doorman.sh.j2 b/parm/globus/run_doorman.sh.j2 index a104fed3b1..d650285b65 100644 --- a/parm/globus/run_doorman.sh.j2 +++ b/parm/globus/run_doorman.sh.j2 @@ -5,6 +5,9 @@ script_relpath="$(dirname "${BASH_SOURCE[0]}")" log_file="${script_relpath}/run_doorman.log" cd "${script_relpath}" +# Initialize modules (this is a Niagara-specific path; parameterize if needed) +. 
/apps/lmod/lmod/init/bash + run_doorman(){ set -eux @@ -28,18 +31,29 @@ rm -rf "{{doorman_gendel}}" # Point to the doorman executable scripts export PATH="${PATH}:{{DOORMAN_ROOT}}/bin" +set +e + # Create the general delivery space if it wasn't already -initialize.sh +bash -e initialize.sh +init_stat=$? + +if [[ ${init_stat} -ne 0 ]]; then + echo "initialize.sh failed!" + return 1 +fi # Transfer the data from the sender and execute the 'todo' script -receive.sh --go +bash -e receive.sh --go +receive_stat=$? # If receive didn't produce an FLIST file, then something went wrong -if [[ ! -f FLIST ]]; then +if [[ ! -f FLIST || ${receive_stat} -ne 0 ]]; then echo "receive.sh failed!" return 2 fi +set -e + # Parse the FLIST file created by receive.sh to get the transfer IDs IDs="" while IFS= read -r line; do @@ -49,34 +63,50 @@ while IFS= read -r line; do IDs="${IDs} ${ID}" done < FLIST +set +e + # Sleep for a minute to allow time for all globus artifacts to resolve sleep 1m # Validate and generate the acknowledgement for each transfer ID for ID in ${IDs}; do - ack.sh "${ID}" + bash -e ack.sh "${ID}" + ack_stat=$? + if [[ ${ack_stat} != 0 ]]; then + echo "ack.sh failed for file ID ${ID}!" + return 3 + fi done # Send the acknowledgement back to the sender -set +e -send.sh +bash -e send.sh -stat=$? +send_stat=$? -if [[ ${stat} -ne 0 ]]; then +if [[ ${send_stat} -ne 0 ]]; then echo "Failed to send status back to client!" - exit 3 + return 4 fi -set -e # Remove the working directory #shellcheck disable=SC2050 -if [[ "{{KEEPDATA}}" == "NO" ]]; then - cd "${HOME}" || echo "Failed to navigate to ${HOME}!" && exit 4 +if [[ "{{KEEPDATA}}" == "False" || "{{KEEPDATA}}" == "NO" ]]; then + cd "${HOME}" || echo "Failed to navigate to ${HOME}!" && return 5 rm -rf "${doorman_dir}" fi + +set +eux } run_doorman >& "${log_file}" +if [[ $? -ne 0 ]]; then + echo "Failed to run the doorman service!" 
+ set +eux + echo "FAILURE" >> "${log_file}" + exit 9 +fi + +echo "SUCCESS" >> "${log_file}" + exit 0 From 2b6e2d13c219f3db37096c68e43894685d8153a7 Mon Sep 17 00:00:00 2001 From: David Huber Date: Fri, 7 Feb 2025 07:20:10 -0600 Subject: [PATCH 61/83] Debug globus_hpss class, add robustness --- ush/python/pygfs/task/globus_hpss.py | 42 +++++++++++++++++++--------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/ush/python/pygfs/task/globus_hpss.py b/ush/python/pygfs/task/globus_hpss.py index c1ee494de3..1fc6dbd495 100644 --- a/ush/python/pygfs/task/globus_hpss.py +++ b/ush/python/pygfs/task/globus_hpss.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 import os -from logging import getLogger +import logging import shutil from time import sleep from typing import Any, Dict, List @@ -10,7 +10,8 @@ from wxflow import AttrDict, Task, to_YMD, to_YMDH, strftime, logit, parse_yaml, Jinja, which, ProcessError, to_datetime -logger = getLogger(__name__.split('.')[-1]) +logger = logging.getLogger(__name__.split('.')[-1]) +logging.basicConfig(encoding='utf-8', level=logging.DEBUG, format='%(asctime)s %(message)s') class GlobusHpss(Task): @@ -199,7 +200,7 @@ def configure(self, globus_dict: Dict[str, Any]) -> (Dict[str, Any], List[Dict[s transfer_sets[transfer_set]["verify"] = vrfy_script transfer_sets[transfer_set]["init_xfer.sh"] = init_xfer_script transfer_sets[transfer_set]["server_job_dir"] = ( - f"{globus_dict.server_home}/doorman/globus.{globus_dict.jobid}/{transfer_set}" + f"{globus_dict.server_home}/doorman/{globus_dict.jobid}/{transfer_set}" ) return transfer_sets @@ -243,7 +244,7 @@ def execute_transfer_data(self, transfer_set: Dict[str, Any], has_rstprod: bool) server_name = self.task_config.SERVER_NAME # Initialize a list of status files. - transfer_set["statuses"] = [] + transfer_set["status_files"] = [] transfer_set["completed"] = [] # Tell Sven we have files to send, one at a time @@ -252,7 +253,8 @@ def execute_transfer_data(self, transfer_set: Dict[str, Any], has_rstprod: bool) location_f.write(location + "\n") try: logger.info(f"Preparing package for {location}") - sven_output = self.forsven(output=str.split) + sven_output = self.forsven(output=str) + logger.debug(sven_output) except ProcessError as pe: raise ProcessError("FATAL ERROR Sven failed to package the request" f"for {location}") from pe @@ -274,6 +276,7 @@ def execute_transfer_data(self, transfer_set: Dict[str, Any], has_rstprod: bool) "run_doorman.sh", f"{server_name}:{server_run_script}", output=str.split, error=str.split ) + logger.debug("Successfully transferred the doorman script") except ProcessError as pe: raise ProcessError("FATAL ERROR Failed to send doorman run script to Niagara") from pe @@ -286,6 +289,8 @@ def execute_transfer_data(self, transfer_set: Dict[str, Any], has_rstprod: bool) # Initialize transfer status transfer_failed = False + check_log_count = 0 + logger.debug(f"Waiting for the service to complete on {server_name}") while not all(transfer_set["completed"]) and wait_count < max_wait_count: sleep(sleep_time) for i in range(len(transfer_set["status_files"])): @@ -312,6 +317,22 @@ def execute_transfer_data(self, transfer_set: Dict[str, Any], has_rstprod: bool) logger.error(f"FATAL ERROR HPSS archiving failed for {transfer_set['locations'][i]}.") transfer_failed = True + # Retrieve the log file (if it exists) from the server and check if it failed + try: + self.scp(f"{server_name}:{server_job_dir}/run_doorman.log", '.') + except ProcessError: + check_log_count += 1 + if check_log_count > 3: + 
logger.error(f"FATAL ERROR Unable to retrieve the run_doorman.log file") + transfer_failed = True + else: + with open("run_doorman.log") as doorman_log: + doorman_lines = doorman_log.readlines() + + if "FAILURE" in doorman_lines[-1]: + logger.error(f"FATAL ERROR The doorman failed to run on {server_name}") + transfer_failed = True + if transfer_failed: break @@ -325,14 +346,9 @@ def execute_transfer_data(self, transfer_set: Dict[str, Any], has_rstprod: bool) # Sleep a couple more seconds to ensure all status files finish transferring sleep(2) - # Retrieve and print the Doorman log file from the server - try: - self.scp(f"{server_name}:{server_job_dir}/run_doorman.log", '.') - with open('run_doorman.log', 'r') as doorman_log: - logger.info(doorman_log.read()) - - except ProcessError as pe: - raise ProcessError("FATAL ERROR Failed to retrieve the doorman log file from {server_name}") from pe + # Write out the log file if it is present + if doorman_lines in locals(): + logger.debug('\n'.join(doorman_lines)) # Check for a failed transfer and/or timeouts if transfer_failed or not all(transfer_set["successes"]): From 2a501253e862e9216d12ccd49af646032db90903 Mon Sep 17 00:00:00 2001 From: David Huber Date: Fri, 7 Feb 2025 15:06:18 -0600 Subject: [PATCH 62/83] Do not source config.arch_tars for the globus job --- jobs/JGLOBAL_GLOBUS | 2 +- parm/config/gfs/config.arch_tars | 1 - parm/config/gfs/config.globus | 2 +- ush/python/pygfs/task/archive.py | 2 +- 4 files changed, 3 insertions(+), 4 deletions(-) diff --git a/jobs/JGLOBAL_GLOBUS b/jobs/JGLOBAL_GLOBUS index 2e0f16f701..eff7e54b6d 100755 --- a/jobs/JGLOBAL_GLOBUS +++ b/jobs/JGLOBAL_GLOBUS @@ -1,7 +1,7 @@ #! /usr/bin/env bash source "${HOMEgfs}/ush/preamble.sh" -source "${HOMEgfs}/ush/jjob_header.sh" -e "globus" -c "base arch_tars globus" +source "${HOMEgfs}/ush/jjob_header.sh" -e "globus" -c "base globus" ############################################## # Set variables used in the script diff --git a/parm/config/gfs/config.arch_tars b/parm/config/gfs/config.arch_tars index d2258be1dd..0e99bf1972 100644 --- a/parm/config/gfs/config.arch_tars +++ b/parm/config/gfs/config.arch_tars @@ -16,7 +16,6 @@ export ARCH_GAUSSIAN_FHINC=${FHOUT_GFS} if [[ "${GLOBUSARCH}" == "YES" ]]; then export ATARDIR="${DATAROOT}/archive_rotdir/${RUN}" export LOCALARCH="YES" - export DATASETS_YAML="backup_tarballs.yaml" fi echo "END: config.arch_tars" diff --git a/parm/config/gfs/config.globus b/parm/config/gfs/config.globus index b07cde797b..6a3cd3b53c 100644 --- a/parm/config/gfs/config.globus +++ b/parm/config/gfs/config.globus @@ -22,7 +22,7 @@ export SERVER_HOME='/collab1/data/{{SERVER_USERNAME}}' export SVEN_DROPBOX_ROOT="${DATA}/SVEN_DROPBOX" # Location of the doorman package on Niagara -export DOORMAN_ROOT="/home/Gyorgy.Fekete/doorman" +export DOORMAN_ROOT="/home/David.Huber/doorman" # Server name (should match ~/.ssh/config) export SERVER_NAME="niagara" diff --git a/ush/python/pygfs/task/archive.py b/ush/python/pygfs/task/archive.py index 3594f5fff7..820b676a05 100644 --- a/ush/python/pygfs/task/archive.py +++ b/ush/python/pygfs/task/archive.py @@ -617,7 +617,7 @@ def _create_datasets_yaml(self, datasets): return com_conf = self.task_config.COMOUT_CONF - yaml_filename = self.task_config.DATASETS_YAML + yaml_filename = "backup_tarballs.yaml" yaml_filename = os.path.join(com_conf, yaml_filename) output_yaml = {} From 2c2de8824cd52b2f18ea3b6684de5c1ffe0fa8a0 Mon Sep 17 00:00:00 2001 From: David Huber Date: Fri, 7 Feb 2025 15:07:02 -0600 Subject: [PATCH 63/83] Convert 
return.j2 to a config file, not a script --- parm/globus/return.j2 | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/parm/globus/return.j2 b/parm/globus/return.j2 index 06ed38e225..4d72e9ae83 100644 --- a/parm/globus/return.j2 +++ b/parm/globus/return.j2 @@ -1,6 +1,4 @@ -#!/usr/bin/env bash - -# Local globus endpoint UUID -export SENDER_GLC="{{CLIENT_GLOBUS_UUID}}" -# Local dropbox location -export SENDER_DRP="{{sven_dropbox}}" +# This is not a script, it is a configuration file for the doorman. +# This tells the doorman where to send the confirmation. +SENDER_GLC={{CLIENT_GLOBUS_UUID}} +SENDER_DRP={{sven_dropbox}} From 3ce207f9efa778714ac41cec7527fd17aa1e5343 Mon Sep 17 00:00:00 2001 From: David Huber Date: Fri, 7 Feb 2025 15:07:38 -0600 Subject: [PATCH 64/83] Add process step to run_doorman --- parm/globus/run_doorman.sh.j2 | 40 +++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/parm/globus/run_doorman.sh.j2 b/parm/globus/run_doorman.sh.j2 index d650285b65..2467497ca6 100644 --- a/parm/globus/run_doorman.sh.j2 +++ b/parm/globus/run_doorman.sh.j2 @@ -5,8 +5,10 @@ script_relpath="$(dirname "${BASH_SOURCE[0]}")" log_file="${script_relpath}/run_doorman.log" cd "${script_relpath}" -# Initialize modules (this is a Niagara-specific path; parameterize if needed) -. /apps/lmod/lmod/init/bash +# Initialize the shell (hpss, modules, etc) (this is a Niagara-specific path; parameterize if needed) +if [[ -f /etc/bashrc ]]; then + . /etc/bashrc +fi run_doorman(){ set -eux @@ -14,12 +16,13 @@ set -eux rm -f dm.conf rm -f places.inc rm -f FLIST -rm -rf "{{doorman_gendel}}" +gendel="{{run_directory}}/GENERAL_DELIVERY" +rm -rf "${gendel}" # Tell the doorman where the general delivery space is on Niagara (unique for each RUN/cycle) { # This is where tarballs will be received and confirmations are written and sent. - echo 'export GENDEL="{{doorman_gendel}}"' + echo "export GENDEL=${gendel}" # Tell the doorman what the sender's UUID is echo 'export CLIENT_ENDPOINT="{{CLIENT_GLOBUS_UUID}}"' # Tell the doorman where the sending client's dropbox is (why twice??) @@ -34,7 +37,7 @@ export PATH="${PATH}:{{DOORMAN_ROOT}}/bin" set +e # Create the general delivery space if it wasn't already -bash -e initialize.sh +bash -ex initialize.sh init_stat=$? if [[ ${init_stat} -ne 0 ]]; then @@ -43,7 +46,7 @@ if [[ ${init_stat} -ne 0 ]]; then fi # Transfer the data from the sender and execute the 'todo' script -bash -e receive.sh --go +bash -ex receive.sh --go receive_stat=$? # If receive didn't produce an FLIST file, then something went wrong @@ -65,33 +68,44 @@ done < FLIST set +e -# Sleep for a minute to allow time for all globus artifacts to resolve -sleep 1m +# Sleep allow time for all globus artifacts to resolve +sleep 10s + +# Process the transfer requests (this is the long step) +bash -ex process.sh +proc_stat=$? +if [[ ${proc_stat} -ne 0 ]]; then + echo "Failed to process the globus requests!" + return 3 +fi # Validate and generate the acknowledgement for each transfer ID for ID in ${IDs}; do - bash -e ack.sh "${ID}" + bash -ex ack.sh "${ID}" ack_stat=$? if [[ ${ack_stat} != 0 ]]; then echo "ack.sh failed for file ID ${ID}!" - return 3 + return 4 fi done +# Sleep again to allow time for all globus artifacts to resolve +sleep 10s + # Send the acknowledgement back to the sender -bash -e send.sh +bash -ex send.sh send_stat=$? if [[ ${send_stat} -ne 0 ]]; then echo "Failed to send status back to client!" 
- return 4 + return 5 fi # Remove the working directory #shellcheck disable=SC2050 if [[ "{{KEEPDATA}}" == "False" || "{{KEEPDATA}}" == "NO" ]]; then - cd "${HOME}" || echo "Failed to navigate to ${HOME}!" && return 5 + cd "${HOME}" || echo "Failed to navigate to ${HOME}!" && return 6 rm -rf "${doorman_dir}" fi From ea83c65fd8f64956ced65aec8bcf437f7e5074d3 Mon Sep 17 00:00:00 2001 From: David Huber Date: Fri, 7 Feb 2025 15:08:54 -0600 Subject: [PATCH 65/83] Remove md5 checksum check, note issue with htar indexing --- parm/globus/todo.sh.j2 | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/parm/globus/todo.sh.j2 b/parm/globus/todo.sh.j2 index b6559a0c8c..a0628a727c 100644 --- a/parm/globus/todo.sh.j2 +++ b/parm/globus/todo.sh.j2 @@ -13,38 +13,40 @@ file_full="${cwd}/${file}" mkdir -p "${log_directory}" log_file="${log_directory}/hpss_status_${globus_xfer_id}.log" -touch "${log_file}" +rm -f "${log_file}" send_to_hpss() { - # Check that the MD5 checksum matches what was sent - chk=$(md5sum "${file_full}") - if [[ "${chk}" != "${globus_xfer_id} ${file_full}" ]]; then - echo "MD5 checksum of ${file} does not match. Exiting." >> "${log_file}" - return 1 - fi - # Write a command file to place the file on hpss and protect it local hpss_target="${hpss_target_dir}/${file}" local command_file="command_file_${globus_xfer_id}" - echo "mkdir -p ${hpss_dir}" >> "${command_file}" + echo "mkdir -p ${hpss_target_dir}" >> "${command_file}" echo "put ${file_full} : ${hpss_target}" >> "${command_file}" hsi in "${command_file}" >> "${log_file}" 2>&1 # shellcheck disable=SC2181 if [[ $? != 0 ]]; then echo "Failed to send ${file} to HPSS." >> "${log_file}" - return 2 + return 1 fi rm -f "${command_file}" # Create an index file if the file is a tarball if [[ ${file} == *.tar ]]; then - if ! htar -Xvf "${hpss_target}" >> "${log_file}" 2>&1; then - echo "Failed to create an index file for ${hpss_target}. Exiting." >> "${log_file}" - return 3 + # For now, ignore errors coming from the index command. + # Issue RDHPCS#2025020754000121 opened with RDHPCS. + save_e=$- + set +e + htar -Xvf "${hpss_target}" > /dev/null 2>&1 + if [[ "${save_e}" =~ e ]]; then + set -e fi + # TODO Reinstate error checking and logging when creating index files + #if ! htar -Xvf "${hpss_target}" >> "${log_file}" 2>&1; then + #echo "Failed to create an index file for ${hpss_target}. Exiting." 
>> "${log_file}" + #return 2 + #fi fi return 0 From da65e7af3cbb832fd5a6680b483633fc9cf57988 Mon Sep 17 00:00:00 2001 From: David Huber Date: Fri, 7 Feb 2025 15:09:22 -0600 Subject: [PATCH 66/83] Simplify verify checks --- parm/globus/verify.sh.j2 | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/parm/globus/verify.sh.j2 b/parm/globus/verify.sh.j2 index 9c9db193d5..88250bd5e6 100644 --- a/parm/globus/verify.sh.j2 +++ b/parm/globus/verify.sh.j2 @@ -8,8 +8,13 @@ log_directory="{{server_home}}/doorman/{{jobid}}/logs/" hpss_stat="" for log in "${log_directory}"/*.log; do - hpss_stat_tmp=$(tail -1 "${log}") - hpss_stat="${log}: ${hpss_stat}\n${hpss_stat_tmp}" + hpss_stat_tmp=$(tail -n1 "${log}") + if [[ "${hpss_stat_tmp}" == "SUCCESS" ]]; then + hpss_stat="SUCCESS" + else + hpss_stat="FAILURE" + break + fi done echo "${hpss_stat}" From 3ae599008307acd4ec2b23e70444ce95415d8f2e Mon Sep 17 00:00:00 2001 From: David Huber Date: Fri, 7 Feb 2025 15:10:22 -0600 Subject: [PATCH 67/83] Move the doorman's general deliver location --- ush/python/pygfs/task/globus_hpss.py | 30 ++++++++++++++++------------ 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/ush/python/pygfs/task/globus_hpss.py b/ush/python/pygfs/task/globus_hpss.py index 1fc6dbd495..7eaab0fc08 100644 --- a/ush/python/pygfs/task/globus_hpss.py +++ b/ush/python/pygfs/task/globus_hpss.py @@ -85,8 +85,6 @@ def __init__(self, config: Dict[str, Any]) -> None: local_dict = AttrDict({ 'sven_dropbox': (f"{self.task_config.SVEN_DROPBOX_ROOT}"), - 'doorman_gendel': (f"{server_home}/GENERAL_DELIVERY/" - f"{self.task_config.PSLOT}/{self.task_config.RUN}.{cycle_YMD}/{cycle_HH}"), 'hpss_target_dir': f"{self.task_config.ATARDIR}/{cycle_YMDH}", 'server_home': server_home }) @@ -135,7 +133,7 @@ def configure(self, globus_dict: Dict[str, Any]) -> (Dict[str, Any], List[Dict[s com_conf = globus_dict.COMIN_CONF # Collect the files and properties from the input YAML - backup_yaml = os.path.join(com_conf, globus_dict.DATASETS_YAML) + backup_yaml = os.path.join(com_conf, "backup_tarballs.yaml") # Parse the list of tarballs to archive if os.path.isfile(backup_yaml): @@ -158,10 +156,6 @@ def configure(self, globus_dict: Dict[str, Any]) -> (Dict[str, Any], List[Dict[s "rstprod": {"locations": rstprod_backup_set} } - # Parse the doorman setup script - doorman_jinja = os.path.join(globus_parm, "run_doorman.sh.j2") - doorman_script = Jinja(doorman_jinja, data=globus_dict, allow_missing=False).render - # Write a script with the location of the dropbox on the client dm_conf = f'export dropbox="{globus_dict.sven_dropbox}"' @@ -172,7 +166,7 @@ def configure(self, globus_dict: Dict[str, Any]) -> (Dict[str, Any], List[Dict[s os.mkdir(globus_dict.sven_dropbox) # Parse the return script - return_jinja = os.path.join(globus_parm, "return.sh.j2") + return_jinja = os.path.join(globus_parm, "return.j2") return_script = Jinja(return_jinja, data=globus_dict, allow_missing=False).render # Create a todo script for rstprod and non-rstprod tarballs @@ -192,16 +186,23 @@ def configure(self, globus_dict: Dict[str, Any]) -> (Dict[str, Any], List[Dict[s init_xfer_jinja = os.path.join(globus_parm, "init_xfer.sh.j2") init_xfer_script = Jinja(init_xfer_jinja, data=globus_dict, allow_missing=False).render - # Add common scripts to both standard and rstprod + # Add the remaining scripts and definitions to transfer_sets for transfer_set in transfer_sets: + server_job_dir = f"{globus_dict.server_home}/doorman/{globus_dict.jobid}/{transfer_set}" + 
transfer_sets[transfer_set]["server_job_dir"] = server_job_dir + + # Render the run_doorman script + doorman_dict = globus_dict + doorman_dict["run_directory"] = server_job_dir + doorman_jinja = os.path.join(globus_parm, "run_doorman.sh.j2") + doorman_script = Jinja(doorman_jinja, data=doorman_dict, allow_missing=False).render transfer_sets[transfer_set]["run_doorman.sh"] = doorman_script + + # Common scripts transfer_sets[transfer_set]["dm.conf"] = dm_conf transfer_sets[transfer_set]["return"] = return_script transfer_sets[transfer_set]["verify"] = vrfy_script transfer_sets[transfer_set]["init_xfer.sh"] = init_xfer_script - transfer_sets[transfer_set]["server_job_dir"] = ( - f"{globus_dict.server_home}/doorman/{globus_dict.jobid}/{transfer_set}" - ) return transfer_sets @@ -290,6 +291,7 @@ def execute_transfer_data(self, transfer_set: Dict[str, Any], has_rstprod: bool) # Initialize transfer status transfer_failed = False check_log_count = 0 + log_read = False logger.debug(f"Waiting for the service to complete on {server_name}") while not all(transfer_set["completed"]) and wait_count < max_wait_count: sleep(sleep_time) @@ -329,6 +331,8 @@ def execute_transfer_data(self, transfer_set: Dict[str, Any], has_rstprod: bool) with open("run_doorman.log") as doorman_log: doorman_lines = doorman_log.readlines() + log_read = True + if "FAILURE" in doorman_lines[-1]: logger.error(f"FATAL ERROR The doorman failed to run on {server_name}") transfer_failed = True @@ -347,7 +351,7 @@ def execute_transfer_data(self, transfer_set: Dict[str, Any], has_rstprod: bool) sleep(2) # Write out the log file if it is present - if doorman_lines in locals(): + if log_read: logger.debug('\n'.join(doorman_lines)) # Check for a failed transfer and/or timeouts From 88bf985577f41322f4254042c08641bbf0a932d9 Mon Sep 17 00:00:00 2001 From: David Huber Date: Fri, 7 Feb 2025 15:28:50 -0600 Subject: [PATCH 68/83] Correct variable name --- parm/globus/run_doorman.sh.j2 | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/parm/globus/run_doorman.sh.j2 b/parm/globus/run_doorman.sh.j2 index 2467497ca6..35a111729b 100644 --- a/parm/globus/run_doorman.sh.j2 +++ b/parm/globus/run_doorman.sh.j2 @@ -105,8 +105,11 @@ fi # Remove the working directory #shellcheck disable=SC2050 if [[ "{{KEEPDATA}}" == "False" || "{{KEEPDATA}}" == "NO" ]]; then - cd "${HOME}" || echo "Failed to navigate to ${HOME}!" && return 6 - rm -rf "${doorman_dir}" + if ! cd "${HOME}"; then + echo "Failed to navigate to ${HOME}!" + return 6 + fi + rm -rf "${gendel}" fi set +eux From 039488dc1acc47e21777bbdc12a7cbd51ff26c23 Mon Sep 17 00:00:00 2001 From: David Huber Date: Fri, 7 Feb 2025 15:35:05 -0600 Subject: [PATCH 69/83] Add todo fixes to todo_rstprod --- parm/globus/todo_rstprod.sh.j2 | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/parm/globus/todo_rstprod.sh.j2 b/parm/globus/todo_rstprod.sh.j2 index 1f5260a69d..b690d1e618 100644 --- a/parm/globus/todo_rstprod.sh.j2 +++ b/parm/globus/todo_rstprod.sh.j2 @@ -14,7 +14,6 @@ mkdir -p "${log_directory}" log_file="${log_directory}/hpss_status_${globus_xfer_id}.log" rm -f "${log_file}" -touch "${log_file}" send_to_hpss() { @@ -25,18 +24,11 @@ send_to_hpss() return 1 fi - # Check that the MD5 checksum matches what was sent - chk=$(md5sum "${file_full}") - if [[ "${chk}" != "${globus_xfer_id} ${file_full}" ]]; then - echo "MD5 checksum of ${file} does not match. Exiting." 
>> "${log_file}" - return 2 - fi - # Write a command file to place the file on hpss and protect it local hpss_target="${hpss_target_dir}/${file}" local command_file="command_file_${globus_xfer_id}" { - echo "mkdir -p ${hpss_dir}" + echo "mkdir -p ${hpss_target_dir}" echo "put ${file_full} : ${hpss_target}" echo "chgrp rstprod ${hpss_target}" echo "chmod 640 ${hpss_target}" @@ -47,7 +39,9 @@ send_to_hpss() if [[ $? != 0 ]]; then echo "Failed to send ${file} to HPSS and/or protect it." >> "${log_file}" echo "Deleting from hpss. Please verify it was deleted!!" >> "${log_file}" + set +e hsi rm "${hpss_target}" + set -e return 3 fi @@ -55,10 +49,19 @@ send_to_hpss() # Create an index file if the file is a tarball if [[ ${file} == *.tar ]]; then - if ! htar -Xvf "${hpss_target}" >> "${log_file}" 2>&1; then - echo "Failed to create an index file for ${hpss_target}. Exiting." >> "${log_file}" - return 4 + # For now, ignore errors coming from the index command. + # Issue RDHPCS#2025020754000121 opened with RDHPCS. + save_e=$- + set +e + htar -Xvf "${hpss_target}" > /dev/null 2>&1 + if [[ "${save_e}" =~ e ]]; then + set -e fi + # TODO Reinstate error checking and logging when creating index files + #if ! htar -Xvf "${hpss_target}" >> "${log_file}" 2>&1; then + #echo "Failed to create an index file for ${hpss_target}. Exiting." >> "${log_file}" + #return 2 + #fi fi return 0 From 6472f92aec4cdcb9db750d5112cc5e23e5e60963 Mon Sep 17 00:00:00 2001 From: David Huber Date: Mon, 10 Feb 2025 09:10:34 -0600 Subject: [PATCH 70/83] Enable rmdir functionality --- parm/globus/init_xfer.sh.j2 | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/parm/globus/init_xfer.sh.j2 b/parm/globus/init_xfer.sh.j2 index 5a6084f835..4ead016d71 100644 --- a/parm/globus/init_xfer.sh.j2 +++ b/parm/globus/init_xfer.sh.j2 @@ -34,6 +34,17 @@ if compgen -G "{{server_home}}"/doorman/globus.* > /dev/null 2>&1 ; then done fi +# Look for rmdir requests +if compgen -G "/collab1/data/David.Huber"/req_rmdir.* > /dev/null 2>&1 ; then + echo "${run_time}" > "${runtime_log}" + for rmdir_req_fl in "{{server_home}}"/req_rmdir.*; do + dir=$(cat "${rmdir_req_fl}") + rm -rf "${dir}" + rm -f "${mkdir_req_fl}" + done +fi + + # Check if it has been a while since this script had anything to do last_runtime=$(cat "${runtime_log}") last_runtime_s=$(date -d "${last_runtime}" '+%s') From e463c64a97e20f90ec4010450dd0946b5d44cd91 Mon Sep 17 00:00:00 2001 From: David Huber Date: Mon, 10 Feb 2025 09:11:54 -0600 Subject: [PATCH 71/83] Run doorman with better error checking; do not remove gendel yet --- parm/globus/run_doorman.sh.j2 | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/parm/globus/run_doorman.sh.j2 b/parm/globus/run_doorman.sh.j2 index 35a111729b..813b70192d 100644 --- a/parm/globus/run_doorman.sh.j2 +++ b/parm/globus/run_doorman.sh.j2 @@ -11,7 +11,7 @@ if [[ -f /etc/bashrc ]]; then fi run_doorman(){ -set -eux +set -ux rm -f dm.conf rm -f places.inc @@ -64,7 +64,7 @@ while IFS= read -r line; do tmp="${package_name#package_location_}" ID="${tmp%.tgz}" IDs="${IDs} ${ID}" -done < FLIST +done < <(grep "package_location" FLIST) set +e @@ -102,24 +102,15 @@ if [[ ${send_stat} -ne 0 ]]; then return 5 fi -# Remove the working directory -#shellcheck disable=SC2050 -if [[ "{{KEEPDATA}}" == "False" || "{{KEEPDATA}}" == "NO" ]]; then - if ! cd "${HOME}"; then - echo "Failed to navigate to ${HOME}!" 
- return 6 - fi - rm -rf "${gendel}" -fi - set +eux } -run_doorman >& "${log_file}" +# Run the function, halting the function on errors, and capture the output +( set -e; run_doorman ) >& "${log_file}" +doorman_stat=$? -if [[ $? -ne 0 ]]; then +if [[ ${doorman_stat} -ne 0 ]]; then echo "Failed to run the doorman service!" - set +eux echo "FAILURE" >> "${log_file}" exit 9 fi From e10966957852eb8fe5ea67d9f3dd19d222bb10d4 Mon Sep 17 00:00:00 2001 From: David Huber Date: Mon, 10 Feb 2025 09:12:27 -0600 Subject: [PATCH 72/83] Improve error checking --- ush/python/pygfs/task/globus_hpss.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/ush/python/pygfs/task/globus_hpss.py b/ush/python/pygfs/task/globus_hpss.py index 7eaab0fc08..aec469b108 100644 --- a/ush/python/pygfs/task/globus_hpss.py +++ b/ush/python/pygfs/task/globus_hpss.py @@ -246,7 +246,9 @@ def execute_transfer_data(self, transfer_set: Dict[str, Any], has_rstprod: bool) # Initialize a list of status files. transfer_set["status_files"] = [] + transfer_set["xfer_ids"] = [] transfer_set["completed"] = [] + transfer_set["successes"] = [] # Tell Sven we have files to send, one at a time for location in transfer_set["locations"]: @@ -262,10 +264,13 @@ def execute_transfer_data(self, transfer_set: Dict[str, Any], has_rstprod: bool) # Parse Sven's output to get the name of the return status file match = re.search("\"(status_.*)\" in your dropbox", sven_output) - transfer_set["status_files"].append(os.path.join(self.task_config.sven_dropbox, match.group(1))) + status_file = match.group(1) + transfer_set["xfer_ids"].append(status_file.replace("status_", "")) + transfer_set["status_files"].append(os.path.join(self.task_config.sven_dropbox, status_file)) - # Initialize 'completed' to false for each file + # Initialize 'completed' and 'success' to false for each file transfer_set["completed"].append(False) + transfer_set["successes"].append(False) # Transfer the doorman script to Niagara. # Note, this assumes we have unattended transfer capability. @@ -284,7 +289,7 @@ def execute_transfer_data(self, transfer_set: Dict[str, Any], has_rstprod: bool) # Now wait for the doorman script to run via cron on Niagara. # Once complete, Sven's dropbox should fill up with status files. 
wait_count = 0 - sleep_time = 300 # s + sleep_time = 60 # s timeout_time = 5.75 * 3600 # s max_wait_count = int(timeout_time / sleep_time) @@ -295,14 +300,15 @@ def execute_transfer_data(self, transfer_set: Dict[str, Any], has_rstprod: bool) logger.debug(f"Waiting for the service to complete on {server_name}") while not all(transfer_set["completed"]) and wait_count < max_wait_count: sleep(sleep_time) - for i in range(len(transfer_set["status_files"])): + for i in range(len(transfer_set["locations"])): status_file = transfer_set["status_files"][i] if os.path.exists(status_file): # If this is a new status file, check if the transfer was successful if not transfer_set["completed"][i]: transfer_set["completed"][i] = True with open(status_file) as status_handle: - transfer_set["successes"][i] = status_handle.readlines()[-1] == "SUCCESS" + status_string = status_handle.readline().rstrip() + transfer_set["successes"][i] = status_string == f"status.{transfer_set['xfer_ids'][i]} SUCCESS" if transfer_set["successes"][i]: logger.info(f"Successfully archived {transfer_set['locations'][i]} to HPSS!") From 49ba53f9518656b401a192c641d0af7068e8d6e4 Mon Sep 17 00:00:00 2001 From: David Huber Date: Mon, 10 Feb 2025 09:50:13 -0600 Subject: [PATCH 73/83] Address lint issues --- parm/globus/init_xfer.sh.j2 | 6 ++++-- parm/globus/run_doorman.sh.j2 | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/parm/globus/init_xfer.sh.j2 b/parm/globus/init_xfer.sh.j2 index 4ead016d71..108df88b8d 100644 --- a/parm/globus/init_xfer.sh.j2 +++ b/parm/globus/init_xfer.sh.j2 @@ -57,8 +57,10 @@ if [[ ${diff_d} -gt 3 ]]; then echo "WARNING the {{SERVER_NAME}} service for {{PSLOT}} has not run in > 3 days!" echo "Turning off the crontab for {{PSLOT}}!!" scriptpath="$( realpath "${0}" )" - entry=$(crontab -l | grep -i "${scriptpath}") + cron_out=$(crontab -l) + entry=$(crontab -l | grep "${scriptpath}") echo "Deleting crontab entry" echo "${entry}" - crontab -l | grep -v "${scriptpath}" | crontab - + cron_in=$(echo "${cron_out}" | grep -v "${scriptpath}") + echo "${cron_in}" | crontab -l fi diff --git a/parm/globus/run_doorman.sh.j2 b/parm/globus/run_doorman.sh.j2 index 813b70192d..e3fdac412e 100644 --- a/parm/globus/run_doorman.sh.j2 +++ b/parm/globus/run_doorman.sh.j2 @@ -58,13 +58,14 @@ fi set -e # Parse the FLIST file created by receive.sh to get the transfer IDs +packages=$(grep "package_location" FLIST) IDs="" while IFS= read -r line; do package_name=$(echo "${line}" | grep -o "package_location_.*\.tgz") tmp="${package_name#package_location_}" ID="${tmp%.tgz}" IDs="${IDs} ${ID}" -done < <(grep "package_location" FLIST) +done < <"(echo ${packages})" set +e From 3fb210cb10fdb5d21b77ca35deb14d6709a38a00 Mon Sep 17 00:00:00 2001 From: David Huber Date: Mon, 10 Feb 2025 09:53:22 -0600 Subject: [PATCH 74/83] Remove erroneous quotes --- parm/globus/run_doorman.sh.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parm/globus/run_doorman.sh.j2 b/parm/globus/run_doorman.sh.j2 index e3fdac412e..4011d47408 100644 --- a/parm/globus/run_doorman.sh.j2 +++ b/parm/globus/run_doorman.sh.j2 @@ -65,7 +65,7 @@ while IFS= read -r line; do tmp="${package_name#package_location_}" ID="${tmp%.tgz}" IDs="${IDs} ${ID}" -done < <"(echo ${packages})" +done < <(echo ${packages}) set +e From 96ce799d333f56e6601c3bf7eb213b3254132a51 Mon Sep 17 00:00:00 2001 From: David Huber Date: Mon, 10 Feb 2025 09:54:35 -0600 Subject: [PATCH 75/83] Linter issue --- parm/globus/run_doorman.sh.j2 | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/parm/globus/run_doorman.sh.j2 b/parm/globus/run_doorman.sh.j2 index 4011d47408..1c9ab86dc8 100644 --- a/parm/globus/run_doorman.sh.j2 +++ b/parm/globus/run_doorman.sh.j2 @@ -65,7 +65,7 @@ while IFS= read -r line; do tmp="${package_name#package_location_}" ID="${tmp%.tgz}" IDs="${IDs} ${ID}" -done < <(echo ${packages}) +done < <(echo "${packages}") set +e From 5b984fbe96f3d005266adf538235946fd4edefd8 Mon Sep 17 00:00:00 2001 From: David Huber Date: Mon, 10 Feb 2025 09:56:51 -0600 Subject: [PATCH 76/83] Linter issue --- parm/globus/init_xfer.sh.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parm/globus/init_xfer.sh.j2 b/parm/globus/init_xfer.sh.j2 index 108df88b8d..8f1d33ac59 100644 --- a/parm/globus/init_xfer.sh.j2 +++ b/parm/globus/init_xfer.sh.j2 @@ -58,7 +58,7 @@ if [[ ${diff_d} -gt 3 ]]; then echo "Turning off the crontab for {{PSLOT}}!!" scriptpath="$( realpath "${0}" )" cron_out=$(crontab -l) - entry=$(crontab -l | grep "${scriptpath}") + entry=$(echo "${cron_out}" | grep "${scriptpath}") echo "Deleting crontab entry" echo "${entry}" cron_in=$(echo "${cron_out}" | grep -v "${scriptpath}") From ffcc1dc8f02f8ce9707ff8e13dd24e6add3adf9c Mon Sep 17 00:00:00 2001 From: David Huber Date: Mon, 10 Feb 2025 11:08:07 -0600 Subject: [PATCH 77/83] Create links for config.arch* files --- parm/config/gefs/config.arch_tars | 23 +---------------------- parm/config/gefs/config.arch_vrfy | 16 +--------------- 2 files changed, 2 insertions(+), 37 deletions(-) mode change 100644 => 120000 parm/config/gefs/config.arch_tars mode change 100644 => 120000 parm/config/gefs/config.arch_vrfy diff --git a/parm/config/gefs/config.arch_tars b/parm/config/gefs/config.arch_tars deleted file mode 100644 index d2258be1dd..0000000000 --- a/parm/config/gefs/config.arch_tars +++ /dev/null @@ -1,22 +0,0 @@ -#! /usr/bin/env bash - -########## config.arch_tars ########## -# Archive specific - -echo "BEGIN: config.arch_tars" - -# Get task specific resources -. "${EXPDIR}/config.resources" "arch_tars" - -export ARCH_GAUSSIAN="YES" -export ARCH_GAUSSIAN_FHMAX=${FHMAX_GFS} -export ARCH_GAUSSIAN_FHINC=${FHOUT_GFS} - -# If we are running globus archiving, create tarballs in a temporary location -if [[ "${GLOBUSARCH}" == "YES" ]]; then - export ATARDIR="${DATAROOT}/archive_rotdir/${RUN}" - export LOCALARCH="YES" - export DATASETS_YAML="backup_tarballs.yaml" -fi - -echo "END: config.arch_tars" diff --git a/parm/config/gefs/config.arch_tars b/parm/config/gefs/config.arch_tars new file mode 120000 index 0000000000..f06a65078f --- /dev/null +++ b/parm/config/gefs/config.arch_tars @@ -0,0 +1 @@ +../gfs/config.arch_tars \ No newline at end of file diff --git a/parm/config/gefs/config.arch_vrfy b/parm/config/gefs/config.arch_vrfy deleted file mode 100644 index cb668a48e2..0000000000 --- a/parm/config/gefs/config.arch_vrfy +++ /dev/null @@ -1,15 +0,0 @@ -#! /usr/bin/env bash - -########## config.arch_vrfy ########## -# Archive specific - -echo "BEGIN: config.arch_vrfy" - -# Get task specific resources -. 
"${EXPDIR}/config.resources" arch_vrfy - -export ARCH_GAUSSIAN="YES" -export ARCH_GAUSSIAN_FHMAX=${FHMAX_GFS} -export ARCH_GAUSSIAN_FHINC=${FHOUT_GFS} - -echo "END: config.arch_vrfy" diff --git a/parm/config/gefs/config.arch_vrfy b/parm/config/gefs/config.arch_vrfy new file mode 120000 index 0000000000..d5d4274b9d --- /dev/null +++ b/parm/config/gefs/config.arch_vrfy @@ -0,0 +1 @@ +../gfs/config.arch_vrfy \ No newline at end of file From e9a2099357ffe1fa2e01fde8bc465f61fab39069 Mon Sep 17 00:00:00 2001 From: David Huber Date: Mon, 10 Feb 2025 11:12:50 -0600 Subject: [PATCH 78/83] Move arch-specific features to config.arch_tars --- parm/config/gfs/config.arch_tars | 12 ++++++++++-- parm/config/gfs/config.arch_vrfy | 4 ---- parm/config/gfs/config.base | 8 -------- 3 files changed, 10 insertions(+), 14 deletions(-) diff --git a/parm/config/gfs/config.arch_tars b/parm/config/gfs/config.arch_tars index 0e99bf1972..e6d1693782 100644 --- a/parm/config/gfs/config.arch_tars +++ b/parm/config/gfs/config.arch_tars @@ -8,9 +8,17 @@ echo "BEGIN: config.arch_tars" # Get task specific resources . "${EXPDIR}/config.resources" "arch_tars" +export ARCH_CYC=00 # Archive data at this cycle for warm start and/or forecast-only capabilities +export ARCH_WARMICFREQ=4 # Archive frequency in days for warm start capability +export ARCH_FCSTICFREQ=1 # Archive frequency in days for gdas and gfs forecast-only capability +export ARCH_EXPDIR='YES' # Archive the EXPDIR configs, XML, and database +export ARCH_EXPDIR_FREQ=0 # How often to archive the EXPDIR in hours or 0 for first and last cycle only +export ARCH_HASHES='YES' # Archive the hashes of the GW and submodules and 'git status' for each; requires ARCH_EXPDIR +export ARCH_DIFFS='NO' # Archive the output of 'git diff' for the GW; requires ARCH_EXPDIR + export ARCH_GAUSSIAN="YES" -export ARCH_GAUSSIAN_FHMAX=${FHMAX_GFS} -export ARCH_GAUSSIAN_FHINC=${FHOUT_GFS} +export ARCH_GAUSSIAN_FHMAX=${FHMAX_GFS:-} +export ARCH_GAUSSIAN_FHINC=${FHOUT_GFS:-} # If we are running globus archiving, create tarballs in a temporary location if [[ "${GLOBUSARCH}" == "YES" ]]; then diff --git a/parm/config/gfs/config.arch_vrfy b/parm/config/gfs/config.arch_vrfy index 6bcbdb57fc..420a269056 100644 --- a/parm/config/gfs/config.arch_vrfy +++ b/parm/config/gfs/config.arch_vrfy @@ -8,8 +8,4 @@ echo "BEGIN: config.arch_vrfy" # Get task specific resources . 
"${EXPDIR}/config.resources" "arch_vrfy" -export ARCH_GAUSSIAN="YES" -export ARCH_GAUSSIAN_FHMAX=${FHMAX_GFS} -export ARCH_GAUSSIAN_FHINC=${FHOUT_GFS} - echo "END: config.arch_vrfy" diff --git a/parm/config/gfs/config.base b/parm/config/gfs/config.base index 8a4faf6eb7..dcb0ec12da 100644 --- a/parm/config/gfs/config.base +++ b/parm/config/gfs/config.base @@ -501,14 +501,6 @@ fi # Globus UUID for this machine export CLIENT_GLOBUS_UUID='@CLIENT_GLOBUS_UUID@' -export ARCH_CYC=00 # Archive data at this cycle for warm start and/or forecast-only capabilities -export ARCH_WARMICFREQ=4 # Archive frequency in days for warm start capability -export ARCH_FCSTICFREQ=1 # Archive frequency in days for gdas and gfs forecast-only capability -export ARCH_EXPDIR='YES' # Archive the EXPDIR configs, XML, and database -export ARCH_EXPDIR_FREQ=0 # How often to archive the EXPDIR in hours or 0 for first and last cycle only -export ARCH_HASHES='YES' # Archive the hashes of the GW and submodules and 'git status' for each; requires ARCH_EXPDIR -export ARCH_DIFFS='NO' # Archive the output of 'git diff' for the GW; requires ARCH_EXPDIR - # The monitor jobs are not yet supported for JEDIATMVAR. if [[ ${DO_JEDIATMVAR} = "YES" ]]; then export DO_FIT2OBS="NO" # Run fit to observations package From edcc9c3b8d5eae265632fae78b046fea2334a5a5 Mon Sep 17 00:00:00 2001 From: David Huber Date: Mon, 10 Feb 2025 11:30:20 -0600 Subject: [PATCH 79/83] Bump status download wait time --- ush/python/pygfs/task/globus_hpss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ush/python/pygfs/task/globus_hpss.py b/ush/python/pygfs/task/globus_hpss.py index aec469b108..a69c8aa63b 100644 --- a/ush/python/pygfs/task/globus_hpss.py +++ b/ush/python/pygfs/task/globus_hpss.py @@ -354,7 +354,7 @@ def execute_transfer_data(self, transfer_set: Dict[str, Any], has_rstprod: bool) logger.debug(f"{complete_count} files transferred in {wait_time} seconds.") # Sleep a couple more seconds to ensure all status files finish transferring - sleep(2) + sleep(5) # Write out the log file if it is present if log_read: From b66ee0cc46142fb563b56afa5ce21f3e59ab644c Mon Sep 17 00:00:00 2001 From: David Huber Date: Mon, 10 Feb 2025 11:32:09 -0600 Subject: [PATCH 80/83] Expand gendel variable name --- parm/globus/run_doorman.sh.j2 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/parm/globus/run_doorman.sh.j2 b/parm/globus/run_doorman.sh.j2 index 1c9ab86dc8..77bace4abd 100644 --- a/parm/globus/run_doorman.sh.j2 +++ b/parm/globus/run_doorman.sh.j2 @@ -16,13 +16,13 @@ set -ux rm -f dm.conf rm -f places.inc rm -f FLIST -gendel="{{run_directory}}/GENERAL_DELIVERY" -rm -rf "${gendel}" +general_delivery_root="{{run_directory}}/GENERAL_DELIVERY" +rm -rf "${general_delivery_root}" # Tell the doorman where the general delivery space is on Niagara (unique for each RUN/cycle) { # This is where tarballs will be received and confirmations are written and sent. - echo "export GENDEL=${gendel}" + echo "export GENDEL=${general_delivery_root}" # Tell the doorman what the sender's UUID is echo 'export CLIENT_ENDPOINT="{{CLIENT_GLOBUS_UUID}}"' # Tell the doorman where the sending client's dropbox is (why twice??) 
From bdfc6a68a787066d381bc0306aeb650e38e5b140 Mon Sep 17 00:00:00 2001 From: David Huber Date: Mon, 10 Feb 2025 11:35:24 -0600 Subject: [PATCH 81/83] Removed arch flags from config.base (already in config.arch_tars) --- parm/config/gefs/config.base | 8 -------- 1 file changed, 8 deletions(-) diff --git a/parm/config/gefs/config.base b/parm/config/gefs/config.base index acd5eb9175..d49d49eef3 100644 --- a/parm/config/gefs/config.base +++ b/parm/config/gefs/config.base @@ -355,14 +355,6 @@ fi # Globus UUID for this machine export CLIENT_GLOBUS_UUID='@CLIENT_GLOBUS_UUID@' -export ARCH_CYC=00 # Archive data at this cycle for warm start and/or forecast-only capabilities -export ARCH_WARMICFREQ=4 # Archive frequency in days for warm start capability -export ARCH_FCSTICFREQ=1 # Archive frequency in days for gdas and gfs forecast-only capability -export ARCH_EXPDIR='YES' # Archive the EXPDIR configs, XML, and database -export ARCH_EXPDIR_FREQ=0 # How often to archive the EXPDIR in hours or 0 for first and last cycle only -export ARCH_HASHES='YES' # Archive the hashes of the GW and submodules and 'git status' for each; requires ARCH_EXPDIR -export ARCH_DIFFS='NO' # Archive the output of 'git diff' for the GW; requires ARCH_EXPDIR - # Number of regional collectives to create soundings for export NUM_SND_COLLECTIVES=${NUM_SND_COLLECTIVES:-9} From f9fd22caec19f9337adfdefefdaa38e86d07f01a Mon Sep 17 00:00:00 2001 From: David Huber Date: Mon, 10 Feb 2025 12:19:01 -0600 Subject: [PATCH 82/83] Generalize globus configs to make it easier to add more jobs --- ...DAS_ENKF_GLOBUS => JGDAS_ENKF_GLOBUS_ARCH} | 4 ++-- jobs/{JGLOBAL_GLOBUS => JGLOBAL_GLOBUS_ARCH} | 4 ++-- jobs/rocoto/{globus.sh => globus_arch.sh} | 4 ++-- parm/config/gfs/config.earc_tars | 2 +- parm/config/gfs/config.globus | 4 ++-- parm/config/gfs/config.resources | 2 +- parm/globus/init_xfer.sh.j2 | 4 ++-- ...obal_globus.py => exglobal_globus_arch.py} | 0 workflow/applications/gfs_cycled.py | 8 +++++++ workflow/rocoto/gefs_tasks.py | 6 ++--- workflow/rocoto/gfs_tasks.py | 22 +++++++++---------- 11 files changed, 34 insertions(+), 26 deletions(-) rename jobs/{JGDAS_ENKF_GLOBUS => JGDAS_ENKF_GLOBUS_ARCH} (87%) rename jobs/{JGLOBAL_GLOBUS => JGLOBAL_GLOBUS_ARCH} (88%) rename jobs/rocoto/{globus.sh => globus_arch.sh} (90%) rename scripts/{exglobal_globus.py => exglobal_globus_arch.py} (100%) diff --git a/jobs/JGDAS_ENKF_GLOBUS b/jobs/JGDAS_ENKF_GLOBUS_ARCH similarity index 87% rename from jobs/JGDAS_ENKF_GLOBUS rename to jobs/JGDAS_ENKF_GLOBUS_ARCH index c5b8616f63..fe429b9b2b 100755 --- a/jobs/JGDAS_ENKF_GLOBUS +++ b/jobs/JGDAS_ENKF_GLOBUS_ARCH @@ -1,14 +1,14 @@ #! /usr/bin/env bash source "${HOMEgfs}/ush/preamble.sh" -source "${HOMEgfs}/ush/jjob_header.sh" -e "globus" -c "base earc ens_group_globus" +source "${HOMEgfs}/ush/jjob_header.sh" -e "globus_earc" -c "base globus globus_earc" ############################################################### # Run globus transfer script ############################################################### -"${SCRgfs}/exgdas_enkf_globus.py" +"${SCRgfs}/exgdas_enkf_globus_earc.py" status=$? [[ ${status} -ne 0 ]] && exit "${status}" diff --git a/jobs/JGLOBAL_GLOBUS b/jobs/JGLOBAL_GLOBUS_ARCH similarity index 88% rename from jobs/JGLOBAL_GLOBUS rename to jobs/JGLOBAL_GLOBUS_ARCH index eff7e54b6d..7337a872e7 100755 --- a/jobs/JGLOBAL_GLOBUS +++ b/jobs/JGLOBAL_GLOBUS_ARCH @@ -1,7 +1,7 @@ #! 
/usr/bin/env bash source "${HOMEgfs}/ush/preamble.sh" -source "${HOMEgfs}/ush/jjob_header.sh" -e "globus" -c "base globus" +source "${HOMEgfs}/ush/jjob_header.sh" -e "globus_arch" -c "base globus" ############################################## # Set variables used in the script @@ -13,7 +13,7 @@ YMD=${PDY} HH=${cyc} declare_from_tmpl -rx \ # Run globus script ############################################################### -${GLOBALGLOBUSSH:-${SCRgfs}/exglobal_globus.py} +${GLOBALGLOBUSARCHSH:-${SCRgfs}/exglobal_globus_arch.py} status=$? [[ ${status} -ne 0 ]] && exit "${status}" diff --git a/jobs/rocoto/globus.sh b/jobs/rocoto/globus_arch.sh similarity index 90% rename from jobs/rocoto/globus.sh rename to jobs/rocoto/globus_arch.sh index bcfbc584cb..9c9da02238 100755 --- a/jobs/rocoto/globus.sh +++ b/jobs/rocoto/globus_arch.sh @@ -13,12 +13,12 @@ status=$? PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}${HOMEgfs}/ush/python" export PYTHONPATH -export job="globus" +export job="globus_arch" export jobid="${job}.$$" ############################################################### # Execute the JJOB -"${HOMEgfs}"/jobs/JGLOBAL_GLOBUS +"${HOMEgfs}"/jobs/JGLOBAL_GLOBUS_ARCH status=$? exit "${status}" diff --git a/parm/config/gfs/config.earc_tars b/parm/config/gfs/config.earc_tars index e35d58536f..8bbe758273 100644 --- a/parm/config/gfs/config.earc_tars +++ b/parm/config/gfs/config.earc_tars @@ -6,7 +6,7 @@ echo "BEGIN: config.earc_tars" # Get task specific resources -. "${EXPDIR}/config.resources" "earc_tars" +. "${EXPDIR}/config.resources" "arch_tars" # Set the number of ensemble members to archive per earc_tars job case "${CASE_ENS}" in diff --git a/parm/config/gfs/config.globus b/parm/config/gfs/config.globus index 6a3cd3b53c..f6b451eae3 100644 --- a/parm/config/gfs/config.globus +++ b/parm/config/gfs/config.globus @@ -1,7 +1,7 @@ #! 
/usr/bin/env bash -########## config.globux ########## -# Globus specific variables +########## config.globus ########## +# Globus-specific variables for all globus-based jobs echo "BEGIN: config.globus" diff --git a/parm/config/gfs/config.resources b/parm/config/gfs/config.resources index 8eb6ef1139..681cacc1f7 100644 --- a/parm/config/gfs/config.resources +++ b/parm/config/gfs/config.resources @@ -1059,7 +1059,7 @@ case ${step} in export is_exclusive=True ;; - "arch_tars" | "earc_tars" | "getic" | "fetch" | "globus" | "ens_globus" ) + "arch_tars" | "getic" | "fetch" | "globus" | "globus_earc" ) walltime="06:00:00" ntasks=1 tasks_per_node=1 diff --git a/parm/globus/init_xfer.sh.j2 b/parm/globus/init_xfer.sh.j2 index 8f1d33ac59..c014a38e47 100644 --- a/parm/globus/init_xfer.sh.j2 +++ b/parm/globus/init_xfer.sh.j2 @@ -20,9 +20,9 @@ if compgen -G "/collab1/data/David.Huber"/req_mkdir.* > /dev/null 2>&1 ; then fi # Look for executable scripts -if compgen -G "{{server_home}}"/doorman/globus.* > /dev/null 2>&1 ; then +if compgen -G "{{server_home}}"/doorman/globus_*.* > /dev/null 2>&1 ; then echo "${run_time}" > "${runtime_log}" - for dir in "{{server_home}}"/doorman/globus.*; do + for dir in "{{server_home}}"/doorman/globus_*.*; do flist=$(find "${dir}" -executable -name "run_doorman.sh") for script in ${flist}; do # Check if the corresponding log has already been written diff --git a/scripts/exglobal_globus.py b/scripts/exglobal_globus_arch.py similarity index 100% rename from scripts/exglobal_globus.py rename to scripts/exglobal_globus_arch.py diff --git a/workflow/applications/gfs_cycled.py b/workflow/applications/gfs_cycled.py index 2338b28f4d..04412b8edb 100644 --- a/workflow/applications/gfs_cycled.py +++ b/workflow/applications/gfs_cycled.py @@ -155,6 +155,9 @@ def _get_app_configs(self, run): if options['do_globusarch']: configs += ['globus'] + # TODO Enable when the globus archiving feature is available for ensembles + # if options['do_hybvar']: + # configs += ['globus_earc'] return configs @@ -337,6 +340,11 @@ def get_task_names(self): task_names[run] += ['stage_ic', 'ecen', 'esfc'] if options['do_archtar']: task_names[run] += ['earc_tars'] + + # TODO Uncomment when globus ensemble archiving is ready + # if options['do_globusarch']: + # task_names[run] += ['globus_earc'] + task_names[run] += ['earc_vrfy', 'cleanup'] return task_names diff --git a/workflow/rocoto/gefs_tasks.py b/workflow/rocoto/gefs_tasks.py index 5a8e0ae2be..69504f31b9 100644 --- a/workflow/rocoto/gefs_tasks.py +++ b/workflow/rocoto/gefs_tasks.py @@ -636,13 +636,13 @@ def globus(self): dependencies = rocoto.create_dependency(dep=deps) resources = self.get_resource('globus') - task_name = 'globus' + task_name = 'globus_arch' task_dict = {'task_name': task_name, 'resources': resources, 'envars': self.envars, 'cycledef': 'gefs', 'dependency': dependencies, - 'command': f'{self.HOMEgfs}/jobs/rocoto/globus.sh', + 'command': f'{self.HOMEgfs}/jobs/rocoto/globus_arch.sh', 'job_name': f'{self.pslot}_{task_name}_@H', 'log': f'{self.rotdir}/logs/@Y@m@d@H/{task_name}.log', 'maxtries': '&MAXTRIES;' @@ -656,7 +656,7 @@ def cleanup(self): deps = [] if self.options['do_archtar']: if self.options['do_globusarch']: - dep_dict = {'type': 'task', 'name': 'gefs_globus'} + dep_dict = {'type': 'task', 'name': 'gefs_globus_arch'} else: dep_dict = {'type': 'task', 'name': 'gefs_arch_tars'} diff --git a/workflow/rocoto/gfs_tasks.py b/workflow/rocoto/gfs_tasks.py index 0c13933c52..29ff2e37b8 100644 --- a/workflow/rocoto/gfs_tasks.py +++ 
b/workflow/rocoto/gfs_tasks.py @@ -2447,20 +2447,20 @@ def arch_tars(self): return task # Globus transfer for HPSS archiving - def globus(self): + def globus_arch(self): deps = [] dep_dict = {'type': 'task', 'name': f'{self.run}_arch_tars'} deps.append(rocoto.add_dependency(dep_dict)) dependencies = rocoto.create_dependency(dep=deps) resources = self.get_resource('globus') - task_name = f'{self.run}_globus' + task_name = f'{self.run}_globus_arch' task_dict = {'task_name': task_name, 'resources': resources, 'dependency': dependencies, 'envars': self.envars, 'cycledef': self.run.replace('enkf', ''), - 'command': f'{self.HOMEgfs}/jobs/rocoto/globus.sh', + 'command': f'{self.HOMEgfs}/jobs/rocoto/globus_arch.sh', 'job_name': f'{self.pslot}_{task_name}_@H', 'log': f'{self.rotdir}/logs/@Y@m@d@H/{task_name}.log', 'maxtries': '&MAXTRIES;' @@ -2471,7 +2471,7 @@ def globus(self): return task # Ensemble globus transfer for HPSS archiving - def ens_group_globus(self): + def globus_earc(self): deps = [] dep_dict = {'type': 'metatask', 'name': f'{self.run}_earc_tars'} deps.append(rocoto.add_dependency(dep_dict)) @@ -2481,22 +2481,22 @@ def ens_group_globus(self): n_groups = -(self.nmem // -self._configs['earc']['NMEM_EARCGRP']) groups = ' '.join([f'{grp:02d}' for grp in range(0, n_groups + 1)]) - resources = self.get_resource('ens_group_globus') + resources = self.get_resource('globus') var_dict = {'grp': groups} - task_name = f'{self.run}_ens_globus' + task_name = f'{self.run}_globus_earc' task_dict = {'task_name': task_name, 'resources': resources, 'dependency': dependencies, 'envars': self.envars, 'cycledef': self.run.replace('enkf', ''), - 'command': f'{self.HOMEgfs}/jobs/rocoto/globus.sh', + 'command': f'{self.HOMEgfs}/jobs/rocoto/globus_earc.sh', 'job_name': f'{self.pslot}_{task_name}_@H', 'log': f'{self.rotdir}/logs/@Y@m@d@H/{task_name}.log', 'maxtries': '&MAXTRIES;' } - metatask_dict = {'task_name': f'{self.run}_eglobus', + metatask_dict = {'task_name': f'{self.run}_ens_globus_arch', 'var_dict': var_dict, 'task_dict': task_dict } @@ -2513,7 +2513,7 @@ def cleanup(self): deps.append(rocoto.add_dependency(dep_dict)) if self.options['do_archtar']: if self.options['do_globusarch']: - dep_dict = {'type': 'metatask', 'name': f'{self.run}_ens_globus'} + dep_dict = {'type': 'metatask', 'name': f'{self.run}_ens_globus_arch'} else: dep_dict = {'type': 'metatask', 'name': f'{self.run}_earc_tars'} @@ -2524,7 +2524,7 @@ def cleanup(self): deps.append(rocoto.add_dependency(dep_dict)) if self.options['do_archtar']: if self.options['do_globusarch']: - dep_dict = {'type': 'task', 'name': f'{self.run}_globus'} + dep_dict = {'type': 'task', 'name': f'{self.run}_globus_arch'} else: dep_dict = {'type': 'task', 'name': f'{self.run}_arch_tars'} deps.append(rocoto.add_dependency(dep_dict)) @@ -3135,7 +3135,7 @@ def earc_tars(self): n_groups = -(self.nmem // -self._configs['earc_tars']['NMEM_EARCGRP']) groups = ' '.join([f'{grp:02d}' for grp in range(0, n_groups + 1)]) - resources = self.get_resource('earc_tars') + resources = self.get_resource('arch_tars') var_dict = {'grp': groups} From 96f354e27bcb89a40e688cf6e301ccd7b711e95d Mon Sep 17 00:00:00 2001 From: David Huber Date: Mon, 10 Feb 2025 14:00:05 -0600 Subject: [PATCH 83/83] Delete the run direcctory on Niagara when complete --- ush/python/pygfs/task/globus_hpss.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/ush/python/pygfs/task/globus_hpss.py b/ush/python/pygfs/task/globus_hpss.py index a69c8aa63b..3d9380f1de 
100644
--- a/ush/python/pygfs/task/globus_hpss.py
+++ b/ush/python/pygfs/task/globus_hpss.py
@@ -237,10 +237,10 @@ def execute_transfer_data(self, transfer_set: Dict[str, Any], has_rstprod: bool)
         os.chmod("run_doorman.sh", 0o740)
         os.chmod("init_xfer.sh", 0o740)
 
-        server_job_dir = transfer_set["server_job_dir"]
+        self.server_job_dir = transfer_set["server_job_dir"]
 
         # Initialize the server
-        self._init_server(server_job_dir)
+        self._init_server()
 
         server_name = self.task_config.SERVER_NAME
 
@@ -367,13 +367,13 @@ def execute_transfer_data(self, transfer_set: Dict[str, Any], has_rstprod: bool)
         return
 
     @logit(logger)
-    def _init_server(self, job_dir: str):
+    def _init_server(self):
 
         # This method sends a request to create a working directory and transfers
         # the initialization script.
         req_file = f"req_mkdir.{self.task_config.jobid}"
-        with open(f"req_mkdir.{self.task_config.jobid}", "w") as mkdir_f:
-            mkdir_f.write(f"{job_dir}")
+        with open(req_file, "w") as mkdir_f:
+            mkdir_f.write(f"{self.server_job_dir}")
 
         server_name = self.task_config.SERVER_NAME
         server_home = self.task_config.server_home
@@ -419,4 +419,22 @@ def clean(self):
         Remove the temporary directories/files created by the GlobusHpss task.
         """
 
+        # Write a request to delete the working directory on Niagara
+        req_file = f"req_rmdir.{self.task_config.jobid}"
+        with open(req_file, "w") as rmdir_f:
+            rmdir_f.write(f"{self.server_job_dir}")
+
+        self.scp(req_file, f"{self.task_config.SERVER_NAME}:{self.task_config.server_home}/{req_file}")
+
+        logger.info("Sleeping 5 minutes to give the server time to delete the working directory")
+        # It probably takes much less time than this, but it may take a little while at high res
+        sleep(300)
+
+        # If it was successful, then the request should be gone
+        try:
+            self.scp(f"{self.task_config.SERVER_NAME}:{self.task_config.server_home}/{req_file}", ".")
+            raise RuntimeError(f"FATAL ERROR Failed to delete the run directory on {self.task_config.SERVER_NAME}")
+        except ProcessError:
+            pass
+
         return