Skip to content

Commit be25dcc

Browse files
Add Fenix_Process_detect_failures
1 parent f8d1873 commit be25dcc

File tree

5 files changed

+31
-0
lines changed

5 files changed

+31
-0
lines changed

include/fenix.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,8 @@ int Fenix_Process_fail_list(int** fail_list);
231231

232232
int Fenix_check_cancelled(MPI_Request *request, MPI_Status *status);
233233

234+
// Check for process failures.
// The definition in src/fenix.c forwards do_recovery to
// __fenix_detect_failures() and returns that function's result unchanged.
int Fenix_Process_detect_failures(int do_recovery);
235+
234236
#if defined(c_plusplus) || defined(__cplusplus)
235237
}
236238
#endif

include/fenix_ext.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,9 @@ typedef struct {
9696
//Manage state of the comms. Necessary when failures happen rapidly, mussing up state
9797
int new_world_exists, user_world_exists;
9898

99+
int dummy_recv_buffer;
100+
MPI_Request check_failures_req;
101+
99102

100103
MPI_Op agree_op; // This is reserved for the global agreement call for Fenix data recovery API
101104

include/fenix_process_recovery.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,8 @@ void __fenix_set_rank_role(int FenixRankRole);
118118

119119
void __fenix_postinit(int *);
120120

121+
// Non-blocking failure check built on a single MPI_Test of
// fenix.check_failures_req (the dummy receive posted in __fenix_postinit).
// fenix.ignore_errs is set to !do_recovery for the duration of the test and
// restored afterwards.
// Returns FENIX_ERROR_UNINITIALIZED when fenix.new_world does not exist,
// FENIX_ERROR_INTERN when the dummy receive unexpectedly completed, and
// otherwise the return code of MPI_Test.
int __fenix_detect_failures(int do_recovery);
122+
121123
void __fenix_finalize();
122124

123125
void __fenix_finalize_spare();

src/fenix.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,3 +209,7 @@ int Fenix_check_cancelled(MPI_Request *request, MPI_Status *status){
209209
//Request was (potentially) cancelled if ret is MPI_ERR_PROC_FAILED
210210
return ret == MPI_ERR_PROC_FAILED || ret == MPI_ERR_REVOKED;
211211
}
212+
213+
// Public wrapper: hands do_recovery to the internal failure check and
// passes its status code back to the caller untouched.
int Fenix_Process_detect_failures(int do_recovery){
    const int status = __fenix_detect_failures(do_recovery);
    return status;
}

src/fenix_process_recovery.c

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -686,6 +686,11 @@ void __fenix_postinit(int *error)
686686
// fenix.role);
687687
//}
688688

689+
if(fenix.new_world_exists){
690+
//Set up dummy irecv to use for checking for failures.
691+
MPI_Irecv(&fenix.dummy_recv_buffer, 1, MPI_INT, MPI_ANY_SOURCE,
692+
34095347, fenix.new_world, &fenix.check_failures_req);
693+
}
689694

690695
if (fenix.repair_result != 0) {
691696
*error = fenix.repair_result;
@@ -707,6 +712,21 @@ void __fenix_postinit(int *error)
707712
}
708713
}
709714

715+
int __fenix_detect_failures(int do_recovery){
716+
if(!fenix.new_world_exists) return FENIX_ERROR_UNINITIALIZED;
717+
718+
int old_ignore_errs = fenix.ignore_errs;
719+
fenix.ignore_errs = !do_recovery;
720+
721+
int req_completed;
722+
int ret = MPI_Test(&fenix.check_failures_req, &req_completed, MPI_STATUS_IGNORE);
723+
724+
if(req_completed) ret = FENIX_ERROR_INTERN;
725+
726+
fenix.ignore_errs = old_ignore_errs;
727+
return ret;
728+
}
729+
710730
void __fenix_finalize()
711731
{
712732
int location = FENIX_FINALIZE_LOC;

0 commit comments

Comments
 (0)