diff --git a/include/fenix.h b/include/fenix.h
index 46e7542..67be4b5 100644
--- a/include/fenix.h
+++ b/include/fenix.h
@@ -146,6 +146,30 @@ typedef enum {
FENIX_ROLE_SURVIVOR_RANK = 2
} Fenix_Rank_role;
+/**
+ * @brief Options for passing control back to the application after recovery, selected by name through the "FENIX_RESUME_MODE" info key.
+ */
+typedef enum {
+ //!Resume at the exit of Fenix_Init via longjmp (default)
+ JUMP,
+ //!Do not jump; the error handler returns and the error code is reported inline
+ RETURN,
+ //!Throw a fenix::CommException carrying the user communicator and the error code
+ THROW
+} Fenix_Resume_mode;
+
+/**
+ * @brief Options for dealing with 'unhandled' MPI errors, i.e. errors unrelated to failed processes (e.g. invalid rank IDs); selected by name through the "FENIX_UNHANDLED_MODE" info key.
+ */
+typedef enum {
+ //!Ignore unhandled errors
+ SILENT,
+ //!Print the MPI error string to stderr and continue without handling
+ PRINT,
+ //!Print the MPI error string to stderr and abort Fenix's world (default)
+ ABORT
+} Fenix_Unhandled_mode;
+
/**
* @fn void Fenix_Init(int* role, MPI_Comm comm, MPI_Comm* newcomm, int** argc, char*** argv, int spare_ranks, int spawn, MPI_Info info, int* error);
* @brief Build a resilient communicator and set the restart point.
@@ -197,14 +221,13 @@ typedef enum {
* @param[in] spawn *Unimplemented*: Whether to enable spawning new ranks to replace
* failed ranks when spares are unavailable.
* @param[in] info Fenix recovery configuration parameters, may be MPI_INFO_NULL
- * Supports the "FENIX_RESUME_MODE" key, used to indicate where execution should resume upon
+ * "FENIX_RESUME_MODE" key is used to indicate where execution should resume upon
* rank failure for all active (non-spare) ranks in any resilient communicators, not only for
- * those ranks in communicators that failed. The following values associated with the
- * "resume_mode" key are supported:
- * - "Fenix_init" (default): execution resumes at logical exit of Fenix_Init.
- * - "NO_JUMP": execution continues from the failing MPI call. Errors are otherwise handled
- * as normal, but return the error code as well. Applications should typically
- * either check for return codes or assign an error callback through Fenix.
+ * those ranks in communicators that failed. The value must be the name of a
+ * Fenix_Resume_mode enum value: "JUMP" (default), "RETURN", or "THROW".
+ * "FENIX_UNHANDLED_MODE" key is used to indicate how Fenix should handle error values
+ * returned by MPI functions that are unrelated to failed processes. The value must be the
+ * name of a Fenix_Unhandled_mode enum value: "SILENT", "PRINT", or "ABORT" (default). An unrecognized value for either key aborts.
* @param[out] error The return status of \c Fenix_Init
* Used to signal that a non-fatal error or special condition was encountered in the execution of
* Fenix_Init, or FENIX_SUCCESS otherwise. It has the same value across all ranks released by
@@ -221,10 +244,8 @@ typedef enum {
*(_role) = __fenix_preinit(_role, _comm, _newcomm, _argc, \
_argv, _spare_ranks, _spawn, _info, \
_error, &bufjmp); \
- if(setjmp(bufjmp)) { \
- *(_role) = FENIX_ROLE_SURVIVOR_RANK; \
- } \
- __fenix_postinit( _error ); \
+ setjmp(bufjmp); \
+ __fenix_postinit(); \
}
diff --git a/include/fenix.hpp b/include/fenix.hpp
index 1549247..0234fe3 100644
--- a/include/fenix.hpp
+++ b/include/fenix.hpp
@@ -72,18 +72,4 @@
*/
int Fenix_Callback_register(std::function callback);
-namespace fenix {
-
-/**
- * @brief Registers a callback that throws a CommException
- *
- * This means no longjmp will occur, and instead applications
- * will continue from their try-catch error handler.
- *
- * @returnstatus
- */
-int register_exception_callback();
-
-} // namespace fenix
-
#endif
diff --git a/include/fenix_ext.hpp b/include/fenix_ext.hpp
index 4977a4a..9cd79a0 100644
--- a/include/fenix_ext.hpp
+++ b/include/fenix_ext.hpp
@@ -66,47 +66,44 @@
typedef struct __fenix_data_recovery fenix_data_recovery_t;
typedef struct {
- int num_inital_ranks; // Keeps the global MPI rank ID at Fenix_init
- int num_survivor_ranks; // Keeps the global information on the number of survived MPI ranks after failure
- int num_recovered_ranks; // Keeps the number of spare ranks brought into MPI communicator recovery
- int resume_mode; // Defines how program resumes after process recovery
- int spawn_policy; // Indicate dynamic process spawning
- int spare_ranks; // Spare ranks entered by user to repair failed ranks
- int repair_result; // Internal global variable to store the result of MPI communicator repair
- int finalized;
+ int num_inital_ranks; // Size of new_world as recorded by non-spare ranks in __fenix_preinit
+ int num_survivor_ranks = 0; // Keeps the global information on the number of survived MPI ranks after failure
+ int num_recovered_ranks = 0; // Keeps the number of spare ranks brought into MPI communicator recovery
+ int spare_ranks; // Spare ranks entered by user to repair failed ranks
+
+ int resume_mode = Fenix_Resume_mode::JUMP; // Fenix_Resume_mode: how control returns to the application after recovery
+ int unhandled_mode = Fenix_Unhandled_mode::ABORT; // Fenix_Unhandled_mode: reaction to MPI errors unrelated to process failure
+ int ignore_errs = false; // Temporarily ignore all errors & recovery
+ int spawn_policy; // Indicate dynamic process spawning
jmp_buf *recover_environment; // Calling environment to fill the jmp_buf structure
+ int repair_result = FENIX_SUCCESS; // Internal variable to store the result of MPI comm repair
+ int role = FENIX_ROLE_INITIAL_RANK; // Fenix_Rank_role of this rank
- //enum FenixRankRole role; // Role of rank: initial, survivor or repair
- int role; // Role of rank: initial, survivor or repair
- int fenix_init_flag = 0;
+ int fenix_init_flag = false; // Set to true in __fenix_preinit, cleared in __fenix_finalize_spare
+ int finalized = false; // Set to true by __fenix_finalize; once set, recovery no longer resumes the application
- int fail_world_size;
- int* fail_world;
+ int fail_world_size = 0;
+ int* fail_world = nullptr;
//Save the pointer to role and error of Fenix_Init
- int *ret_role;
- int *ret_error;
+ int *ret_role = nullptr;
+ int *ret_error = nullptr;
std::vector callbacks;
- fenix_debug_opt_t options; // This is reserved to store the user options
+ fenix_debug_opt_t options; // This is reserved to store the user options
- MPI_Comm *world; // Duplicate of the MPI communicator provided by user
- MPI_Comm new_world; // Global MPI communicator identical to g_world but without spare ranks
- MPI_Comm *user_world; // MPI communicator with repaired ranks
- //Manage state of the comms. Necessary when failures happen rapidly, mussing up state
- int new_world_exists, user_world_exists;
-
+ MPI_Comm *world; // Duplicate of comm provided by user
+ MPI_Comm *user_world; // User-facing comm with repaired ranks and no spares
+ MPI_Comm new_world; // Internal duplicate of user_world
+ int new_world_exists = false, user_world_exists = false; // State of the comms; needed when failures happen in rapid succession
+
+ //Values used for Fenix_Process_detect_failures
int dummy_recv_buffer;
MPI_Request check_failures_req;
-
- MPI_Op agree_op; // This is reserved for the global agreement call for Fenix data recovery API
-
-
- MPI_Errhandler mpi_errhandler; // This stores callback info for our custom error handler
- int ignore_errs; // Set this to return errors instead of using the error handler normally. (Don't forget to unset!)
- int print_unhandled; // Set this to print the error string for MPI errors of an unhandled return type.
+ MPI_Op agree_op; // Global agreement call for Fenix data recovery API
+ MPI_Errhandler mpi_errhandler; // Our custom error handler
fenix_data_recovery_t *data_recovery; // Global pointer for Fenix Data Recovery Data Structure
} fenix_t;
diff --git a/include/fenix_init.h b/include/fenix_init.h
index c4ca69b..19471d1 100644
--- a/include/fenix_init.h
+++ b/include/fenix_init.h
@@ -67,7 +67,7 @@ extern "C" {
int __fenix_preinit(int *, MPI_Comm, MPI_Comm *, int *, char ***, int, int, MPI_Info, int *, jmp_buf *);
-void __fenix_postinit(int *);
+void __fenix_postinit();
#if defined(c_plusplus) || defined(__cplusplus)
}
diff --git a/include/fenix_opt.hpp b/include/fenix_opt.hpp
index b032b02..2fb34d4 100644
--- a/include/fenix_opt.hpp
+++ b/include/fenix_opt.hpp
@@ -78,7 +78,7 @@
do { printf("%s(): " fmt, __func__, __VA_ARGS__); } while (0)
typedef struct __fenix_debug_opt_t {
- int verbose;
+ int verbose = -1;
} fenix_debug_opt_t;
diff --git a/include/fenix_process_recovery.hpp b/include/fenix_process_recovery.hpp
index f6ad346..760617f 100644
--- a/include/fenix_process_recovery.hpp
+++ b/include/fenix_process_recovery.hpp
@@ -65,13 +65,11 @@
#include
#include
#include
+#include
#include "fenix_init.h"
#include
-#define __FENIX_RESUME_AT_INIT 0
-#define __FENIX_RESUME_NO_JUMP 200
-
using fenix_callback_func = std::function;
typedef struct __fenix_comm_list_elm {
@@ -85,6 +83,10 @@ typedef struct {
fenix_comm_list_elm_t *tail;
} fenix_comm_list_t;
+void __fenix_set_resume_mode(const std::string_view& name);
+
+void __fenix_set_unhandled_mode(const std::string_view& name);
+
int __fenix_create_new_world();
int __fenix_repair_ranks();
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 50a0233..87c3305 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -16,7 +16,6 @@ FILE(GLOB Fenix_HEADERS ${CMAKE_SOURCE_DIR}/include/*.h*)
set (Fenix_SOURCES
fenix.cpp
-fenix_exception.cpp
fenix_opt.cpp
fenix_process_recovery.cpp
fenix_util.cpp
diff --git a/src/fenix_exception.cpp b/src/fenix_exception.cpp
deleted file mode 100644
index 500f433..0000000
--- a/src/fenix_exception.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-#include "fenix_exception.hpp"
-#include "fenix.h"
-
-namespace fenix {
-
-int register_exception_callback(){
- return Fenix_Callback_register(
- [](MPI_Comm repaired_comm, int fen_err){
- throw CommException(repaired_comm, fen_err);
- }
- );
-}
-
-} // namespace fenix
diff --git a/src/fenix_process_recovery.cpp b/src/fenix_process_recovery.cpp
index eb3b6c2..1da031c 100644
--- a/src/fenix_process_recovery.cpp
+++ b/src/fenix_process_recovery.cpp
@@ -67,15 +67,15 @@
#include
+using namespace fenix;
+
int __fenix_preinit(int *role, MPI_Comm comm, MPI_Comm *new_comm, int *argc, char ***argv,
int spare_ranks,
int spawn,
MPI_Info info, int *error, jmp_buf *jump_environment)
{
-
- int ret;
*role = fenix_rt.role;
- *error = 0;
+ *error = FENIX_SUCCESS;
fenix_rt.user_world = new_comm;
@@ -85,79 +85,25 @@ int __fenix_preinit(int *role, MPI_Comm comm, MPI_Comm *new_comm, int *argc, cha
MPI_Comm_dup(comm, fenix_rt.world);
PMPI_Comm_set_errhandler(*fenix_rt.world, fenix_rt.mpi_errhandler);
- fenix_rt.finalized = 0;
fenix_rt.spare_ranks = spare_ranks;
fenix_rt.spawn_policy = spawn;
fenix_rt.recover_environment = jump_environment;
- fenix_rt.role = FENIX_ROLE_INITIAL_RANK;
- fenix_rt.fail_world_size = 0;
- fenix_rt.ignore_errs = 0;
- fenix_rt.resume_mode = __FENIX_RESUME_AT_INIT;
- fenix_rt.repair_result = 0;
fenix_rt.ret_role = role;
fenix_rt.ret_error = error;
- fenix_rt.options.verbose = -1;
- // __fenix_init_opt(*argc, *argv);
-
- // For request tracking, make sure we can save at least an integer
- // in MPI_Request
- if(sizeof(MPI_Request) < sizeof(int)) {
- fprintf(stderr, "FENIX ERROR: __fenix_preinit: sizeof(MPI_Request) < sizeof(int)!\n");
- MPI_Abort(comm, -1);
- }
-
-
MPI_Op_create((MPI_User_function *) __fenix_ranks_agree, 1, &fenix_rt.agree_op);
/* Check the values in info */
if (info != MPI_INFO_NULL) {
- char value[MPI_MAX_INFO_VAL + 1];
- int vallen = MPI_MAX_INFO_VAL;
- int flag;
-
- MPI_Info_get(info, "FENIX_RESUME_MODE", vallen, value, &flag);
- if (flag == 1) {
- if (strcmp(value, "Fenix_init") == 0) {
- fenix_rt.resume_mode = __FENIX_RESUME_AT_INIT;
- if (fenix_rt.options.verbose == 0) {
- verbose_print("rank: %d, role: %d, value: %s\n",
- __fenix_get_current_rank(*fenix_rt.world), fenix_rt.role, value);
- }
- } else if (strcmp(value, "NO_JUMP") == 0) {
- fenix_rt.resume_mode = __FENIX_RESUME_NO_JUMP;
- if (fenix_rt.options.verbose == 0) {
- verbose_print("rank: %d, role: %d, value: %s\n",
- __fenix_get_current_rank(*fenix_rt.world), fenix_rt.role, value);
- }
+ constexpr int len = MPI_MAX_INFO_VAL;
+ char value[len + 1];
+ int found;
- } else {
- /* No support. Setting it to Fenix_init */
- fenix_rt.resume_mode = __FENIX_RESUME_AT_INIT;
- }
- }
-
+ MPI_Info_get(info, "FENIX_RESUME_MODE", len, value, &found);
+ if (found) __fenix_set_resume_mode(value);
- MPI_Info_get(info, "FENIX_UNHANDLED_MODE", vallen, value, &flag);
- if (flag == 1) {
- if (strcmp(value, "SILENT") == 0) {
- fenix_rt.print_unhandled = 0;
- if (fenix_rt.options.verbose == 0) {
- verbose_print("rank: %d, role: %d, UNHANDLED_MODE: %s\n",
- __fenix_get_current_rank(*fenix_rt.world), fenix_rt.role, value);
- }
- } else if (strcmp(value, "NO_JUMP") == 0) {
- fenix_rt.print_unhandled = 1;
- if (fenix_rt.options.verbose == 0) {
- verbose_print("rank: %d, role: %d, UNHANDLED_MODE: %s\n",
- __fenix_get_current_rank(*fenix_rt.world), fenix_rt.role, value);
- }
-
- } else {
- /* No support. Setting it to silent */
- fenix_rt.print_unhandled = 0;
- }
- }
+ MPI_Info_get(info, "FENIX_UNHANDLED_MODE", len, value, &found);
+ if (found) __fenix_set_unhandled_mode(value);
}
if (fenix_rt.spare_ranks >= __fenix_get_world_size(comm)) {
@@ -175,13 +121,8 @@ int __fenix_preinit(int *role, MPI_Comm comm, MPI_Comm *new_comm, int *argc, cha
/* trigger an abort. */
/*****************************************************/
- ret = 1;
- while (ret) {
- ret = __fenix_create_new_world();
- if (ret) {
- // just_repair_process();
- }
- }
+ //Try to create new_world until success
+ while (__fenix_create_new_world());
if ( __fenix_spare_rank() != 1) {
fenix_rt.num_inital_ranks = __fenix_get_world_size(fenix_rt.new_world);
@@ -201,17 +142,16 @@ int __fenix_preinit(int *role, MPI_Comm comm, MPI_Comm *new_comm, int *argc, cha
}
}
- fenix_rt.num_survivor_ranks = 0;
- fenix_rt.num_recovered_ranks = 0;
+ fenix_rt.fenix_init_flag = true;
while ( __fenix_spare_rank() == 1) {
int a;
int myrank;
MPI_Status mpi_status;
- fenix_rt.ignore_errs = 1;
- ret = PMPI_Recv(&a, 1, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG, *fenix_rt.world,
- &mpi_status); // listen for a failure
- fenix_rt.ignore_errs = 0;
+ fenix_rt.ignore_errs = true;
+ int ret = PMPI_Recv(&a, 1, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG,
+ *fenix_rt.world, &mpi_status);
+ fenix_rt.ignore_errs = false;
if (ret == MPI_SUCCESS) {
if (fenix_rt.options.verbose == 0) {
verbose_print("Finalize the program; rank: %d, role: %d\n",
@@ -232,11 +172,37 @@ int __fenix_preinit(int *role, MPI_Comm comm, MPI_Comm *new_comm, int *argc, cha
if(fenix_rt.role != FENIX_ROLE_RECOVERED_RANK) MPI_Comm_dup(fenix_rt.new_world, fenix_rt.user_world);
- fenix_rt.user_world_exists = 1;
+ fenix_rt.user_world_exists = true;
return fenix_rt.role;
}
+void __fenix_set_resume_mode(const std::string_view& name){ //Set fenix_rt.resume_mode from the name of a Fenix_Resume_mode value
+    if (name == "JUMP") {
+        fenix_rt.resume_mode = Fenix_Resume_mode::JUMP;
+    } else if (name == "RETURN") {
+        fenix_rt.resume_mode = Fenix_Resume_mode::RETURN;
+    } else if (name == "THROW") {
+        fenix_rt.resume_mode = Fenix_Resume_mode::THROW;
+    } else { //Unknown name: report it and abort Fenix's world
+        fprintf(stderr, "Unsupported FENIX_RESUME_MODE %.*s\n", static_cast<int>(name.size()), name.data()); //string_view need not be NUL-terminated, so bound the print by its length
+        MPI_Abort(*fenix_rt.world, 1);
+    }
+}
+
+void __fenix_set_unhandled_mode(const std::string_view& name){ //Set fenix_rt.unhandled_mode from the name of a Fenix_Unhandled_mode value
+    if (name == "SILENT") {
+        fenix_rt.unhandled_mode = Fenix_Unhandled_mode::SILENT; //Must write unhandled_mode; writing resume_mode here would silently change the resume behaviour instead
+    } else if (name == "PRINT") {
+        fenix_rt.unhandled_mode = Fenix_Unhandled_mode::PRINT;
+    } else if (name == "ABORT") {
+        fenix_rt.unhandled_mode = Fenix_Unhandled_mode::ABORT;
+    } else { //Unknown name: report it and abort Fenix's world
+        fprintf(stderr, "Unsupported FENIX_UNHANDLED_MODE %.*s\n", static_cast<int>(name.size()), name.data()); //string_view need not be NUL-terminated, so bound the print by its length
+        MPI_Abort(*fenix_rt.world, 1);
+    }
+}
+
int __fenix_spare_rank_within(MPI_Comm refcomm)
{
int result = -1;
@@ -271,7 +237,6 @@ int __fenix_create_new_world_from(MPI_Comm from_comm)
ret = PMPI_Comm_split(from_comm, MPI_UNDEFINED, current_rank,
&fenix_rt.new_world);
- //if (ret != MPI_SUCCESS) { debug_print("MPI_Comm_split: %d\n", ret); }
fenix_rt.new_world_exists = 0; //Should already be this
} else {
@@ -334,10 +299,9 @@ int __fenix_repair_ranks()
}
while (!repair_success) {
-
repair_success = 1;
+
ret = MPIX_Comm_shrink(*fenix_rt.world, &world_without_failures);
- //if (ret != MPI_SUCCESS) { debug_print("MPI_Comm_shrink. repair_ranks\n"); }
if (ret != MPI_SUCCESS) {
repair_success = 0;
goto END_LOOP;
@@ -404,7 +368,6 @@ int __fenix_repair_ranks()
}
}
- //if (ret != MPI_SUCCESS) { debug_print("MPI_Allgather. repair_ranks\n"); }
if (ret != MPI_SUCCESS) {
repair_success = 0;
if (ret == MPI_ERR_PROC_FAILED) {
@@ -423,7 +386,6 @@ int __fenix_repair_ranks()
ret = PMPI_Allreduce(&survived_flag, &fenix_rt.num_survivor_ranks, 1,
MPI_INT, MPI_SUM, world_without_failures);
- //if (ret != MPI_SUCCESS) { debug_print("MPI_Allreduce. repair_ranks\n"); }
if (ret != MPI_SUCCESS) {
repair_success = 0;
if (ret == MPI_ERR_PROC_FAILED) {
@@ -489,8 +451,6 @@ int __fenix_repair_ranks()
/* Update the number of spare ranks */
/************************************/
fenix_rt.spare_ranks = 0;
-
- //debug_print("not enough spare ranks to repair rank failures. repair_ranks\n");
}
/****************************************************************/
@@ -506,7 +466,6 @@ int __fenix_repair_ranks()
ret = PMPI_Allgather(¤t_rank, 1, MPI_INT, survivor_world, 1, MPI_INT,
world_without_failures);
- //if (ret != MPI_SUCCESS) { debug_print("MPI_Allgather. repair_ranks\n"); }
if (ret != MPI_SUCCESS) {
repair_success = 0;
if (ret == MPI_ERR_PROC_FAILED) {
@@ -524,7 +483,6 @@ int __fenix_repair_ranks()
ret = PMPI_Allreduce(&survived_flag, &fenix_rt.num_survivor_ranks, 1,
MPI_INT, MPI_SUM, world_without_failures);
- //if (ret != MPI_SUCCESS) { debug_print("MPI_Allreduce. repair_ranks\n"); }
if (ret != MPI_SUCCESS) {
repair_success = 0;
if (ret != MPI_ERR_PROC_FAILED) {
@@ -625,7 +583,6 @@ int __fenix_repair_ranks()
}
ret = PMPI_Barrier(fixed_world);
- /* if (ret != MPI_SUCCESS) { debug_print("MPI_Barrier. repair_ranks\n"); } */
if (ret != MPI_SUCCESS) {
repair_success = 0;
MPIX_Comm_revoke(fixed_world);
@@ -635,16 +592,6 @@ int __fenix_repair_ranks()
END_LOOP:
num_try++;
-
- /*******************************************************/
- /*** Not sure if we should include verbose statement ***/
- /*******************************************************/
-
-/*
- if (current_rank == FENIX_ROOT) {
- LDEBUG("Fenix: communicators repaired\n");
- }
-*/
}
*fenix_rt.world = fixed_world;
@@ -676,13 +623,10 @@ int __fenix_spare_rank(){
return __fenix_spare_rank_within(*fenix_rt.world);
}
-void __fenix_postinit(int *error)
+void __fenix_postinit()
{
-
- //if (fenix_rt.options.verbose == 9) {
- // verbose_print(" postinit: current_rank: %d, role: %d\n", __fenix_get_current_rank(fenix_rt.new_world),
- // fenix_rt.role);
- //}
+ *fenix_rt.ret_role = fenix_rt.role;
+ *fenix_rt.ret_error = fenix_rt.repair_result;
if(fenix_rt.new_world_exists){
//Set up dummy irecv to use for checking for failures.
@@ -690,19 +634,8 @@ void __fenix_postinit(int *error)
34095347, fenix_rt.new_world, &fenix_rt.check_failures_req);
}
- if (fenix_rt.repair_result != 0) {
- *error = fenix_rt.repair_result;
- }
- fenix_rt.fenix_init_flag = 1;
-
-#if 0
- if (fenix_rt.role != FENIX_ROLE_INITIAL_RANK) {
- init_data_recovery();
- }
-#endif
-
if (fenix_rt.role == FENIX_ROLE_SURVIVOR_RANK) {
- __fenix_callback_invoke_all(*error);
+ __fenix_callback_invoke_all(*fenix_rt.ret_error);
}
if (fenix_rt.options.verbose == 9) {
verbose_print("After barrier. current_rank: %d, role: %d\n", __fenix_get_current_rank(fenix_rt.new_world),
@@ -742,7 +675,7 @@ void __fenix_finalize()
int last_spare_rank = __fenix_get_world_size(*fenix_rt.world) - 1;
//If we've reached here, we will finalized regardless of further errors.
- fenix_rt.ignore_errs = 1;
+ fenix_rt.ignore_errs = true;
while(!fenix_rt.finalized){
int user_rank = __fenix_get_current_rank(*fenix_rt.user_world);
@@ -765,7 +698,7 @@ void __fenix_finalize()
} else {
//If rank 0 did contribute, we know sends made it, and regardless
//of any other failures we finalize.
- fenix_rt.finalized = 1;
+ fenix_rt.finalized = true;
}
}
@@ -794,7 +727,7 @@ void __fenix_finalize()
void __fenix_finalize_spare()
{
- fenix_rt.fenix_init_flag = 0;
+ fenix_rt.fenix_init_flag = false;
int unused;
MPI_Request agree_req, recv_req = MPI_REQUEST_NULL;
@@ -842,45 +775,59 @@ void __fenix_test_MPI(MPI_Comm *pcomm, int *pret, ...)
}
switch (ret) {
- case MPI_ERR_PROC_FAILED_PENDING:
- case MPI_ERR_PROC_FAILED:
- MPIX_Comm_revoke(*fenix_rt.world);
- MPIX_Comm_revoke(fenix_rt.new_world);
-
- if(fenix_rt.user_world_exists) MPIX_Comm_revoke(*fenix_rt.user_world);
-
-
- fenix_rt.repair_result = __fenix_repair_ranks();
- break;
- case MPI_ERR_REVOKED:
- fenix_rt.repair_result = __fenix_repair_ranks();
- break;
- case MPI_ERR_INTERN:
- printf("Fenix detected error: MPI_ERR_INTERN\n");
- default:
- if(fenix_rt.print_unhandled){
+ case MPI_ERR_PROC_FAILED_PENDING:
+ case MPI_ERR_PROC_FAILED:
+ MPIX_Comm_revoke(*fenix_rt.world);
+ MPIX_Comm_revoke(fenix_rt.new_world);
+
+ if(fenix_rt.user_world_exists) MPIX_Comm_revoke(*fenix_rt.user_world);
+
+ fenix_rt.repair_result = __fenix_repair_ranks();
+ break;
+ case MPI_ERR_REVOKED:
+ fenix_rt.repair_result = __fenix_repair_ranks();
+ break;
+ default:
int len;
char errstr[MPI_MAX_ERROR_STRING];
MPI_Error_string(ret, errstr, &len);
- fprintf(stderr, "UNHANDLED ERR: %s\n", errstr);
- }
- return;
- break;
+ switch (fenix_rt.unhandled_mode) {
+ case ABORT:
+ fprintf(stderr, "UNHANDLED ERR: %s\n", errstr);
+ MPI_Abort(*fenix_rt.world, 1);
+ break;
+ case PRINT:
+ fprintf(stderr, "UNHANDLED ERR: %s\n", errstr);
+ break;
+ case SILENT:
+ break;
+ default:
+ printf(
+ "Fenix internal error: Unknown unhandled mode %d\n",
+ fenix_rt.unhandled_mode
+ );
+ assert(false);
+ break;
+ }
+ return;
+ break;
}
-
fenix_rt.role = FENIX_ROLE_SURVIVOR_RANK;
+ __fenix_postinit();
if(!fenix_rt.finalized) {
switch(fenix_rt.resume_mode) {
- case __FENIX_RESUME_AT_INIT:
+ case JUMP:
longjmp(*fenix_rt.recover_environment, 1);
break;
- case __FENIX_RESUME_NO_JUMP:
- *(fenix_rt.ret_role) = FENIX_ROLE_SURVIVOR_RANK;
- __fenix_postinit(fenix_rt.ret_error);
+ case RETURN:
+ break;
+ case THROW:
+ throw CommException(*fenix_rt.user_world, *fenix_rt.ret_error);
break;
default:
- printf("Fenix detected error: Unknown resume mode\n");
+ printf("Fenix internal error: Unknown resume mode %d\n",
+ fenix_rt.resume_mode);
assert(false);
break;
}
diff --git a/test/exception_throw/fenix_exceptions.cpp b/test/exception_throw/fenix_exceptions.cpp
index 8142b12..182abc7 100644
--- a/test/exception_throw/fenix_exceptions.cpp
+++ b/test/exception_throw/fenix_exceptions.cpp
@@ -72,12 +72,9 @@ int main(int argc, char **argv) {
MPI_Comm res_comm;
MPI_Info info;
MPI_Info_create(&info);
- MPI_Info_set(info, "FENIX_RESUME_MODE", "NO_JUMP");
- MPI_Info_set(info, "FENIX_UNHANDLED_MODE", "NO_JUMP");
+ MPI_Info_set(info, "FENIX_RESUME_MODE", "THROW");
Fenix_Init(&fenix_role, MPI_COMM_WORLD, &res_comm, &argc, &argv, 0, 0, info, &error);
- fenix::register_exception_callback();
-
if(fenix_role == FENIX_ROLE_SURVIVOR_RANK){
printf("FAILURE: longjmp instead of exception\n");
status = 1;
diff --git a/test/issend/fenix_issend_test.c b/test/issend/fenix_issend_test.c
index 212a7ae..23f3d85 100644
--- a/test/issend/fenix_issend_test.c
+++ b/test/issend/fenix_issend_test.c
@@ -87,7 +87,7 @@ int main(int argc, char **argv) {
MPI_Info info;
MPI_Info_create(&info);
- MPI_Info_set(info, "FENIX_RESUME_MODE", "NO_JUMP");
+ MPI_Info_set(info, "FENIX_RESUME_MODE", "RETURN");
int fenix_status;
int recovered = 0;
diff --git a/test/no_jump/fenix_no_jump_test.c b/test/no_jump/fenix_no_jump_test.c
index 31eb9f3..cf5d261 100644
--- a/test/no_jump/fenix_no_jump_test.c
+++ b/test/no_jump/fenix_no_jump_test.c
@@ -87,7 +87,7 @@ int main(int argc, char **argv) {
MPI_Info info;
MPI_Info_create(&info);
- MPI_Info_set(info, "FENIX_RESUME_MODE", "NO_JUMP");
+ MPI_Info_set(info, "FENIX_RESUME_MODE", "RETURN");
int fenix_status;
int recovered = 0;