From 3fc52869dedd384f82aa788bc3c75e4648b684bc Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Fri, 25 Apr 2025 11:44:47 -0500 Subject: [PATCH] Expand Fenix config options --- include/fenix.h | 43 +++- include/fenix.hpp | 14 -- include/fenix_ext.hpp | 55 +++-- include/fenix_init.h | 2 +- include/fenix_opt.hpp | 2 +- include/fenix_process_recovery.hpp | 8 +- src/CMakeLists.txt | 1 - src/fenix_exception.cpp | 14 -- src/fenix_process_recovery.cpp | 239 +++++++++------------- test/exception_throw/fenix_exceptions.cpp | 5 +- test/issend/fenix_issend_test.c | 2 +- test/no_jump/fenix_no_jump_test.c | 2 +- 12 files changed, 161 insertions(+), 226 deletions(-) delete mode 100644 src/fenix_exception.cpp diff --git a/include/fenix.h b/include/fenix.h index 46e7542..67be4b5 100644 --- a/include/fenix.h +++ b/include/fenix.h @@ -146,6 +146,30 @@ typedef enum { FENIX_ROLE_SURVIVOR_RANK = 2 } Fenix_Rank_role; +/** + * @brief Options for passing control back to application after recovery. + */ +typedef enum { + //!Return to Fenix_Init via longjmp (default) + JUMP, + //!Return the error code inline + RETURN, + //!Throw a Fenix::CommException + THROW +} Fenix_Resume_mode; + +/** + * @brief Options for dealing with 'unhandled' errors, e.g. invalid rank IDs + */ +typedef enum { + //!Ignore unhandled errors + SILENT, + //!Print error and continue without handling + PRINT, + //!Print error and abort Fenix's world (default) + ABORT +} Fenix_Unhandled_mode; + /** * @fn void Fenix_Init(int* role, MPI_Comm comm, MPI_Comm* newcomm, int** argc, char*** argv, int spare_ranks, int spawn, MPI_Info info, int* error); * @brief Build a resilient communicator and set the restart point. @@ -197,14 +221,13 @@ typedef enum { * @param[in] spawn *Unimplemented*: Whether to enable spawning new ranks to replace * failed ranks when spares are unavailable. 
* @param[in] info Fenix recovery configuration parameters, may be MPI_INFO_NULL - * Supports the "FENIX_RESUME_MODE" key, used to indicate where execution should resume upon + * The "FENIX_RESUME_MODE" key is used to indicate where execution should resume upon * rank failure for all active (non-spare) ranks in any resilient communicators, not only for - * those ranks in communicators that failed. The following values associated with the - * "resume_mode" key are supported: - * - "Fenix_init" (default): execution resumes at logical exit of Fenix_Init. - * - "NO_JUMP": execution continues from the failing MPI call. Errors are otherwise handled - * as normal, but return the error code as well. Applications should typically - * either check for return codes or assign an error callback through Fenix. + * those ranks in communicators that failed. The value should be a string with the name of a + * Fenix_Resume_mode enum value: "JUMP" (default), "RETURN", or "THROW". + * The "FENIX_UNHANDLED_MODE" key is used to indicate how Fenix should handle error values + * returned by MPI functions that are unrelated to failed processes. The value should be + * a string with the name of a Fenix_Unhandled_mode enum value: "SILENT", "PRINT", or "ABORT" (default). * @param[out] error The return status of \c Fenix_Init
* Used to signal that a non-fatal error or special condition was encountered in the execution of * Fenix_Init, or FENIX_SUCCESS otherwise. It has the same value across all ranks released by @@ -221,10 +244,8 @@ typedef enum { *(_role) = __fenix_preinit(_role, _comm, _newcomm, _argc, \ _argv, _spare_ranks, _spawn, _info, \ _error, &bufjmp); \ - if(setjmp(bufjmp)) { \ - *(_role) = FENIX_ROLE_SURVIVOR_RANK; \ - } \ - __fenix_postinit( _error ); \ + setjmp(bufjmp); \ + __fenix_postinit(); \ } diff --git a/include/fenix.hpp b/include/fenix.hpp index 1549247..0234fe3 100644 --- a/include/fenix.hpp +++ b/include/fenix.hpp @@ -72,18 +72,4 @@ */ int Fenix_Callback_register(std::function callback); -namespace fenix { - -/** - * @brief Registers a callback that throws a CommException - * - * This means no longjmp will occur, and instead applications - * will continue from their try-catch error handler. - * - * @returnstatus - */ -int register_exception_callback(); - -} // namespace fenix - #endif diff --git a/include/fenix_ext.hpp b/include/fenix_ext.hpp index 4977a4a..9cd79a0 100644 --- a/include/fenix_ext.hpp +++ b/include/fenix_ext.hpp @@ -66,47 +66,44 @@ typedef struct __fenix_data_recovery fenix_data_recovery_t; typedef struct { - int num_inital_ranks; // Keeps the global MPI rank ID at Fenix_init - int num_survivor_ranks; // Keeps the global information on the number of survived MPI ranks after failure - int num_recovered_ranks; // Keeps the number of spare ranks brought into MPI communicator recovery - int resume_mode; // Defines how program resumes after process recovery - int spawn_policy; // Indicate dynamic process spawning - int spare_ranks; // Spare ranks entered by user to repair failed ranks - int repair_result; // Internal global variable to store the result of MPI communicator repair - int finalized; + int num_inital_ranks; // Keeps the global MPI rank ID at Fenix_init + int num_survivor_ranks = 0; // Keeps the global information on the number of survived MPI 
ranks after failure + int num_recovered_ranks = 0; // Keeps the number of spare ranks brought into MPI communicator recovery + int spare_ranks; // Spare ranks entered by user to repair failed ranks + + int resume_mode = Fenix_Resume_mode::JUMP; + int unhandled_mode = Fenix_Unhandled_mode::ABORT; + int ignore_errs = false; // Temporarily ignore all errors & recovery + int spawn_policy; // Indicate dynamic process spawning jmp_buf *recover_environment; // Calling environment to fill the jmp_buf structure + int repair_result = FENIX_SUCCESS; // Internal variable to store the result of MPI comm repair + int role = FENIX_ROLE_INITIAL_RANK; - //enum FenixRankRole role; // Role of rank: initial, survivor or repair - int role; // Role of rank: initial, survivor or repair - int fenix_init_flag = 0; + int fenix_init_flag = false; + int finalized = false; - int fail_world_size; - int* fail_world; + int fail_world_size = 0; + int* fail_world = nullptr; //Save the pointer to role and error of Fenix_Init - int *ret_role; - int *ret_error; + int *ret_role = nullptr; + int *ret_error = nullptr; std::vector callbacks; - fenix_debug_opt_t options; // This is reserved to store the user options + fenix_debug_opt_t options; // This is reserved to store the user options - MPI_Comm *world; // Duplicate of the MPI communicator provided by user - MPI_Comm new_world; // Global MPI communicator identical to g_world but without spare ranks - MPI_Comm *user_world; // MPI communicator with repaired ranks - //Manage state of the comms. 
Necessary when failures happen rapidly, mussing up state - int new_world_exists, user_world_exists; - + MPI_Comm *world; // Duplicate of comm provided by user + MPI_Comm *user_world; // User-facing comm with repaired ranks and no spares + MPI_Comm new_world; // Internal duplicate of user_world + int new_world_exists = false, user_world_exists = false; + + //Values used for Fenix_Process_detect_failures int dummy_recv_buffer; MPI_Request check_failures_req; - - MPI_Op agree_op; // This is reserved for the global agreement call for Fenix data recovery API - - - MPI_Errhandler mpi_errhandler; // This stores callback info for our custom error handler - int ignore_errs; // Set this to return errors instead of using the error handler normally. (Don't forget to unset!) - int print_unhandled; // Set this to print the error string for MPI errors of an unhandled return type. + MPI_Op agree_op; // Global agreement call for Fenix data recovery API + MPI_Errhandler mpi_errhandler; // Our custom error handler fenix_data_recovery_t *data_recovery; // Global pointer for Fenix Data Recovery Data Structure } fenix_t; diff --git a/include/fenix_init.h b/include/fenix_init.h index c4ca69b..19471d1 100644 --- a/include/fenix_init.h +++ b/include/fenix_init.h @@ -67,7 +67,7 @@ extern "C" { int __fenix_preinit(int *, MPI_Comm, MPI_Comm *, int *, char ***, int, int, MPI_Info, int *, jmp_buf *); -void __fenix_postinit(int *); +void __fenix_postinit(); #if defined(c_plusplus) || defined(__cplusplus) } diff --git a/include/fenix_opt.hpp b/include/fenix_opt.hpp index b032b02..2fb34d4 100644 --- a/include/fenix_opt.hpp +++ b/include/fenix_opt.hpp @@ -78,7 +78,7 @@ do { printf("%s(): " fmt, __func__, __VA_ARGS__); } while (0) typedef struct __fenix_debug_opt_t { - int verbose; + int verbose = -1; } fenix_debug_opt_t; diff --git a/include/fenix_process_recovery.hpp b/include/fenix_process_recovery.hpp index f6ad346..760617f 100644 --- a/include/fenix_process_recovery.hpp +++ 
b/include/fenix_process_recovery.hpp @@ -65,13 +65,11 @@ #include #include #include +#include #include "fenix_init.h" #include -#define __FENIX_RESUME_AT_INIT 0 -#define __FENIX_RESUME_NO_JUMP 200 - using fenix_callback_func = std::function; typedef struct __fenix_comm_list_elm { @@ -85,6 +83,10 @@ typedef struct { fenix_comm_list_elm_t *tail; } fenix_comm_list_t; +void __fenix_set_resume_mode(const std::string_view& name); + +void __fenix_set_unhandled_mode(const std::string_view& name); + int __fenix_create_new_world(); int __fenix_repair_ranks(); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 50a0233..87c3305 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -16,7 +16,6 @@ FILE(GLOB Fenix_HEADERS ${CMAKE_SOURCE_DIR}/include/*.h*) set (Fenix_SOURCES fenix.cpp -fenix_exception.cpp fenix_opt.cpp fenix_process_recovery.cpp fenix_util.cpp diff --git a/src/fenix_exception.cpp b/src/fenix_exception.cpp deleted file mode 100644 index 500f433..0000000 --- a/src/fenix_exception.cpp +++ /dev/null @@ -1,14 +0,0 @@ -#include "fenix_exception.hpp" -#include "fenix.h" - -namespace fenix { - -int register_exception_callback(){ - return Fenix_Callback_register( - [](MPI_Comm repaired_comm, int fen_err){ - throw CommException(repaired_comm, fen_err); - } - ); -} - -} // namespace fenix diff --git a/src/fenix_process_recovery.cpp b/src/fenix_process_recovery.cpp index eb3b6c2..1da031c 100644 --- a/src/fenix_process_recovery.cpp +++ b/src/fenix_process_recovery.cpp @@ -67,15 +67,15 @@ #include +using namespace fenix; + int __fenix_preinit(int *role, MPI_Comm comm, MPI_Comm *new_comm, int *argc, char ***argv, int spare_ranks, int spawn, MPI_Info info, int *error, jmp_buf *jump_environment) { - - int ret; *role = fenix_rt.role; - *error = 0; + *error = FENIX_SUCCESS; fenix_rt.user_world = new_comm; @@ -85,79 +85,25 @@ int __fenix_preinit(int *role, MPI_Comm comm, MPI_Comm *new_comm, int *argc, cha MPI_Comm_dup(comm, fenix_rt.world); 
PMPI_Comm_set_errhandler(*fenix_rt.world, fenix_rt.mpi_errhandler); - fenix_rt.finalized = 0; fenix_rt.spare_ranks = spare_ranks; fenix_rt.spawn_policy = spawn; fenix_rt.recover_environment = jump_environment; - fenix_rt.role = FENIX_ROLE_INITIAL_RANK; - fenix_rt.fail_world_size = 0; - fenix_rt.ignore_errs = 0; - fenix_rt.resume_mode = __FENIX_RESUME_AT_INIT; - fenix_rt.repair_result = 0; fenix_rt.ret_role = role; fenix_rt.ret_error = error; - fenix_rt.options.verbose = -1; - // __fenix_init_opt(*argc, *argv); - - // For request tracking, make sure we can save at least an integer - // in MPI_Request - if(sizeof(MPI_Request) < sizeof(int)) { - fprintf(stderr, "FENIX ERROR: __fenix_preinit: sizeof(MPI_Request) < sizeof(int)!\n"); - MPI_Abort(comm, -1); - } - - MPI_Op_create((MPI_User_function *) __fenix_ranks_agree, 1, &fenix_rt.agree_op); /* Check the values in info */ if (info != MPI_INFO_NULL) { - char value[MPI_MAX_INFO_VAL + 1]; - int vallen = MPI_MAX_INFO_VAL; - int flag; - - MPI_Info_get(info, "FENIX_RESUME_MODE", vallen, value, &flag); - if (flag == 1) { - if (strcmp(value, "Fenix_init") == 0) { - fenix_rt.resume_mode = __FENIX_RESUME_AT_INIT; - if (fenix_rt.options.verbose == 0) { - verbose_print("rank: %d, role: %d, value: %s\n", - __fenix_get_current_rank(*fenix_rt.world), fenix_rt.role, value); - } - } else if (strcmp(value, "NO_JUMP") == 0) { - fenix_rt.resume_mode = __FENIX_RESUME_NO_JUMP; - if (fenix_rt.options.verbose == 0) { - verbose_print("rank: %d, role: %d, value: %s\n", - __fenix_get_current_rank(*fenix_rt.world), fenix_rt.role, value); - } + constexpr int len = MPI_MAX_INFO_VAL; + char value[len + 1]; + int found; - } else { - /* No support. 
Setting it to Fenix_init */ - fenix_rt.resume_mode = __FENIX_RESUME_AT_INIT; - } - } - + MPI_Info_get(info, "FENIX_RESUME_MODE", len, value, &found); + if (found) __fenix_set_resume_mode(value); - MPI_Info_get(info, "FENIX_UNHANDLED_MODE", vallen, value, &flag); - if (flag == 1) { - if (strcmp(value, "SILENT") == 0) { - fenix_rt.print_unhandled = 0; - if (fenix_rt.options.verbose == 0) { - verbose_print("rank: %d, role: %d, UNHANDLED_MODE: %s\n", - __fenix_get_current_rank(*fenix_rt.world), fenix_rt.role, value); - } - } else if (strcmp(value, "NO_JUMP") == 0) { - fenix_rt.print_unhandled = 1; - if (fenix_rt.options.verbose == 0) { - verbose_print("rank: %d, role: %d, UNHANDLED_MODE: %s\n", - __fenix_get_current_rank(*fenix_rt.world), fenix_rt.role, value); - } - - } else { - /* No support. Setting it to silent */ - fenix_rt.print_unhandled = 0; - } - } + MPI_Info_get(info, "FENIX_UNHANDLED_MODE", len, value, &found); + if (found) __fenix_set_unhandled_mode(value); } if (fenix_rt.spare_ranks >= __fenix_get_world_size(comm)) { @@ -175,13 +121,8 @@ int __fenix_preinit(int *role, MPI_Comm comm, MPI_Comm *new_comm, int *argc, cha /* trigger an abort. 
*/ /*****************************************************/ - ret = 1; - while (ret) { - ret = __fenix_create_new_world(); - if (ret) { - // just_repair_process(); - } - } + //Try to create new_world until success + while (__fenix_create_new_world()); if ( __fenix_spare_rank() != 1) { fenix_rt.num_inital_ranks = __fenix_get_world_size(fenix_rt.new_world); @@ -201,17 +142,16 @@ int __fenix_preinit(int *role, MPI_Comm comm, MPI_Comm *new_comm, int *argc, cha } } - fenix_rt.num_survivor_ranks = 0; - fenix_rt.num_recovered_ranks = 0; + fenix_rt.fenix_init_flag = true; while ( __fenix_spare_rank() == 1) { int a; int myrank; MPI_Status mpi_status; - fenix_rt.ignore_errs = 1; - ret = PMPI_Recv(&a, 1, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG, *fenix_rt.world, - &mpi_status); // listen for a failure - fenix_rt.ignore_errs = 0; + fenix_rt.ignore_errs = true; + int ret = PMPI_Recv(&a, 1, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG, + *fenix_rt.world, &mpi_status); + fenix_rt.ignore_errs = false; if (ret == MPI_SUCCESS) { if (fenix_rt.options.verbose == 0) { verbose_print("Finalize the program; rank: %d, role: %d\n", @@ -232,11 +172,37 @@ int __fenix_preinit(int *role, MPI_Comm comm, MPI_Comm *new_comm, int *argc, cha if(fenix_rt.role != FENIX_ROLE_RECOVERED_RANK) MPI_Comm_dup(fenix_rt.new_world, fenix_rt.user_world); - fenix_rt.user_world_exists = 1; + fenix_rt.user_world_exists = true; return fenix_rt.role; } +void __fenix_set_resume_mode(const std::string_view& name){ // Store the Fenix_Resume_mode named by `name` in fenix_rt.resume_mode; any other name prints an error and aborts fenix_rt.world + if (name == "JUMP") { + fenix_rt.resume_mode = Fenix_Resume_mode::JUMP; + } else if (name == "RETURN") { + fenix_rt.resume_mode = Fenix_Resume_mode::RETURN; + } else if (name == "THROW") { + fenix_rt.resume_mode = Fenix_Resume_mode::THROW; + } else { + fprintf(stderr, "Unsupported FENIX_RESUME_MODE %.*s\n", static_cast<int>(name.size()), name.data()); // string_view is not guaranteed NUL-terminated, so print with an explicit length + MPI_Abort(*fenix_rt.world, 1); + } +} + +void __fenix_set_unhandled_mode(const std::string_view& name){ // Store the Fenix_Unhandled_mode named by `name` in fenix_rt.unhandled_mode (not resume_mode); any other name prints an error and aborts fenix_rt.world + if (name == "SILENT") { + fenix_rt.unhandled_mode = Fenix_Unhandled_mode::SILENT; + } else if (name 
== "PRINT") { + fenix_rt.unhandled_mode = Fenix_Unhandled_mode::PRINT; + } else if (name == "ABORT") { + fenix_rt.unhandled_mode = Fenix_Unhandled_mode::ABORT; + } else { + fprintf(stderr, "Unsupported FENIX_UNHANDLED_MODE %.*s\n", static_cast<int>(name.size()), name.data()); // string_view is not guaranteed NUL-terminated, so print with an explicit length + MPI_Abort(*fenix_rt.world, 1); + } +} + int __fenix_spare_rank_within(MPI_Comm refcomm) { int result = -1; @@ -271,7 +237,6 @@ int __fenix_create_new_world_from(MPI_Comm from_comm) ret = PMPI_Comm_split(from_comm, MPI_UNDEFINED, current_rank, &fenix_rt.new_world); - //if (ret != MPI_SUCCESS) { debug_print("MPI_Comm_split: %d\n", ret); } fenix_rt.new_world_exists = 0; //Should already be this } else { @@ -334,10 +299,9 @@ int __fenix_repair_ranks() } while (!repair_success) { - repair_success = 1; + ret = MPIX_Comm_shrink(*fenix_rt.world, &world_without_failures); - //if (ret != MPI_SUCCESS) { debug_print("MPI_Comm_shrink. repair_ranks\n"); } if (ret != MPI_SUCCESS) { repair_success = 0; goto END_LOOP; @@ -404,7 +368,6 @@ int __fenix_repair_ranks() } } - //if (ret != MPI_SUCCESS) { debug_print("MPI_Allgather. repair_ranks\n"); } if (ret != MPI_SUCCESS) { repair_success = 0; if (ret == MPI_ERR_PROC_FAILED) { @@ -423,7 +386,6 @@ int __fenix_repair_ranks() ret = PMPI_Allreduce(&survived_flag, &fenix_rt.num_survivor_ranks, 1, MPI_INT, MPI_SUM, world_without_failures); - //if (ret != MPI_SUCCESS) { debug_print("MPI_Allreduce. repair_ranks\n"); } if (ret != MPI_SUCCESS) { repair_success = 0; if (ret == MPI_ERR_PROC_FAILED) { @@ -489,8 +451,6 @@ int __fenix_repair_ranks() /* Update the number of spare ranks */ /************************************/ fenix_rt.spare_ranks = 0; - - //debug_print("not enough spare ranks to repair rank failures. repair_ranks\n"); } /****************************************************************/ @@ -506,7 +466,6 @@ int __fenix_repair_ranks() ret = PMPI_Allgather(&current_rank, 1, MPI_INT, survivor_world, 1, MPI_INT, world_without_failures); - //if (ret != MPI_SUCCESS) { debug_print("MPI_Allgather. 
repair_ranks\n"); } if (ret != MPI_SUCCESS) { repair_success = 0; if (ret == MPI_ERR_PROC_FAILED) { @@ -524,7 +483,6 @@ int __fenix_repair_ranks() ret = PMPI_Allreduce(&survived_flag, &fenix_rt.num_survivor_ranks, 1, MPI_INT, MPI_SUM, world_without_failures); - //if (ret != MPI_SUCCESS) { debug_print("MPI_Allreduce. repair_ranks\n"); } if (ret != MPI_SUCCESS) { repair_success = 0; if (ret != MPI_ERR_PROC_FAILED) { @@ -625,7 +583,6 @@ int __fenix_repair_ranks() } ret = PMPI_Barrier(fixed_world); - /* if (ret != MPI_SUCCESS) { debug_print("MPI_Barrier. repair_ranks\n"); } */ if (ret != MPI_SUCCESS) { repair_success = 0; MPIX_Comm_revoke(fixed_world); @@ -635,16 +592,6 @@ int __fenix_repair_ranks() END_LOOP: num_try++; - - /*******************************************************/ - /*** Not sure if we should include verbose statement ***/ - /*******************************************************/ - -/* - if (current_rank == FENIX_ROOT) { - LDEBUG("Fenix: communicators repaired\n"); - } -*/ } *fenix_rt.world = fixed_world; @@ -676,13 +623,10 @@ int __fenix_spare_rank(){ return __fenix_spare_rank_within(*fenix_rt.world); } -void __fenix_postinit(int *error) +void __fenix_postinit() { - - //if (fenix_rt.options.verbose == 9) { - // verbose_print(" postinit: current_rank: %d, role: %d\n", __fenix_get_current_rank(fenix_rt.new_world), - // fenix_rt.role); - //} + *fenix_rt.ret_role = fenix_rt.role; + *fenix_rt.ret_error = fenix_rt.repair_result; if(fenix_rt.new_world_exists){ //Set up dummy irecv to use for checking for failures. 
@@ -690,19 +634,8 @@ void __fenix_postinit(int *error) 34095347, fenix_rt.new_world, &fenix_rt.check_failures_req); } - if (fenix_rt.repair_result != 0) { - *error = fenix_rt.repair_result; - } - fenix_rt.fenix_init_flag = 1; - -#if 0 - if (fenix_rt.role != FENIX_ROLE_INITIAL_RANK) { - init_data_recovery(); - } -#endif - if (fenix_rt.role == FENIX_ROLE_SURVIVOR_RANK) { - __fenix_callback_invoke_all(*error); + __fenix_callback_invoke_all(*fenix_rt.ret_error); } if (fenix_rt.options.verbose == 9) { verbose_print("After barrier. current_rank: %d, role: %d\n", __fenix_get_current_rank(fenix_rt.new_world), @@ -742,7 +675,7 @@ void __fenix_finalize() int last_spare_rank = __fenix_get_world_size(*fenix_rt.world) - 1; //If we've reached here, we will finalized regardless of further errors. - fenix_rt.ignore_errs = 1; + fenix_rt.ignore_errs = true; while(!fenix_rt.finalized){ int user_rank = __fenix_get_current_rank(*fenix_rt.user_world); @@ -765,7 +698,7 @@ void __fenix_finalize() } else { //If rank 0 did contribute, we know sends made it, and regardless //of any other failures we finalize. - fenix_rt.finalized = 1; + fenix_rt.finalized = true; } } @@ -794,7 +727,7 @@ void __fenix_finalize() void __fenix_finalize_spare() { - fenix_rt.fenix_init_flag = 0; + fenix_rt.fenix_init_flag = false; int unused; MPI_Request agree_req, recv_req = MPI_REQUEST_NULL; @@ -842,45 +775,59 @@ void __fenix_test_MPI(MPI_Comm *pcomm, int *pret, ...) 
} switch (ret) { - case MPI_ERR_PROC_FAILED_PENDING: - case MPI_ERR_PROC_FAILED: - MPIX_Comm_revoke(*fenix_rt.world); - MPIX_Comm_revoke(fenix_rt.new_world); - - if(fenix_rt.user_world_exists) MPIX_Comm_revoke(*fenix_rt.user_world); - - - fenix_rt.repair_result = __fenix_repair_ranks(); - break; - case MPI_ERR_REVOKED: - fenix_rt.repair_result = __fenix_repair_ranks(); - break; - case MPI_ERR_INTERN: - printf("Fenix detected error: MPI_ERR_INTERN\n"); - default: - if(fenix_rt.print_unhandled){ + case MPI_ERR_PROC_FAILED_PENDING: + case MPI_ERR_PROC_FAILED: + MPIX_Comm_revoke(*fenix_rt.world); + MPIX_Comm_revoke(fenix_rt.new_world); + + if(fenix_rt.user_world_exists) MPIX_Comm_revoke(*fenix_rt.user_world); + + fenix_rt.repair_result = __fenix_repair_ranks(); + break; + case MPI_ERR_REVOKED: + fenix_rt.repair_result = __fenix_repair_ranks(); + break; + default: int len; char errstr[MPI_MAX_ERROR_STRING]; MPI_Error_string(ret, errstr, &len); - fprintf(stderr, "UNHANDLED ERR: %s\n", errstr); - } - return; - break; + switch (fenix_rt.unhandled_mode) { + case ABORT: + fprintf(stderr, "UNHANDLED ERR: %s\n", errstr); + MPI_Abort(*fenix_rt.world, 1); + break; + case PRINT: + fprintf(stderr, "UNHANDLED ERR: %s\n", errstr); + break; + case SILENT: + break; + default: + printf( + "Fenix internal error: Unknown unhandled mode %d\n", + fenix_rt.unhandled_mode + ); + assert(false); + break; + } + return; + break; } - fenix_rt.role = FENIX_ROLE_SURVIVOR_RANK; + __fenix_postinit(); if(!fenix_rt.finalized) { switch(fenix_rt.resume_mode) { - case __FENIX_RESUME_AT_INIT: + case JUMP: longjmp(*fenix_rt.recover_environment, 1); break; - case __FENIX_RESUME_NO_JUMP: - *(fenix_rt.ret_role) = FENIX_ROLE_SURVIVOR_RANK; - __fenix_postinit(fenix_rt.ret_error); + case RETURN: + break; + case THROW: + throw CommException(*fenix_rt.user_world, *fenix_rt.ret_error); break; default: - printf("Fenix detected error: Unknown resume mode\n"); + printf("Fenix internal error: Unknown resume mode %d\n", 
+ fenix_rt.resume_mode); assert(false); break; } diff --git a/test/exception_throw/fenix_exceptions.cpp b/test/exception_throw/fenix_exceptions.cpp index 8142b12..182abc7 100644 --- a/test/exception_throw/fenix_exceptions.cpp +++ b/test/exception_throw/fenix_exceptions.cpp @@ -72,12 +72,9 @@ int main(int argc, char **argv) { MPI_Comm res_comm; MPI_Info info; MPI_Info_create(&info); - MPI_Info_set(info, "FENIX_RESUME_MODE", "NO_JUMP"); - MPI_Info_set(info, "FENIX_UNHANDLED_MODE", "NO_JUMP"); + MPI_Info_set(info, "FENIX_RESUME_MODE", "THROW"); Fenix_Init(&fenix_role, MPI_COMM_WORLD, &res_comm, &argc, &argv, 0, 0, info, &error); - fenix::register_exception_callback(); - if(fenix_role == FENIX_ROLE_SURVIVOR_RANK){ printf("FAILURE: longjmp instead of exception\n"); status = 1; diff --git a/test/issend/fenix_issend_test.c b/test/issend/fenix_issend_test.c index 212a7ae..23f3d85 100644 --- a/test/issend/fenix_issend_test.c +++ b/test/issend/fenix_issend_test.c @@ -87,7 +87,7 @@ int main(int argc, char **argv) { MPI_Info info; MPI_Info_create(&info); - MPI_Info_set(info, "FENIX_RESUME_MODE", "NO_JUMP"); + MPI_Info_set(info, "FENIX_RESUME_MODE", "RETURN"); int fenix_status; int recovered = 0; diff --git a/test/no_jump/fenix_no_jump_test.c b/test/no_jump/fenix_no_jump_test.c index 31eb9f3..cf5d261 100644 --- a/test/no_jump/fenix_no_jump_test.c +++ b/test/no_jump/fenix_no_jump_test.c @@ -87,7 +87,7 @@ int main(int argc, char **argv) { MPI_Info info; MPI_Info_create(&info); - MPI_Info_set(info, "FENIX_RESUME_MODE", "NO_JUMP"); + MPI_Info_set(info, "FENIX_RESUME_MODE", "RETURN"); int fenix_status; int recovered = 0;