Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 32 additions & 11 deletions include/fenix.h
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,30 @@ typedef enum {
FENIX_ROLE_SURVIVOR_RANK = 2
} Fenix_Rank_role;

/**
 * @brief Selects how control is handed back to the application after recovery.
 */
typedef enum {
    JUMP   = 0, //!< Return to Fenix_Init via longjmp (default)
    RETURN = 1, //!< Return the error code inline
    THROW  = 2  //!< Throw a Fenix::CommException
} Fenix_Resume_mode;

/**
 * @brief Policy applied to 'unhandled' errors, e.g. invalid rank IDs.
 */
typedef enum {
    SILENT = 0, //!< Ignore unhandled errors
    PRINT  = 1, //!< Print error and continue without handling
    ABORT  = 2  //!< Print error and abort Fenix's world (default)
} Fenix_Unhandled_mode;

/**
* @fn void Fenix_Init(int* role, MPI_Comm comm, MPI_Comm* newcomm, int** argc, char*** argv, int spare_ranks, int spawn, MPI_Info info, int* error);
* @brief Build a resilient communicator and set the restart point.
Expand Down Expand Up @@ -197,14 +221,13 @@ typedef enum {
* @param[in] spawn *Unimplemented*: Whether to enable spawning new ranks to replace
* failed ranks when spares are unavailable.
* @param[in] info Fenix recovery configuration parameters, may be MPI_INFO_NULL
* Supports the "FENIX_RESUME_MODE" key, used to indicate where execution should resume upon
* "FENIX_RESUME_MODE" key is used to indicate where execution should resume upon
* rank failure for all active (non-spare) ranks in any resilient communicators, not only for
* those ranks in communicators that failed. The following values associated with the
* "resume_mode" key are supported:
* - "Fenix_init" (default): execution resumes at logical exit of Fenix_Init.
* - "NO_JUMP": execution continues from the failing MPI call. Errors are otherwise handled
* as normal, but return the error code as well. Applications should typically
* either check for return codes or assign an error callback through Fenix.
* those ranks in communicators that failed. The value should be a string with the name of a
* Fenix_Resume_mode enum value.
* "FENIX_UNHANDLED_MODE" key is used to indicate how Fenix should handle error values
* returned by MPI functions that are unrelated to failed processes. The value should be
* a string with the name of a Fenix_Unhandled_mode enum value.
* @param[out] error The return status of \c Fenix_Init<br>
* Used to signal that a non-fatal error or special condition was encountered in the execution of
* Fenix_Init, or FENIX_SUCCESS otherwise. It has the same value across all ranks released by
Expand All @@ -221,10 +244,8 @@ typedef enum {
*(_role) = __fenix_preinit(_role, _comm, _newcomm, _argc, \
_argv, _spare_ranks, _spawn, _info, \
_error, &bufjmp); \
if(setjmp(bufjmp)) { \
*(_role) = FENIX_ROLE_SURVIVOR_RANK; \
} \
__fenix_postinit( _error ); \
setjmp(bufjmp); \
__fenix_postinit(); \
}


Expand Down
14 changes: 0 additions & 14 deletions include/fenix.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,18 +72,4 @@
*/
int Fenix_Callback_register(std::function<void(MPI_Comm, int)> callback);

namespace fenix {

/**
* @brief Registers a callback that throws a CommException
*
* This means no longjmp will occur, and instead applications
* will continue from their try-catch error handler.
*
* @returnstatus
*/
int register_exception_callback();

} // namespace fenix

#endif
55 changes: 26 additions & 29 deletions include/fenix_ext.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,47 +66,44 @@
typedef struct __fenix_data_recovery fenix_data_recovery_t;

typedef struct {
int num_inital_ranks; // Keeps the global MPI rank ID at Fenix_init
int num_survivor_ranks; // Keeps the global information on the number of survived MPI ranks after failure
int num_recovered_ranks; // Keeps the number of spare ranks brought into MPI communicator recovery
int resume_mode; // Defines how program resumes after process recovery
int spawn_policy; // Indicate dynamic process spawning
int spare_ranks; // Spare ranks entered by user to repair failed ranks
int repair_result; // Internal global variable to store the result of MPI communicator repair
int finalized;
int num_inital_ranks; // Keeps the global MPI rank ID at Fenix_init
int num_survivor_ranks = 0; // Keeps the global information on the number of survived MPI ranks after failure
int num_recovered_ranks = 0; // Keeps the number of spare ranks brought into MPI communicator recovery
int spare_ranks; // Spare ranks entered by user to repair failed ranks

int resume_mode = Fenix_Resume_mode::JUMP;
int unhandled_mode = Fenix_Unhandled_mode::ABORT;
int ignore_errs = false; // Temporarily ignore all errors & recovery
int spawn_policy; // Indicate dynamic process spawning
jmp_buf *recover_environment; // Calling environment to fill the jmp_buf structure

int repair_result = FENIX_SUCCESS; // Internal variable to store the result of MPI comm repair
int role = FENIX_ROLE_INITIAL_RANK;

//enum FenixRankRole role; // Role of rank: initial, survivor or repair
int role; // Role of rank: initial, survivor or repair
int fenix_init_flag = 0;
int fenix_init_flag = false;
int finalized = false;

int fail_world_size;
int* fail_world;
int fail_world_size = 0;
int* fail_world = nullptr;

//Save the pointer to role and error of Fenix_Init
int *ret_role;
int *ret_error;
int *ret_role = nullptr;
int *ret_error = nullptr;

std::vector<fenix_callback_func> callbacks;
fenix_debug_opt_t options; // This is reserved to store the user options
fenix_debug_opt_t options; // This is reserved to store the user options

MPI_Comm *world; // Duplicate of the MPI communicator provided by user
MPI_Comm new_world; // Global MPI communicator identical to g_world but without spare ranks
MPI_Comm *user_world; // MPI communicator with repaired ranks
//Manage state of the comms. Necessary when failures happen rapidly, mussing up state
int new_world_exists, user_world_exists;

MPI_Comm *world; // Duplicate of comm provided by user
MPI_Comm *user_world; // User-facing comm with repaired ranks and no spares
MPI_Comm new_world; // Internal duplicate of user_world
int new_world_exists = false, user_world_exists = false;

//Values used for Fenix_Process_detect_failures
int dummy_recv_buffer;
MPI_Request check_failures_req;


MPI_Op agree_op; // This is reserved for the global agreement call for Fenix data recovery API


MPI_Errhandler mpi_errhandler; // This stores callback info for our custom error handler
int ignore_errs; // Set this to return errors instead of using the error handler normally. (Don't forget to unset!)
int print_unhandled; // Set this to print the error string for MPI errors of an unhandled return type.
MPI_Op agree_op; // Global agreement call for Fenix data recovery API
MPI_Errhandler mpi_errhandler; // Our custom error handler

fenix_data_recovery_t *data_recovery; // Global pointer for Fenix Data Recovery Data Structure
} fenix_t;
Expand Down
2 changes: 1 addition & 1 deletion include/fenix_init.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ extern "C" {
int __fenix_preinit(int *, MPI_Comm, MPI_Comm *, int *, char ***, int, int, MPI_Info, int *, jmp_buf *);


void __fenix_postinit(int *);
void __fenix_postinit();

#if defined(c_plusplus) || defined(__cplusplus)
}
Expand Down
2 changes: 1 addition & 1 deletion include/fenix_opt.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@
do { printf("%s(): " fmt, __func__, __VA_ARGS__); } while (0)

typedef struct __fenix_debug_opt_t {
int verbose;
int verbose = -1;
} fenix_debug_opt_t;


Expand Down
8 changes: 5 additions & 3 deletions include/fenix_process_recovery.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,11 @@
#include <stdarg.h>
#include <stdint.h>
#include <signal.h>
#include <string_view>

#include "fenix_init.h"
#include <functional>

#define __FENIX_RESUME_AT_INIT 0
#define __FENIX_RESUME_NO_JUMP 200

using fenix_callback_func = std::function<void(MPI_Comm, int)>;

typedef struct __fenix_comm_list_elm {
Expand All @@ -85,6 +83,10 @@ typedef struct {
fenix_comm_list_elm_t *tail;
} fenix_comm_list_t;

// Sets the resume mode from its textual name.
// NOTE(review): presumably `name` must match a Fenix_Resume_mode enumerator
// name (as supplied through the "FENIX_RESUME_MODE" info key); the handling
// of unrecognized names is not visible here - confirm against the definition.
void __fenix_set_resume_mode(const std::string_view& name);

// Sets the unhandled-error mode from its textual name.
// NOTE(review): presumably `name` must match a Fenix_Unhandled_mode enumerator
// name (as supplied through the "FENIX_UNHANDLED_MODE" info key); the handling
// of unrecognized names is not visible here - confirm against the definition.
void __fenix_set_unhandled_mode(const std::string_view& name);

int __fenix_create_new_world();

int __fenix_repair_ranks();
Expand Down
1 change: 0 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ FILE(GLOB Fenix_HEADERS ${CMAKE_SOURCE_DIR}/include/*.h*)

set (Fenix_SOURCES
fenix.cpp
fenix_exception.cpp
fenix_opt.cpp
fenix_process_recovery.cpp
fenix_util.cpp
Expand Down
14 changes: 0 additions & 14 deletions src/fenix_exception.cpp

This file was deleted.

Loading