diff --git a/core/environment/manager.go b/core/environment/manager.go index 1c9f6783..ced83214 100644 --- a/core/environment/manager.go +++ b/core/environment/manager.go @@ -116,13 +116,13 @@ func NewEnvManager(tm *task.Manager, incomingEventCh chan event.Event) *Manager WithField("partition", envId.String()). WithField("agentId", typedEvent.GetId().Value). WithError(err). - Error("cannot find environment for incoming executor failed event") + Error("cannot find environment for incoming agent failed event") } log.WithPrefix("scheduler"). WithField("partition", envId.String()). WithField("agentId", typedEvent.GetId().Value). WithField("envState", env.CurrentState()). - Debug("received executor failed event") + Debug("received agent failed event") } case *event.TasksReleasedEvent: diff --git a/core/task/manager.go b/core/task/manager.go index 640fe648..83139c69 100644 --- a/core/task/manager.go +++ b/core/task/manager.go @@ -1047,6 +1047,13 @@ func (m *Manager) updateTaskStatus(status *mesos.TaskStatus) { if taskPtr.GetParent() != nil { taskPtr.GetParent().UpdateStatus(ACTIVE) } + if status.GetAgentID() != nil { + taskPtr.agentId = status.GetAgentID().GetValue() + } + if status.GetExecutorID() != nil { + taskPtr.executorId = status.GetExecutorID().GetValue() + } + case mesos.TASK_DROPPED, mesos.TASK_LOST, mesos.TASK_KILLED, mesos.TASK_FAILED, mesos.TASK_ERROR, mesos.TASK_FINISHED: taskPtr.status = INACTIVE diff --git a/core/task/scheduler.go b/core/task/scheduler.go index 53e54873..aeea651f 100644 --- a/core/task/scheduler.go +++ b/core/task/scheduler.go @@ -246,6 +246,9 @@ func (state *schedulerState) failure(_ context.Context, e *scheduler.Event) erro WithFields(fields). WithField("level", infologger.IL_Support). Error("agent failed") + log.WithField("level", infologger.IL_Ops). + WithField("detector", detector). 
+ Errorf("possible connectivity issues with host '%s'", host) state.taskman.internalEventCh <- event.NewAgentFailedEvent(aid) } return nil diff --git a/docs/handbook/appconfiguration.md b/docs/handbook/appconfiguration.md index 88250ac3..5b589277 100644 --- a/docs/handbook/appconfiguration.md +++ b/docs/handbook/appconfiguration.md @@ -1 +1,12 @@ -# Component Configuration \ No newline at end of file +# Component Configuration + +## Connectivity to controlled nodes + +ECS relies on Mesos to know the state of the controlled nodes. +Thus, losing the connection to a Mesos slave can be treated as a node being down or unresponsive. +In case a Mesos slave is lost, tasks belonging to it are set to ERROR state and treated as INACTIVE. +Then, the environment is transitioned to ERROR. + +The Mesos slave health check can be configured with `MESOS_MAX_AGENT_PING_TIMEOUTS` (`--max_agent_ping_timeouts`) and `MESOS_AGENT_PING_TIMEOUT` (`--agent_ping_timeout`) parameters for Mesos. +Effectively, the product of the two parameters is the time needed to consider a slave/agent as lost. +Please refer to the Mesos documentation for more details. \ No newline at end of file