Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions core/environment/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,13 +116,13 @@ func NewEnvManager(tm *task.Manager, incomingEventCh chan event.Event) *Manager
WithField("partition", envId.String()).
WithField("agentId", typedEvent.GetId().Value).
WithError(err).
Error("cannot find environment for incoming executor failed event")
Error("cannot find environment for incoming agent failed event")
}
log.WithPrefix("scheduler").
WithField("partition", envId.String()).
WithField("agentId", typedEvent.GetId().Value).
WithField("envState", env.CurrentState()).
Debug("received executor failed event")
Debug("received agent failed event")
}

case *event.TasksReleasedEvent:
Expand Down
7 changes: 7 additions & 0 deletions core/task/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -1047,6 +1047,13 @@ func (m *Manager) updateTaskStatus(status *mesos.TaskStatus) {
if taskPtr.GetParent() != nil {
taskPtr.GetParent().UpdateStatus(ACTIVE)
}
if status.GetAgentID() != nil {
taskPtr.agentId = status.GetAgentID().GetValue()
}
if status.GetExecutorID() != nil {
taskPtr.executorId = status.GetExecutorID().GetValue()
}

case mesos.TASK_DROPPED, mesos.TASK_LOST, mesos.TASK_KILLED, mesos.TASK_FAILED, mesos.TASK_ERROR, mesos.TASK_FINISHED:

taskPtr.status = INACTIVE
Expand Down
3 changes: 3 additions & 0 deletions core/task/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,9 @@ func (state *schedulerState) failure(_ context.Context, e *scheduler.Event) erro
WithFields(fields).
WithField("level", infologger.IL_Support).
Error("agent failed")
log.WithField("level", infologger.IL_Ops).
WithField("detector", detector).
Errorf("possible connectivity issues with host '%s'", host)
state.taskman.internalEventCh <- event.NewAgentFailedEvent(aid)
}
return nil
Expand Down
13 changes: 12 additions & 1 deletion docs/handbook/appconfiguration.md
Original file line number Diff line number Diff line change
@@ -1 +1,12 @@
# Component Configuration
# Component Configuration

## Connectivity to controlled nodes

ECS relies on Mesos to know the state of the controlled nodes.
Thus, losing the connection to a Mesos slave can be treated as the corresponding node being down or unresponsive.
If a Mesos slave is lost, the tasks belonging to it are set to the ERROR state and treated as INACTIVE.
Then, the environment is transitioned to ERROR.

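The Mesos slave health check can be configured on the Mesos master with the `MESOS_MAX_AGENT_PING_TIMEOUTS` (`--max_agent_ping_timeouts`) and `MESOS_AGENT_PING_TIMEOUT` (`--agent_ping_timeout`) parameters, given either as environment variables or as the corresponding command-line flags.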
Effectively, the product of the two parameters (ping timeout × maximum number of missed pings) is the time needed to consider a slave/agent as lost.
Please refer to Mesos documentation for more details.