From 8afbc38a2a73ff12c4384d893803cec88b17d571 Mon Sep 17 00:00:00 2001 From: Piotr Konopka Date: Wed, 15 Jan 2025 10:45:23 +0100 Subject: [PATCH] [core] improvements in error reporting during env deployment and configuration - "task '/opt/o2/bin/o2-readout-exe' on alio2-cr1-mvs11 (id 2sBwA3Z8yWU, name alio2-cr1-hv-gw01.cern.ch:/opt/git/ControlWorkflows/tasks/readout@12b11ac4bb652e1835e3e94806a688c951691d5f#2sBwA3Z8yWU) failed with error..." displayed by COG becomes "task 'readout' on alio2-cr1-mvs11 (id 2sBwA3Z8yWU) failed with error...". I believe the removed information is not useful for typical operators. Details can be always found in logs. - "MesosCommand MesosCommand_Transition timed out for task 2sBwYFZ82wn" becomes "MesosCommand_Transition timed out for task 2sBwYFZ82wn" because by convention, all MesosCommands have "MesosCommand" in their names. - If a transition request times out, more accurate error is reported in COG. "nil response" becomes "CONFIGURE could not complete for critical tasks, errors: task 'readout' on alio2-cr1-mvs11 (id 2sBwA3Z8yWU) failed with error: MesosCommand_Transition timed out for task 2sBwA3Z8yWU". - In case a task transition times out, any other errors which happened at the same time (e.g. a task crashed during transition) are not omitted anymore in COG. - "nil response" in task/manager.go becomes "no response from Mesos to CONFIGURE transition request within 120s timeout", but it will not be typically printed for tasks timing out during a transition. - "nil response" in commandqueue.go becomes "did not receive neither response nor error for MesosCommand_Transition" One could consider further simplification of some of these messages, but perhaps let's see how this goes. --- core/controlcommands/commandqueue.go | 10 +++++++--- core/controlcommands/mesoscommandservent.go | 2 +- core/task/manager.go | 11 ++--------- core/task/task.go | 1 + 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/core/controlcommands/commandqueue.go b/core/controlcommands/commandqueue.go index d14933ff7..0c769dec4 100644 --- a/core/controlcommands/commandqueue.go +++ b/core/controlcommands/commandqueue.go @@ -101,7 +101,7 @@ func (m *CommandQueue) Start() { } if err == nil && response == nil { log.WithField("partition", entry.cmd.GetEnvironmentId().String()). - Error("nil response") + Errorf("did not receive neither response nor error for %s", entry.cmd.GetName()) } entry.callback <- response @@ -198,6 +198,11 @@ func (m *CommandQueue) commit(command MesosCommand) (response MesosCommandRespon // Wait for goroutines to finish for i := 0; i < len(command.targets()); i++ { respSemaphore := <-semaphore + // for the sake of better error propagation, we treat a lack of response as a response with error, + // even though it's not technically the same. it can be surely done better, but it would require a larger refactoring. + if respSemaphore.err != nil && respSemaphore.response == nil { + respSemaphore.response = NewMesosCommandResponse(command, respSemaphore.err) + } responses[respSemaphore.receiver] = respSemaphore.response if respSemaphore.err != nil { sendErrorList = append(sendErrorList, respSemaphore.err) @@ -215,12 +220,11 @@ func (m *CommandQueue) commit(command MesosCommand) (response MesosCommandRespon } return }(), "\n")) - return } response = consolidateResponses(command, responses) log.WithField("partition", command.GetEnvironmentId().String()). Debug("responses consolidated, CommandQueue commit done") - return response, nil + return response, err } diff --git a/core/controlcommands/mesoscommandservent.go b/core/controlcommands/mesoscommandservent.go index 52708043c..2185eea2e 100644 --- a/core/controlcommands/mesoscommandservent.go +++ b/core/controlcommands/mesoscommandservent.go @@ -144,7 +144,7 @@ func (s *Servent) RunCommand(cmd MesosCommand, receiver MesosCommandTarget) (Mes // By the time we get here, ProcessResponse should have already added a Response to the // pending call, and removed it from servent.pending. case <-time.After(cmd.GetResponseTimeout()): - call.Error = fmt.Errorf("MesosCommand %s timed out for task %s", cmd.GetName(), receiver.TaskId.Value) + call.Error = fmt.Errorf("%s timed out for task %s", cmd.GetName(), receiver.TaskId.Value) log.WithPrefix("servent"). WithField("partition", cmd.GetEnvironmentId().String()). diff --git a/core/task/manager.go b/core/task/manager.go index d04964f2a..dcff998b2 100644 --- a/core/task/manager.go +++ b/core/task/manager.go @@ -754,7 +754,7 @@ func (m *Manager) configureTasks(envId uid.ID, tasks Tasks) error { close(notify) if response == nil { - return errors.New("nil response") + return fmt.Errorf("no response from Mesos to CONFIGURE transition request within %ds timeout", int(cmd.ResponseTimeout.Seconds())) } if response.IsMultiResponse() { @@ -765,14 +765,7 @@ func (m *Manager) configureTasks(envId uid.ID, tasks Tasks) error { task := m.GetTask(k.TaskId.Value) var taskDescription string if task != nil { - tci := task.GetTaskCommandInfo() - tciValue := "unknown command" - if tci.Value != nil { - tciValue = *tci.Value - } - - taskDescription = fmt.Sprintf("task '%s' on %s (id %s, name %s) failed with error: %s", tciValue, task.GetHostname(), task.GetTaskId(), task.GetName(), v.Error()) - + taskDescription = fmt.Sprintf("task '%s' on %s (id %s) failed with error: %s", task.GetParent().GetName(), task.GetHostname(), task.GetTaskId(), v.Error()) } else { taskDescription = fmt.Sprintf("unknown task (id %s) failed with error: %s", k.TaskId.Value, v.Error()) } diff --git a/core/task/task.go b/core/task/task.go index 89032ca2c..825dcfc6a 100644 --- a/core/task/task.go +++ b/core/task/task.go @@ -76,6 +76,7 @@ type parentRole interface { ConsolidatedVarStack() (varStack map[string]string, err error) CollectInboundChannels() []channel.Inbound SendEvent(event.Event) + GetName() string } type Traits struct {