From e5d81b5e5ea2064dc3017786a5fc7c6b40c6a7ee Mon Sep 17 00:00:00 2001 From: Likios Date: Fri, 31 Oct 2025 03:06:04 +0000 Subject: [PATCH] fix: Add retry mechanism for status update conflicts in ArksApplication and ArksDisaggregatedApplication Wraps Status().Update() calls with retry.RetryOnConflict() to handle concurrent modifications during reconciliation. This prevents 'the object has been modified' errors when users modify resources while the controller is updating status. Controllers fixed: - ArksDisaggregatedApplication: Confirmed production issue with frequent Prefill/Decode tuning - ArksApplication: Same vulnerability pattern with frequent replica/config changes --- .../controller/arksapplication_controller.go | 22 ++++++++++++++++--- ...arksdisaggregatedapplication_controller.go | 22 ++++++++++++++++--- 2 files changed, 38 insertions(+), 6 deletions(-) diff --git a/internal/controller/arksapplication_controller.go b/internal/controller/arksapplication_controller.go index 4b929e8d..9da3efff 100644 --- a/internal/controller/arksapplication_controller.go +++ b/internal/controller/arksapplication_controller.go @@ -30,6 +30,7 @@ import ( "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/client-go/kubernetes" + "k8s.io/client-go/util/retry" "k8s.io/klog/v2" "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" @@ -94,9 +95,24 @@ func (r *ArksApplicationReconciler) Reconcile(ctx context.Context, req ctrl.Requ // reconcile model result, err := r.reconcile(ctx, application) - // update application status - if err := r.Client.Status().Update(ctx, application); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to update status for application %s/%s (%s): %q", application.Namespace, application.Name, application.UID, err) + // update application status with retry on conflict + statusErr := retry.RetryOnConflict(retry.DefaultRetry, func() error { + // Get latest version before updating status + latest := &arksv1.ArksApplication{} + if getErr := r.Client.Get(ctx, req.NamespacedName, latest); getErr != nil { + return getErr + } + + // Copy status from reconciled application to latest version + latest.Status = application.Status + + // Update status on latest version + return r.Client.Status().Update(ctx, latest) + }) + + if statusErr != nil { + klog.Errorf("failed to update status for application %s/%s: %v", application.Namespace, application.Name, statusErr) + return ctrl.Result{}, fmt.Errorf("failed to update status: %w", statusErr) } // handle reconcile error diff --git a/internal/controller/arksdisaggregatedapplication_controller.go b/internal/controller/arksdisaggregatedapplication_controller.go index 0cc49c17..013a429f 100644 --- a/internal/controller/arksdisaggregatedapplication_controller.go +++ b/internal/controller/arksdisaggregatedapplication_controller.go @@ -33,6 +33,7 @@ import ( "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/client-go/kubernetes" + "k8s.io/client-go/util/retry" "k8s.io/klog/v2" "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" @@ -90,9 +91,24 @@ func (r *ArksDisaggregatedApplicationReconciler) Reconcile(ctx context.Context, // reconcile model result, err := r.reconcile(ctx, application) - // update application status - if err := r.Client.Status().Update(ctx, application); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to update status for application %s/%s (%s): %q", application.Namespace, application.Name, application.UID, err) + // update application status with retry on conflict + statusErr := retry.RetryOnConflict(retry.DefaultRetry, func() error { + // Get latest version before updating status + latest := &arksv1.ArksDisaggregatedApplication{} + if getErr := r.Client.Get(ctx, req.NamespacedName, latest); getErr != nil { + return getErr + } + + // Copy status from reconciled application to latest version + latest.Status = application.Status + + // Update status on latest version + return r.Client.Status().Update(ctx, latest) + }) + + if statusErr != nil { + klog.Errorf("failed to update status for application %s/%s: %v", application.Namespace, application.Name, statusErr) + return ctrl.Result{}, fmt.Errorf("failed to update status: %w", statusErr) } // handle reconcile error