From 589045da28427ba2953dda9903960d25df1293d8 Mon Sep 17 00:00:00 2001 From: Rafael Benevides Date: Tue, 13 Jan 2026 16:25:50 -0300 Subject: [PATCH 1/2] HYPERFLEET-454 - feat: implement health endpoints standard Consolidate health endpoints on metrics server (port 8080) following HyperFleet health-endpoints.md standard: - Add /healthz (liveness) and /readyz (readiness) endpoints - Remove separate healthcheck server (port 8083) - Integrate readiness state with graceful shutdown - Update Kubernetes probes in deployment.yaml - Update documentation --- AGENTS.md | 11 +- README.md | 3 +- charts/templates/deployment.yaml | 18 +- charts/templates/service.yaml | 4 - charts/values.yaml | 1 - cmd/hyperfleet-api/servecmd/cmd.go | 13 +- .../server/healthcheck_server.go | 87 -------- .../logging/request_logging_middleware.go | 6 - cmd/hyperfleet-api/server/metrics_server.go | 6 + docs/deployment.md | 7 +- docs/development.md | 3 +- docs/hyperfleet-api.http | 9 +- pkg/config/config.go | 24 +- pkg/config/health_check.go | 26 --- pkg/health/handlers.go | 104 +++++++++ pkg/health/handlers_test.go | 209 ++++++++++++++++++ pkg/health/readiness.go | 68 ++++++ pkg/health/readiness_test.go | 100 +++++++++ test/helper.go | 37 +--- 19 files changed, 542 insertions(+), 194 deletions(-) delete mode 100755 cmd/hyperfleet-api/server/healthcheck_server.go delete mode 100755 pkg/config/health_check.go create mode 100644 pkg/health/handlers.go create mode 100644 pkg/health/handlers_test.go create mode 100644 pkg/health/readiness.go create mode 100644 pkg/health/readiness_test.go diff --git a/AGENTS.md b/AGENTS.md index 54e47e2..2382b9b 100755 --- a/AGENTS.md +++ b/AGENTS.md @@ -344,9 +344,7 @@ Serves the hyperfleet REST API with full authentication, database connectivity, - `--ocm-debug` - Enable OCM API debug logging - **Monitoring & Health Checks:** - - `--health-check-server-bindaddress` - Health check server address (default: "localhost:8083") - - `--enable-health-check-https` - Enable HTTPS for health check server - - `--metrics-server-bindaddress` - Metrics server address (default: "localhost:8080") + - `--metrics-server-bindaddress` - Metrics and health endpoints server address (default: "localhost:8080") - `--enable-metrics-https` - Enable HTTPS for metrics server - **Performance Tuning:** @@ -686,8 +684,7 @@ The server is configured in cmd/hyperfleet/server/: **Ports**: - `8000` - Main API server -- `8080` - Metrics endpoint -- `8083` - Health check endpoint +- `8080` - Metrics and health endpoints (`/metrics`, `/healthz`, `/readyz`) **Middleware Chain**: 1. Request logging @@ -772,7 +769,9 @@ The API is designed to be stateless and horizontally scalable: - No event creation or message queues - Kubernetes-ready (multiple replicas) -**Health Check**: `GET /healthcheck` returns 200 OK when database is accessible +**Health Endpoints**: +- `GET /healthz` - Liveness probe, returns 200 OK if the process is alive +- `GET /readyz` - Readiness probe, returns 200 OK when ready to receive traffic (checks database connection) **Metrics**: Prometheus metrics available at `/metrics` diff --git a/README.md b/README.md index f5e27fe..66fcb90 100755 --- a/README.md +++ b/README.md @@ -80,7 +80,8 @@ The service starts on `localhost:8000`: - **REST API**: `http://localhost:8000/api/hyperfleet/v1/` - **OpenAPI spec**: `http://localhost:8000/api/hyperfleet/v1/openapi` - **Swagger UI**: `http://localhost:8000/api/hyperfleet/v1/openapi.html` -- **Health check**: `http://localhost:8083/healthcheck` +- **Liveness probe**: `http://localhost:8080/healthz` +- **Readiness probe**: `http://localhost:8080/readyz` - **Metrics**: `http://localhost:8080/metrics` ```bash diff --git a/charts/templates/deployment.yaml b/charts/templates/deployment.yaml index 7ea0b7c..5e3d600 100644 --- a/charts/templates/deployment.yaml +++ b/charts/templates/deployment.yaml @@ -53,15 +53,11 @@ spec: args: - serve - --api-server-bindaddress={{ .Values.server.bindAddress | default ":8000" }} - - --health-check-server-bindaddress={{ .Values.server.healthBindAddress | default ":8083" }} - --metrics-server-bindaddress={{ .Values.server.metricsBindAddress | default ":8080" }} ports: - name: http containerPort: 8000 protocol: TCP - - name: health - containerPort: 8083 - protocol: TCP - name: metrics containerPort: 8080 protocol: TCP @@ -82,18 +78,18 @@ spec: {{- end }} livenessProbe: httpGet: - path: /healthcheck - port: health - initialDelaySeconds: 30 - periodSeconds: 10 + path: /healthz + port: metrics + initialDelaySeconds: 15 + periodSeconds: 20 timeoutSeconds: 5 failureThreshold: 3 readinessProbe: httpGet: - path: /healthcheck - port: health + path: /readyz + port: metrics initialDelaySeconds: 5 - periodSeconds: 5 + periodSeconds: 10 timeoutSeconds: 3 failureThreshold: 3 resources: diff --git a/charts/templates/service.yaml b/charts/templates/service.yaml index 6eb3628..6f6bbd3 100644 --- a/charts/templates/service.yaml +++ b/charts/templates/service.yaml @@ -11,10 +11,6 @@ spec: targetPort: http protocol: TCP name: http - - port: 8083 - targetPort: health - protocol: TCP - name: health - port: 8080 targetPort: metrics protocol: TCP diff --git a/charts/values.yaml b/charts/values.yaml index d5535f8..d95e53d 100644 --- a/charts/values.yaml +++ b/charts/values.yaml @@ -17,7 +17,6 @@ fullnameOverride: "" # Use ":PORT" format to bind to all interfaces (required for Kubernetes) server: bindAddress: ":8000" - healthBindAddress: ":8083" metricsBindAddress: ":8080" serviceAccount: diff --git a/cmd/hyperfleet-api/servecmd/cmd.go b/cmd/hyperfleet-api/servecmd/cmd.go index 6835d3f..2d3ce86 100755 --- a/cmd/hyperfleet-api/servecmd/cmd.go +++ b/cmd/hyperfleet-api/servecmd/cmd.go @@ -13,6 +13,7 @@ import ( "github.com/openshift-hyperfleet/hyperfleet-api/cmd/hyperfleet-api/environments" "github.com/openshift-hyperfleet/hyperfleet-api/cmd/hyperfleet-api/server" "github.com/openshift-hyperfleet/hyperfleet-api/pkg/api" + "github.com/openshift-hyperfleet/hyperfleet-api/pkg/health" "github.com/openshift-hyperfleet/hyperfleet-api/pkg/logger" "github.com/openshift-hyperfleet/hyperfleet-api/pkg/telemetry" ) @@ -75,8 +76,9 @@ func runServe(cmd *cobra.Command, args []string) { metricsServer := server.NewMetricsServer() go metricsServer.Start() - healthcheckServer := server.NewHealthCheckServer() - go healthcheckServer.Start() + // Mark application as ready to receive traffic + health.GetReadinessState().SetReady() + logger.Info(ctx, "Application ready to receive traffic") sigChan := make(chan os.Signal, 1) signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM) @@ -84,9 +86,10 @@ func runServe(cmd *cobra.Command, args []string) { logger.Info(ctx, "Shutdown signal received, starting graceful shutdown...") - if err := healthcheckServer.Stop(); err != nil { - logger.WithError(ctx, err).Error("Failed to stop healthcheck server") - } + // Mark application as not ready (returns 503 on /readyz) + health.GetReadinessState().SetShuttingDown() + logger.Info(ctx, "Marked as not ready, draining in-flight requests...") + if err := apiServer.Stop(); err != nil { logger.WithError(ctx, err).Error("Failed to stop API server") } diff --git a/cmd/hyperfleet-api/server/healthcheck_server.go b/cmd/hyperfleet-api/server/healthcheck_server.go deleted file mode 100755 index 63f8631..0000000 --- a/cmd/hyperfleet-api/server/healthcheck_server.go +++ /dev/null @@ -1,87 +0,0 @@ -package server - -import ( - "context" - "fmt" - "net" - "net/http" - - health "github.com/docker/go-healthcheck" - "github.com/gorilla/mux" - - "github.com/openshift-hyperfleet/hyperfleet-api/pkg/logger" -) - -var ( - updater = health.NewStatusUpdater() -) - -var _ Server = &healthCheckServer{} - -type healthCheckServer struct { - httpServer *http.Server -} - -func NewHealthCheckServer() *healthCheckServer { - router := mux.NewRouter() - health.DefaultRegistry = health.NewRegistry() - health.Register("maintenance_status", updater) - router.HandleFunc("/healthcheck", health.StatusHandler).Methods(http.MethodGet) - router.HandleFunc("/healthcheck/down", downHandler).Methods(http.MethodPost) - router.HandleFunc("/healthcheck/up", upHandler).Methods(http.MethodPost) - - srv := &http.Server{ - Handler: router, - Addr: env().Config.HealthCheck.BindAddress, - } - - return &healthCheckServer{ - httpServer: srv, - } -} - -func (s healthCheckServer) Start() { - ctx := context.Background() - var err error - if env().Config.HealthCheck.EnableHTTPS { - if env().Config.Server.HTTPSCertFile == "" || env().Config.Server.HTTPSKeyFile == "" { - check( - fmt.Errorf("unspecified required --https-cert-file, --https-key-file"), - "Can't start https server", - ) - } - - // Serve with TLS - logger.With(ctx, logger.FieldBindAddress, env().Config.HealthCheck.BindAddress).Info("Serving HealthCheck with TLS") - err = s.httpServer.ListenAndServeTLS(env().Config.Server.HTTPSCertFile, env().Config.Server.HTTPSKeyFile) - } else { - logger.With(ctx, logger.FieldBindAddress, env().Config.HealthCheck.BindAddress).Info("Serving HealthCheck without TLS") - err = s.httpServer.ListenAndServe() - } - if err != nil && err != http.ErrServerClosed { - check(err, "HealthCheck server terminated with errors") - } else { - logger.Info(ctx, "HealthCheck server terminated") - } -} - -func (s healthCheckServer) Stop() error { - return s.httpServer.Shutdown(context.Background()) -} - -// Listen Unimplemented -func (s healthCheckServer) Listen() (listener net.Listener, err error) { - return nil, nil -} - -// Serve Unimplemented -func (s healthCheckServer) Serve(listener net.Listener) { -} - -func upHandler(w http.ResponseWriter, r *http.Request) { - updater.Update(nil) -} - -func downHandler(w http.ResponseWriter, r *http.Request) { - updater.Update(fmt.Errorf("maintenance mode")) -} diff --git a/cmd/hyperfleet-api/server/logging/request_logging_middleware.go b/cmd/hyperfleet-api/server/logging/request_logging_middleware.go index 2f0bddf..2885a43 100755 --- a/cmd/hyperfleet-api/server/logging/request_logging_middleware.go +++ b/cmd/hyperfleet-api/server/logging/request_logging_middleware.go @@ -7,7 +7,6 @@ import ( "log/slog" "net" "net/http" - "strings" "time" "github.com/openshift-hyperfleet/hyperfleet-api/pkg/logger" @@ -19,11 +18,6 @@ func RequestLoggingMiddleware(masker *middleware.MaskingMiddleware) func(http.Ha return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { ctx := r.Context() - if strings.TrimSuffix(r.URL.Path, "/") == "/healthcheck" { - handler.ServeHTTP(w, r) - return - } - var maskedHeaders http.Header if masker != nil { maskedHeaders = masker.MaskHeaders(r.Header) diff --git a/cmd/hyperfleet-api/server/metrics_server.go b/cmd/hyperfleet-api/server/metrics_server.go index 198ad69..a9b8001 100755 --- a/cmd/hyperfleet-api/server/metrics_server.go +++ b/cmd/hyperfleet-api/server/metrics_server.go @@ -10,6 +10,7 @@ import ( "github.com/openshift-hyperfleet/hyperfleet-api/pkg/api" "github.com/openshift-hyperfleet/hyperfleet-api/pkg/handlers" + "github.com/openshift-hyperfleet/hyperfleet-api/pkg/health" "github.com/openshift-hyperfleet/hyperfleet-api/pkg/logger" ) @@ -21,6 +22,11 @@ func NewMetricsServer() Server { prometheusMetricsHandler := handlers.NewPrometheusMetricsHandler() mainRouter.Handle("/metrics", prometheusMetricsHandler.Handler()) + // health endpoints (HyperFleet standard) + healthHandler := health.NewHandler(env().Database.SessionFactory) + mainRouter.HandleFunc("/healthz", healthHandler.LivenessHandler).Methods(http.MethodGet) + mainRouter.HandleFunc("/readyz", healthHandler.ReadinessHandler).Methods(http.MethodGet) + var mainHandler http.Handler = mainRouter s := &metricsServer{} diff --git a/docs/deployment.md b/docs/deployment.md index b374b28..ebbff37 100644 --- a/docs/deployment.md +++ b/docs/deployment.md @@ -78,8 +78,7 @@ export OPENAPI_SCHEMA_PATH=/path/to/custom-schema.yaml **Server:** - `PORT` - API server port (default: `8000`) -- `METRICS_PORT` - Metrics endpoint port (default: `8080`) -- `HEALTH_PORT` - Health check port (default: `8083`) +- `METRICS_PORT` - Metrics and health endpoints port (default: `8080`) **Logging:** - `LOG_LEVEL` - Logging level: `debug`, `info`, `warn`, `error` (default: `info`) @@ -272,7 +271,9 @@ kubectl get configmaps --namespace hyperfleet-system ## Health Checks -The deployment includes liveness and readiness probes at `GET /healthcheck` (port 8083). +The deployment includes: +- Liveness probe: `GET /healthz` (port 8080) - Returns 200 if the process is alive +- Readiness probe: `GET /readyz` (port 8080) - Returns 200 when ready to receive traffic, 503 during startup/shutdown ## Scaling diff --git a/docs/development.md b/docs/development.md index e0107eb..20acf92 100644 --- a/docs/development.md +++ b/docs/development.md @@ -95,7 +95,8 @@ The service starts on `localhost:8000`: - REST API: `http://localhost:8000/api/hyperfleet/v1/` - OpenAPI spec: `http://localhost:8000/api/hyperfleet/v1/openapi` - Swagger UI: `http://localhost:8000/api/hyperfleet/v1/openapi.html` -- Health check: `http://localhost:8083/healthcheck` +- Liveness probe: `http://localhost:8080/healthz` +- Readiness probe: `http://localhost:8080/readyz` - Metrics: `http://localhost:8080/metrics` ### Testing the API diff --git a/docs/hyperfleet-api.http b/docs/hyperfleet-api.http index 969cc9f..40d658a 100644 --- a/docs/hyperfleet-api.http +++ b/docs/hyperfleet-api.http @@ -5,7 +5,6 @@ @host = localhost @port = 8000 @metrics_port = 8080 -@health_port = 8083 @baseUrl = http://{{host}}:{{port}} @authToken = @@ -193,8 +192,12 @@ Content-Type: application/json } ### -# @name healthCheck -GET http://{{host}}:{{health_port}}/healthcheck +# @name livenessProbe +GET http://{{host}}:{{metrics_port}}/healthz + +### +# @name readinessProbe +GET http://{{host}}:{{metrics_port}}/readyz ### # @name metrics diff --git a/pkg/config/config.go b/pkg/config/config.go index d5851d9..68875cb 100755 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -13,22 +13,20 @@ import ( ) type ApplicationConfig struct { - Server *ServerConfig `json:"server"` - Metrics *MetricsConfig `json:"metrics"` - HealthCheck *HealthCheckConfig `json:"health_check"` - Database *DatabaseConfig `json:"database"` - OCM *OCMConfig `json:"ocm"` - Logging *LoggingConfig `json:"logging"` + Server *ServerConfig `json:"server"` + Metrics *MetricsConfig `json:"metrics"` + Database *DatabaseConfig `json:"database"` + OCM *OCMConfig `json:"ocm"` + Logging *LoggingConfig `json:"logging"` } func NewApplicationConfig() *ApplicationConfig { return &ApplicationConfig{ - Server: NewServerConfig(), - Metrics: NewMetricsConfig(), - HealthCheck: NewHealthCheckConfig(), - Database: NewDatabaseConfig(), - OCM: NewOCMConfig(), - Logging: NewLoggingConfig(), + Server: NewServerConfig(), + Metrics: NewMetricsConfig(), + Database: NewDatabaseConfig(), + OCM: NewOCMConfig(), + Logging: NewLoggingConfig(), } } @@ -36,7 +34,6 @@ func (c *ApplicationConfig) AddFlags(flagset *pflag.FlagSet) { flagset.AddGoFlagSet(flag.CommandLine) c.Server.AddFlags(flagset) c.Metrics.AddFlags(flagset) - c.HealthCheck.AddFlags(flagset) c.Database.AddFlags(flagset) c.OCM.AddFlags(flagset) c.Logging.AddFlags(flagset) @@ -51,7 +48,6 @@ func (c *ApplicationConfig) ReadFiles() []string { {c.Database.ReadFiles, "Database"}, {c.OCM.ReadFiles, "OCM"}, {c.Metrics.ReadFiles, "Metrics"}, - {c.HealthCheck.ReadFiles, "HealthCheck"}, {c.Logging.ReadFiles, "Logging"}, } var messages []string diff --git a/pkg/config/health_check.go b/pkg/config/health_check.go deleted file mode 100755 index e3aaa79..0000000 --- a/pkg/config/health_check.go +++ /dev/null @@ -1,26 +0,0 @@ -package config - -import ( - "github.com/spf13/pflag" -) - -type HealthCheckConfig struct { - BindAddress string `json:"bind_address"` - EnableHTTPS bool `json:"enable_https"` -} - -func NewHealthCheckConfig() *HealthCheckConfig { - return &HealthCheckConfig{ - BindAddress: "localhost:8083", - EnableHTTPS: false, - } -} - -func (c *HealthCheckConfig) AddFlags(fs *pflag.FlagSet) { - fs.StringVar(&c.BindAddress, "health-check-server-bindaddress", c.BindAddress, "Health check server bind adddress") - fs.BoolVar(&c.EnableHTTPS, "enable-health-check-https", c.EnableHTTPS, "Enable HTTPS for health check server") -} - -func (c *HealthCheckConfig) ReadFiles() error { - return nil -} diff --git a/pkg/health/handlers.go b/pkg/health/handlers.go new file mode 100644 index 0000000..c5bf23e --- /dev/null +++ b/pkg/health/handlers.go @@ -0,0 +1,104 @@ +package health + +import ( + "encoding/json" + "net/http" + + "github.com/openshift-hyperfleet/hyperfleet-api/pkg/db" +) + +// CheckFunc is a function that performs a health check and returns an error if unhealthy +type CheckFunc func() error + +// Handler holds the health check handlers and their dependencies +type Handler struct { + sessionFactory db.SessionFactory +} + +// NewHandler creates a new health handler with the given session factory +func NewHandler(sf db.SessionFactory) *Handler { + return &Handler{ + sessionFactory: sf, + } +} + +// HealthResponse represents the JSON response for health endpoints +type HealthResponse struct { + Status string `json:"status"` + Message string `json:"message,omitempty"` + Checks map[string]string `json:"checks,omitempty"` +} + +// LivenessHandler handles GET /healthz requests +// Returns 200 OK if the process is alive, 503 if unhealthy +func (h *Handler) LivenessHandler(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + + response := HealthResponse{ + Status: "ok", + } + + w.WriteHeader(http.StatusOK) + _ = json.NewEncoder(w).Encode(response) +} + +// ReadinessHandler handles GET /readyz requests +// Returns 200 OK if ready to receive traffic, 503 during startup or shutdown +func (h *Handler) ReadinessHandler(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + + readiness := GetReadinessState() + + // Check if shutting down + if readiness.IsShuttingDown() { + response := HealthResponse{ + Status: "error", + Message: "shutting down", + } + w.WriteHeader(http.StatusServiceUnavailable) + _ = json.NewEncoder(w).Encode(response) + return + } + + // Perform readiness checks + checks := make(map[string]string) + allOK := true + + // Check database connection + if h.sessionFactory != nil { + if err := h.sessionFactory.CheckConnection(); err != nil { + checks["database"] = "error" + allOK = false + } else { + checks["database"] = "ok" + } + } else { + checks["database"] = "ok" + } + + // Check if marked as ready + if !readiness.IsReady() { + checks["ready"] = "error" + allOK = false + } else { + checks["ready"] = "ok" + } + + if !allOK { + response := HealthResponse{ + Status: "error", + Message: "not ready", + Checks: checks, + } + w.WriteHeader(http.StatusServiceUnavailable) + _ = json.NewEncoder(w).Encode(response) + return + } + + response := HealthResponse{ + Status: "ok", + Checks: checks, + } + w.WriteHeader(http.StatusOK) + _ = json.NewEncoder(w).Encode(response) +} diff --git a/pkg/health/handlers_test.go b/pkg/health/handlers_test.go new file mode 100644 index 0000000..22a3f33 --- /dev/null +++ b/pkg/health/handlers_test.go @@ -0,0 +1,209 @@ +package health + +import ( + "encoding/json" + "net/http" + "net/http/httptest" + "testing" +) + +func TestLivenessHandler_ReturnsOK(t *testing.T) { + handler := &Handler{sessionFactory: nil} + + req := httptest.NewRequest(http.MethodGet, "/healthz", nil) + w := httptest.NewRecorder() + + handler.LivenessHandler(w, req) + + resp := w.Result() + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode != http.StatusOK { + t.Errorf("expected status %d, got %d", http.StatusOK, resp.StatusCode) + } + + var response HealthResponse + if err := json.NewDecoder(resp.Body).Decode(&response); err != nil { + t.Fatalf("failed to decode response: %v", err) + } + + if response.Status != "ok" { + t.Errorf("expected status 'ok', got '%s'", response.Status) + } +} + +func TestReadinessHandler_ReturnsOKWhenReady(t *testing.T) { + // Reset and set ready state + state := GetReadinessState() + state.Reset() + state.SetReady() + + handler := &Handler{sessionFactory: nil} + + req := httptest.NewRequest(http.MethodGet, "/readyz", nil) + w := httptest.NewRecorder() + + handler.ReadinessHandler(w, req) + + resp := w.Result() + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode != http.StatusOK { + t.Errorf("expected status %d, got %d", http.StatusOK, resp.StatusCode) + } + + var response HealthResponse + if err := json.NewDecoder(resp.Body).Decode(&response); err != nil { + t.Fatalf("failed to decode response: %v", err) + } + + if response.Status != "ok" { + t.Errorf("expected status 'ok', got '%s'", response.Status) + } + + // Reset state for other tests + state.Reset() +} + +func TestReadinessHandler_Returns503WhenNotReady(t *testing.T) { + // Reset state to not ready + state := GetReadinessState() + state.Reset() + + handler := &Handler{sessionFactory: nil} + + req := httptest.NewRequest(http.MethodGet, "/readyz", nil) + w := httptest.NewRecorder() + + handler.ReadinessHandler(w, req) + + resp := w.Result() + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode != http.StatusServiceUnavailable { + t.Errorf("expected status %d, got %d", http.StatusServiceUnavailable, resp.StatusCode) + } + + var response HealthResponse + if err := json.NewDecoder(resp.Body).Decode(&response); err != nil { + t.Fatalf("failed to decode response: %v", err) + } + + if response.Status != "error" { + t.Errorf("expected status 'error', got '%s'", response.Status) + } +} + +func TestReadinessHandler_Returns503WhenShuttingDown(t *testing.T) { + state := GetReadinessState() + state.Reset() + state.SetReady() + state.SetShuttingDown() + + handler := &Handler{sessionFactory: nil} + + req := httptest.NewRequest(http.MethodGet, "/readyz", nil) + w := httptest.NewRecorder() + + handler.ReadinessHandler(w, req) + + resp := w.Result() + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode != http.StatusServiceUnavailable { + t.Errorf("expected status %d, got %d", http.StatusServiceUnavailable, resp.StatusCode) + } + + var response HealthResponse + if err := json.NewDecoder(resp.Body).Decode(&response); err != nil { + t.Fatalf("failed to decode response: %v", err) + } + + if response.Status != "error" { + t.Errorf("expected status 'error', got '%s'", response.Status) + } + + if response.Message != "shutting down" { + t.Errorf("expected message 'shutting down', got '%s'", response.Message) + } + + // Reset state for other tests + state.Reset() +} + +func TestReadinessHandler_IncludesChecks(t *testing.T) { + state := GetReadinessState() + state.Reset() + state.SetReady() + + handler := &Handler{sessionFactory: nil} + + req := httptest.NewRequest(http.MethodGet, "/readyz", nil) + w := httptest.NewRecorder() + + handler.ReadinessHandler(w, req) + + resp := w.Result() + defer func() { _ = resp.Body.Close() }() + + var response HealthResponse + if err := json.NewDecoder(resp.Body).Decode(&response); err != nil { + t.Fatalf("failed to decode response: %v", err) + } + + if response.Checks == nil { + t.Fatal("expected checks to be present") + } + + if response.Checks["database"] != "ok" { + t.Errorf("expected database check 'ok', got '%s'", response.Checks["database"]) + } + + if response.Checks["ready"] != "ok" { + t.Errorf("expected ready check 'ok', got '%s'", response.Checks["ready"]) + } + + // Reset state for other tests + state.Reset() +} + +func TestReadinessHandler_ContentType(t *testing.T) { + state := GetReadinessState() + state.Reset() + state.SetReady() + + handler := &Handler{sessionFactory: nil} + + req := httptest.NewRequest(http.MethodGet, "/readyz", nil) + w := httptest.NewRecorder() + + handler.ReadinessHandler(w, req) + + resp := w.Result() + defer func() { _ = resp.Body.Close() }() + + contentType := resp.Header.Get("Content-Type") + if contentType != "application/json" { + t.Errorf("expected Content-Type 'application/json', got '%s'", contentType) + } + + // Reset state for other tests + state.Reset() +} + +func TestLivenessHandler_ContentType(t *testing.T) { + handler := &Handler{sessionFactory: nil} + + req := httptest.NewRequest(http.MethodGet, "/healthz", nil) + w := httptest.NewRecorder() + + handler.LivenessHandler(w, req) + + resp := w.Result() + defer func() { _ = resp.Body.Close() }() + + contentType := resp.Header.Get("Content-Type") + if contentType != "application/json" { + t.Errorf("expected Content-Type 'application/json', got '%s'", contentType) + } +} diff --git a/pkg/health/readiness.go b/pkg/health/readiness.go new file mode 100644 index 0000000..84a38a3 --- /dev/null +++ b/pkg/health/readiness.go @@ -0,0 +1,68 @@ +package health + +import ( + "sync" +) + +// ReadinessState represents the current readiness state of the application +type ReadinessState struct { + mu sync.RWMutex + ready bool + shutting bool +} + +// globalReadiness is the singleton readiness state for the application +var globalReadiness = &ReadinessState{ + ready: false, + shutting: false, +} + +// GetReadinessState returns the global readiness state +func GetReadinessState() *ReadinessState { + return globalReadiness +} + +// SetReady marks the application as ready to receive traffic +func (r *ReadinessState) SetReady() { + r.mu.Lock() + defer r.mu.Unlock() + r.ready = true +} + +// SetNotReady marks the application as not ready to receive traffic +func (r *ReadinessState) SetNotReady() { + r.mu.Lock() + defer r.mu.Unlock() + r.ready = false +} + +// SetShuttingDown marks the application as shutting down +// This also sets ready to false +func (r *ReadinessState) SetShuttingDown() { + r.mu.Lock() + defer r.mu.Unlock() + r.shutting = true + r.ready = false +} + +// IsReady returns true if the application is ready to receive traffic +func (r *ReadinessState) IsReady() bool { + r.mu.RLock() + defer r.mu.RUnlock() + return r.ready && !r.shutting +} + +// IsShuttingDown returns true if the application is shutting down +func (r *ReadinessState) IsShuttingDown() bool { + r.mu.RLock() + defer r.mu.RUnlock() + return r.shutting +} + +// Reset resets the readiness state (useful for testing) +func (r *ReadinessState) Reset() { + r.mu.Lock() + defer r.mu.Unlock() + r.ready = false + r.shutting = false +} diff --git a/pkg/health/readiness_test.go b/pkg/health/readiness_test.go new file mode 100644 index 0000000..17a1df4 --- /dev/null +++ b/pkg/health/readiness_test.go @@ -0,0 +1,100 @@ +package health + +import ( + "testing" +) + +func TestReadinessState_InitialState(t *testing.T) { + state := &ReadinessState{} + + if state.IsReady() { + t.Error("expected IsReady() to be false initially") + } + + if state.IsShuttingDown() { + t.Error("expected IsShuttingDown() to be false initially") + } +} + +func TestReadinessState_SetReady(t *testing.T) { + state := &ReadinessState{} + + state.SetReady() + + if !state.IsReady() { + t.Error("expected IsReady() to be true after SetReady()") + } + + if state.IsShuttingDown() { + t.Error("expected IsShuttingDown() to be false after SetReady()") + } +} + +func TestReadinessState_SetNotReady(t *testing.T) { + state := &ReadinessState{} + + state.SetReady() + state.SetNotReady() + + if state.IsReady() { + t.Error("expected IsReady() to be false after SetNotReady()") + } +} + +func TestReadinessState_SetShuttingDown(t *testing.T) { + state := &ReadinessState{} + + state.SetReady() + state.SetShuttingDown() + + if state.IsReady() { + t.Error("expected IsReady() to be false after SetShuttingDown()") + } + + if !state.IsShuttingDown() { + t.Error("expected IsShuttingDown() to be true after SetShuttingDown()") + } +} + +func TestReadinessState_Reset(t *testing.T) { + state := &ReadinessState{} + + state.SetReady() + state.SetShuttingDown() + state.Reset() + + if state.IsReady() { + t.Error("expected IsReady() to be false after Reset()") + } + + if state.IsShuttingDown() { + t.Error("expected IsShuttingDown() to be false after Reset()") + } +} + +func TestReadinessState_IsReadyFalseWhenShuttingDown(t *testing.T) { + state := &ReadinessState{} + + // Set ready first + state.SetReady() + if !state.IsReady() { + t.Error("expected IsReady() to be true after SetReady()") + } + + // Then set shutting down + state.SetShuttingDown() + + // IsReady should be false even if ready was set before + if state.IsReady() { + t.Error("expected IsReady() to be false when shutting down") + } +} + +func TestGetReadinessState_ReturnsSingleton(t *testing.T) { + state1 := GetReadinessState() + state2 := GetReadinessState() + + if state1 != state2 { + t.Error("expected GetReadinessState() to return the same instance") + } +} diff --git a/test/helper.go b/test/helper.go index 75a3e45..40e8607 100755 --- a/test/helper.go +++ b/test/helper.go @@ -54,18 +54,17 @@ var jwkURL string type TimeFunc func() time.Time type Helper struct { - Ctx context.Context - DBFactory db.SessionFactory - AppConfig *config.ApplicationConfig - APIServer server.Server - MetricsServer server.Server - HealthCheckServer server.Server - TimeFunc TimeFunc - JWTPrivateKey *rsa.PrivateKey - JWTCA *rsa.PublicKey - T *testing.T - teardowns []func() error - Factories factories.Factories + Ctx context.Context + DBFactory db.SessionFactory + AppConfig *config.ApplicationConfig + APIServer server.Server + MetricsServer server.Server + TimeFunc TimeFunc + JWTPrivateKey *rsa.PrivateKey + JWTCA *rsa.PublicKey + T *testing.T + teardowns []func() error + Factories factories.Factories } func NewHelper(t *testing.T) *Helper { @@ -115,7 +114,6 @@ func NewHelper(t *testing.T) *Helper { } helper.startAPIServer() helper.startMetricsServer() - helper.startHealthCheckServer() }) helper.T = t return helper @@ -180,15 +178,6 @@ func (helper *Helper) stopMetricsServer() error { return nil } -func (helper *Helper) startHealthCheckServer() { - ctx := context.Background() - helper.HealthCheckServer = server.NewHealthCheckServer() - go func() { - logger.Debug(ctx, "Test health check server started") - helper.HealthCheckServer.Start() - logger.Debug(ctx, "Test health check server stopped") - }() -} func (helper *Helper) RestartServer() { ctx := context.Background() @@ -257,10 +246,6 @@ func (helper *Helper) MetricsURL(path string) string { return fmt.Sprintf("http://%s%s", helper.AppConfig.Metrics.BindAddress, path) } -func (helper *Helper) HealthCheckURL(path string) string { - return fmt.Sprintf("http://%s%s", helper.AppConfig.HealthCheck.BindAddress, path) -} - func (helper *Helper) NewApiClient() *openapi.APIClient { config := openapi.NewConfiguration() // Override the server URL to use the local test server From 28df99584d907d839e0f19579a98cc16002649de Mon Sep 17 00:00:00 2001 From: Rafael Benevides Date: Tue, 13 Jan 2026 16:36:50 -0300 Subject: [PATCH 2/2] HYPERFLEET-453 - feat: implement graceful shutdown with signal handling Add configurable graceful shutdown with SHUTDOWN_TIMEOUT environment variable (default 20s). Use signal.NotifyContext for SIGTERM/SIGINT handling and implement proper shutdown sequence: mark not ready, drain HTTP connections, cleanup OpenTelemetry, close database. --- AGENTS.md | 3 + cmd/hyperfleet-api/servecmd/cmd.go | 65 ++++++++---- cmd/hyperfleet-api/server/api_server.go | 4 +- cmd/hyperfleet-api/server/metrics_server.go | 4 +- cmd/hyperfleet-api/server/server.go | 2 +- docs/deployment.md | 3 + pkg/config/config.go | 3 + pkg/config/shutdown.go | 41 ++++++++ pkg/config/shutdown_test.go | 108 ++++++++++++++++++++ test/helper.go | 4 +- 10 files changed, 212 insertions(+), 25 deletions(-) create mode 100644 pkg/config/shutdown.go create mode 100644 pkg/config/shutdown_test.go diff --git a/AGENTS.md b/AGENTS.md index 2382b9b..3b5d7c5 100755 --- a/AGENTS.md +++ b/AGENTS.md @@ -347,6 +347,9 @@ Serves the hyperfleet REST API with full authentication, database connectivity, - `--metrics-server-bindaddress` - Metrics and health endpoints server address (default: "localhost:8080") - `--enable-metrics-https` - Enable HTTPS for metrics server +- **Graceful Shutdown:** + - `--shutdown-timeout` - Graceful shutdown timeout (default: 20s, env: `SHUTDOWN_TIMEOUT`) + - **Performance Tuning:** - `--http-read-timeout` - HTTP server read timeout (default: 5s) - `--http-write-timeout` - HTTP server write timeout (default: 30s) diff --git a/cmd/hyperfleet-api/servecmd/cmd.go b/cmd/hyperfleet-api/servecmd/cmd.go index 2d3ce86..e3f66c0 100755 --- a/cmd/hyperfleet-api/servecmd/cmd.go +++ b/cmd/hyperfleet-api/servecmd/cmd.go @@ -44,11 +44,15 @@ func runServe(cmd *cobra.Command, args []string) { os.Exit(1) } - // Bind environment variables for advanced configuration (OTel, Masking) + // Bind environment variables for advanced configuration (OTel, Masking, Shutdown) environments.Environment().Config.Logging.BindEnv(cmd.PersistentFlags()) + environments.Environment().Config.Shutdown.BindEnv() initLogger() + shutdownTimeout := environments.Environment().Config.Shutdown.Timeout + logger.With(ctx, "shutdown_timeout", shutdownTimeout.String()).Info("Shutdown timeout configured") + var tp *trace.TracerProvider if environments.Environment().Config.Logging.OTel.Enabled { samplingRate := environments.Environment().Config.Logging.OTel.SamplingRate @@ -80,30 +84,55 @@ func runServe(cmd *cobra.Command, args []string) { health.GetReadinessState().SetReady() logger.Info(ctx, "Application ready to receive traffic") - sigChan := make(chan os.Signal, 1) - signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM) - <-sigChan + // Wait for shutdown signal using signal.NotifyContext + signalCtx, stop := signal.NotifyContext(ctx, os.Interrupt, syscall.SIGTERM) + defer stop() + <-signalCtx.Done() logger.Info(ctx, "Shutdown signal received, starting graceful shutdown...") - // Mark application as not ready (returns 503 on /readyz) - health.GetReadinessState().SetShuttingDown() - logger.Info(ctx, "Marked as not ready, draining in-flight requests...") + // Create shutdown context with timeout + shutdownCtx, cancel := context.WithTimeout(context.Background(), shutdownTimeout) + defer cancel() - if err := apiServer.Stop(); err != nil { - logger.WithError(ctx, err).Error("Failed to stop API server") - } - if err := metricsServer.Stop(); err != nil { - logger.WithError(ctx, err).Error("Failed to stop metrics server") - } + // Channel to signal shutdown completion + shutdownComplete := make(chan struct{}) - if tp != nil { - if err := telemetry.Shutdown(context.Background(), tp); err != nil { - logger.WithError(ctx, err).Error("Failed to shutdown OpenTelemetry") + go func() { + defer close(shutdownComplete) + + // Phase 1: Mark application as not ready (returns 503 on /readyz) + health.GetReadinessState().SetShuttingDown() + logger.Info(ctx, "Marked as not ready, draining in-flight requests...") + + // Phase 2-3: Stop servers (stops accepting new connections and drains in-flight requests) + if err := apiServer.Stop(shutdownCtx); err != nil { + logger.WithError(ctx, err).Error("Failed to stop API server") + } + if err := metricsServer.Stop(shutdownCtx); err != nil { + logger.WithError(ctx, err).Error("Failed to stop metrics server") + } + + // Phase 4: Cleanup resources + if tp != nil { + if err := telemetry.Shutdown(shutdownCtx, tp); err != nil { + logger.WithError(ctx, err).Error("Failed to shutdown OpenTelemetry") + } } - } - logger.Info(ctx, "Graceful shutdown completed") + // Close database connections + environments.Environment().Teardown() + logger.Info(ctx, "Resources cleaned up") + }() + + // Wait for shutdown to complete or timeout + select { + case <-shutdownComplete: + logger.Info(ctx, "Graceful shutdown completed") + case <-shutdownCtx.Done(): + logger.Error(ctx, "Shutdown timeout exceeded, forcing exit") + os.Exit(1) + } } // initLogger initializes the global slog logger from configuration diff --git a/cmd/hyperfleet-api/server/api_server.go b/cmd/hyperfleet-api/server/api_server.go index 2e23d0c..a5b2939 100755 --- a/cmd/hyperfleet-api/server/api_server.go +++ b/cmd/hyperfleet-api/server/api_server.go @@ -154,6 +154,6 @@ func (s apiServer) Start() { } } -func (s apiServer) Stop() error { - return s.httpServer.Shutdown(context.Background()) +func (s apiServer) Stop(ctx context.Context) error { + return s.httpServer.Shutdown(ctx) } diff --git a/cmd/hyperfleet-api/server/metrics_server.go b/cmd/hyperfleet-api/server/metrics_server.go index a9b8001..2f154eb 100755 --- a/cmd/hyperfleet-api/server/metrics_server.go +++ b/cmd/hyperfleet-api/server/metrics_server.go @@ -74,6 +74,6 @@ func (s metricsServer) Start() { } } -func (s metricsServer) Stop() error { - return s.httpServer.Shutdown(context.Background()) +func (s metricsServer) Stop(ctx context.Context) error { + return s.httpServer.Shutdown(ctx) } diff --git a/cmd/hyperfleet-api/server/server.go b/cmd/hyperfleet-api/server/server.go index 02a5d14..ae85330 100755 --- a/cmd/hyperfleet-api/server/server.go +++ b/cmd/hyperfleet-api/server/server.go @@ -12,7 +12,7 @@ import ( type Server interface { Start() - Stop() error + Stop(ctx context.Context) error Listen() (net.Listener, error) Serve(net.Listener) } diff --git a/docs/deployment.md b/docs/deployment.md index ebbff37..b0fac4e 100644 --- a/docs/deployment.md +++ b/docs/deployment.md @@ -80,6 +80,9 @@ export OPENAPI_SCHEMA_PATH=/path/to/custom-schema.yaml - `PORT` - API server port (default: `8000`) - `METRICS_PORT` - Metrics and health endpoints port (default: `8080`) +**Graceful Shutdown:** +- `SHUTDOWN_TIMEOUT` - Graceful shutdown timeout (default: `20s`) + **Logging:** - `LOG_LEVEL` - Logging level: `debug`, `info`, `warn`, `error` (default: `info`) - `LOG_FORMAT` - Log format: `json`, `text` (default: `json`) diff --git a/pkg/config/config.go b/pkg/config/config.go index 68875cb..e22e10f 100755 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -18,6 +18,7 @@ type ApplicationConfig struct { Database *DatabaseConfig `json:"database"` OCM *OCMConfig `json:"ocm"` Logging *LoggingConfig `json:"logging"` + Shutdown *ShutdownConfig `json:"shutdown"` } func NewApplicationConfig() *ApplicationConfig { @@ -27,6 +28,7 @@ func NewApplicationConfig() *ApplicationConfig { Database: NewDatabaseConfig(), OCM: NewOCMConfig(), Logging: NewLoggingConfig(), + Shutdown: NewShutdownConfig(), } } @@ -37,6 +39,7 @@ func (c *ApplicationConfig) AddFlags(flagset *pflag.FlagSet) { c.Database.AddFlags(flagset) c.OCM.AddFlags(flagset) c.Logging.AddFlags(flagset) + c.Shutdown.AddFlags(flagset) } func (c *ApplicationConfig) ReadFiles() []string { diff --git a/pkg/config/shutdown.go b/pkg/config/shutdown.go new file mode 100644 index 0000000..0851be7 --- /dev/null +++ b/pkg/config/shutdown.go @@ -0,0 +1,41 @@ +package config + +import ( + "os" + "time" + + "github.com/spf13/pflag" +) + +const ( + // DefaultShutdownTimeout is the default timeout for graceful shutdown + DefaultShutdownTimeout = 20 * time.Second +) + +type ShutdownConfig struct { + Timeout time.Duration `json:"timeout"` +} + +func NewShutdownConfig() *ShutdownConfig { + return &ShutdownConfig{ + Timeout: DefaultShutdownTimeout, + } +} + +func (s *ShutdownConfig) AddFlags(fs *pflag.FlagSet) { + fs.DurationVar(&s.Timeout, "shutdown-timeout", s.Timeout, "Graceful shutdown timeout") +} + +func (s *ShutdownConfig) ReadFiles() error { + return nil +} + +// BindEnv binds environment variables to config values +// Environment variables take precedence over flags +func (s *ShutdownConfig) BindEnv() { + if val := os.Getenv("SHUTDOWN_TIMEOUT"); val != "" { + if d, err := time.ParseDuration(val); err == nil { + s.Timeout = d + } + } +} diff --git a/pkg/config/shutdown_test.go b/pkg/config/shutdown_test.go new file mode 100644 index 0000000..9e7905d --- /dev/null +++ b/pkg/config/shutdown_test.go @@ -0,0 +1,108 @@ +package config + +import ( + "testing" + "time" + + "github.com/spf13/pflag" +) + +func TestNewShutdownConfig_Defaults(t *testing.T) { + cfg := NewShutdownConfig() + + if cfg.Timeout != DefaultShutdownTimeout { + t.Errorf("expected default timeout %v, got %v", DefaultShutdownTimeout, cfg.Timeout) + } + + if cfg.Timeout != 20*time.Second { + t.Errorf("expected timeout 20s, got %v", cfg.Timeout) + } +} + +func TestShutdownConfig_AddFlags(t *testing.T) { + cfg := NewShutdownConfig() + fs := pflag.NewFlagSet("test", pflag.ContinueOnError) + + cfg.AddFlags(fs) + + flag := fs.Lookup("shutdown-timeout") + if flag == nil { + t.Fatal("expected shutdown-timeout flag to be defined") + } + + if flag.DefValue != "20s" { + t.Errorf("expected default value '20s', got '%s'", flag.DefValue) + } +} + +func TestShutdownConfig_AddFlags_CustomValue(t *testing.T) { + cfg := NewShutdownConfig() + fs := pflag.NewFlagSet("test", pflag.ContinueOnError) + + cfg.AddFlags(fs) + + err := fs.Parse([]string{"--shutdown-timeout=30s"}) + if err != nil { + t.Fatalf("failed to parse flags: %v", err) + } + + if cfg.Timeout != 30*time.Second { + t.Errorf("expected timeout 30s, got %v", cfg.Timeout) + } +} + +func TestShutdownConfig_BindEnv(t *testing.T) { + tests := []struct { + name string + envValue string + expected time.Duration + }{ + { + name: "valid duration", + envValue: "45s", + expected: 45 * time.Second, + }, + { + name: "valid duration in minutes", + envValue: "1m", + expected: 1 * time.Minute, + }, + { + name: "invalid duration keeps default", + envValue: "invalid", + expected: DefaultShutdownTimeout, + }, + { + name: "empty value keeps default", + envValue: "", + expected: DefaultShutdownTimeout, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + cfg := NewShutdownConfig() + + if tt.envValue != "" { + t.Setenv("SHUTDOWN_TIMEOUT", tt.envValue) + } else { + t.Setenv("SHUTDOWN_TIMEOUT", "") + } + + cfg.BindEnv() + + if cfg.Timeout != tt.expected { + t.Errorf("expected timeout %v, got %v", tt.expected, cfg.Timeout) + } + }) + } +} + +func TestShutdownConfig_ReadFiles(t *testing.T) { + cfg := NewShutdownConfig() + + err := cfg.ReadFiles() + if err != nil { + t.Errorf("expected no error, got %v", err) + } +} diff --git a/test/helper.go b/test/helper.go index 40e8607..6736b43 100755 --- a/test/helper.go +++ b/test/helper.go @@ -155,7 +155,7 @@ func (helper *Helper) startAPIServer() { } func (helper *Helper) stopAPIServer() error { - if err := helper.APIServer.Stop(); err != nil { + if err := helper.APIServer.Stop(context.Background()); err != nil { return fmt.Errorf("unable to stop api server: %s", err.Error()) } return nil @@ -172,7 +172,7 @@ func (helper *Helper) startMetricsServer() { } func (helper *Helper) stopMetricsServer() error { - if err := helper.MetricsServer.Stop(); err != nil { + if err := helper.MetricsServer.Stop(context.Background()); err != nil { return fmt.Errorf("unable to stop metrics server: %s", err.Error()) } return nil