From e3b15d777209befb3de6af5f62f090a338eaf30b Mon Sep 17 00:00:00 2001 From: Alessandro Affinito Date: Tue, 23 Dec 2025 17:10:18 +0100 Subject: [PATCH 1/6] Add pprof profiling support - Introduced environment variables for pprof configuration in const.go. - Updated main.go to start a pprof server if enabled. - Enhanced Makefile with pprof-related targets for profiling and load testing. - Modified Dockerfile.ci-rp to enable pprof during test runs. - Updated .gitignore to exclude pprof data directory. --- .gitignore | 1 + Dockerfile.ci-rp | 5 +- Makefile | 163 +++++++++++++++++++ cmd/aro/const.go | 14 ++ cmd/aro/main.go | 20 ++- cmd/aro/pprof.go | 144 +++++++++++++++++ cmd/aro/pprof_test.go | 356 ++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 696 insertions(+), 7 deletions(-) create mode 100644 cmd/aro/pprof.go create mode 100644 cmd/aro/pprof_test.go diff --git a/.gitignore b/.gitignore index 92c3e41c8bf..8d073ce3360 100644 --- a/.gitignore +++ b/.gitignore @@ -29,6 +29,7 @@ gomock_reflect_* /coverage.* /report.xml /e2e-report.xml +/pprof-data/ /deploy/config.yaml **/*.swp /portal/v2/node_modules/ diff --git a/Dockerfile.ci-rp b/Dockerfile.ci-rp index b628f322745..acf3c2aa76a 100644 --- a/Dockerfile.ci-rp +++ b/Dockerfile.ci-rp @@ -59,7 +59,10 @@ RUN go build -ldflags "-X github.com/Azure/ARO-RP/pkg/util/version.GitCommit=${A RUN go test ./test/e2e/... -tags e2e,codec.safe -c -ldflags "-X github.com/Azure/ARO-RP/pkg/util/version.GitCommit=${ARO_VERSION}" -o e2e.test -# Additional tests +# Additional tests with pprof enabled +# PPROF_ENABLED=true enables profiling during test runs +ENV PPROF_ENABLED=true +ENV PPROF_PORT=6060 RUN gotestsum --format pkgname --junitfile report.xml -- -coverprofile=cover.out ./... 
\ && gocov convert cover.out | gocov-xml > coverage.xml diff --git a/Makefile b/Makefile index c4fe6fb7511..fbbc26d9032 100644 --- a/Makefile +++ b/Makefile @@ -529,3 +529,166 @@ run-selenium: .PHONY: validate-roledef validate-roledef: go run ./hack/role -verified-version "$(OCP_VERSION)" -oc-bin=$(OC) + +############################################################################### +# pprof Profiling Targets +############################################################################### + +# pprof configuration +PPROF_HOST ?= 127.0.0.1 +PPROF_PORT ?= 6060 +PPROF_URL = http://$(PPROF_HOST):$(PPROF_PORT) +PPROF_OUTPUT_DIR ?= ./pprof-data +PPROF_DURATION ?= 30s + +# Load test configuration +LOADTEST_URL ?= https://localhost:8443/healthz/ready +LOADTEST_DURATION ?= 30s +LOADTEST_RATE ?= 100 + +.PHONY: pprof-check +pprof-check: ## Check if pprof server is running + @echo "Checking pprof server at $(PPROF_URL)..." + @curl -s -o /dev/null -w "%{http_code}" $(PPROF_URL)/debug/pprof/ | grep -q "200" && \ + echo "✓ pprof server is running" || \ + (echo "✗ pprof server is not running. Start it with: make runlocal-rp" && exit 1) + +.PHONY: pprof-collect-all +pprof-collect-all: ## Collect all pprof profiles (CPU, heap, goroutine, etc.) + @mkdir -p $(PPROF_OUTPUT_DIR) + @echo "Collecting pprof profiles from $(PPROF_URL)..." + @echo "Output directory: $(PPROF_OUTPUT_DIR)" + @echo "" + @echo "Collecting CPU profile ($(PPROF_DURATION))..." + @curl -s "$(PPROF_URL)/debug/pprof/profile?seconds=$$(echo $(PPROF_DURATION) | sed 's/s//')" -o $(PPROF_OUTPUT_DIR)/cpu.prof && \ + echo " ✓ CPU profile saved to $(PPROF_OUTPUT_DIR)/cpu.prof" || \ + echo " ✗ Failed to collect CPU profile" + @echo "Collecting heap profile..." + @curl -s "$(PPROF_URL)/debug/pprof/heap" -o $(PPROF_OUTPUT_DIR)/heap.prof && \ + echo " ✓ Heap profile saved to $(PPROF_OUTPUT_DIR)/heap.prof" || \ + echo " ✗ Failed to collect heap profile" + @echo "Collecting allocs profile..." 
+ @curl -s "$(PPROF_URL)/debug/pprof/allocs" -o $(PPROF_OUTPUT_DIR)/allocs.prof && \ + echo " ✓ Allocs profile saved to $(PPROF_OUTPUT_DIR)/allocs.prof" || \ + echo " ✗ Failed to collect allocs profile" + @echo "Collecting goroutine profile..." + @curl -s "$(PPROF_URL)/debug/pprof/goroutine" -o $(PPROF_OUTPUT_DIR)/goroutine.prof && \ + echo " ✓ Goroutine profile saved to $(PPROF_OUTPUT_DIR)/goroutine.prof" || \ + echo " ✗ Failed to collect goroutine profile" + @echo "Collecting threadcreate profile..." + @curl -s "$(PPROF_URL)/debug/pprof/threadcreate" -o $(PPROF_OUTPUT_DIR)/threadcreate.prof && \ + echo " ✓ Threadcreate profile saved to $(PPROF_OUTPUT_DIR)/threadcreate.prof" || \ + echo " ✗ Failed to collect threadcreate profile" + @echo "Collecting block profile..." + @curl -s "$(PPROF_URL)/debug/pprof/block" -o $(PPROF_OUTPUT_DIR)/block.prof && \ + echo " ✓ Block profile saved to $(PPROF_OUTPUT_DIR)/block.prof" || \ + echo " ✗ Failed to collect block profile" + @echo "Collecting mutex profile..." + @curl -s "$(PPROF_URL)/debug/pprof/mutex" -o $(PPROF_OUTPUT_DIR)/mutex.prof && \ + echo " ✓ Mutex profile saved to $(PPROF_OUTPUT_DIR)/mutex.prof" || \ + echo " ✗ Failed to collect mutex profile" + @echo "Collecting trace (5s)..." + @curl -s "$(PPROF_URL)/debug/pprof/trace?seconds=5" -o $(PPROF_OUTPUT_DIR)/trace.out && \ + echo " ✓ Trace saved to $(PPROF_OUTPUT_DIR)/trace.out" || \ + echo " ✗ Failed to collect trace" + @echo "" + @echo "Profile collection complete!" + @echo "" + @echo "To view profiles, use:" + @echo " go tool pprof -http=:8888 $(PPROF_OUTPUT_DIR)/cpu.prof" + @echo " go tool pprof -http=:8888 $(PPROF_OUTPUT_DIR)/heap.prof" + @echo " go tool trace $(PPROF_OUTPUT_DIR)/trace.out" + +.PHONY: pprof-cpu +pprof-cpu: ## Collect and open CPU profile in browser + @echo "Collecting CPU profile for $(PPROF_DURATION)..." 
+ go tool pprof -http=:8888 "$(PPROF_URL)/debug/pprof/profile?seconds=$$(echo $(PPROF_DURATION) | sed 's/s//')" + +.PHONY: pprof-heap +pprof-heap: ## Collect and open heap profile in browser + @echo "Opening heap profile..." + go tool pprof -http=:8888 "$(PPROF_URL)/debug/pprof/heap" + +.PHONY: pprof-goroutine +pprof-goroutine: ## Collect and open goroutine profile in browser + @echo "Opening goroutine profile..." + go tool pprof -http=:8888 "$(PPROF_URL)/debug/pprof/goroutine" + +.PHONY: pprof-allocs +pprof-allocs: ## Collect and open allocs profile in browser + @echo "Opening allocs profile..." + go tool pprof -http=:8888 "$(PPROF_URL)/debug/pprof/allocs" + +.PHONY: pprof-block +pprof-block: ## Collect and open block profile in browser + @echo "Opening block profile..." + go tool pprof -http=:8888 "$(PPROF_URL)/debug/pprof/block" + +.PHONY: pprof-mutex +pprof-mutex: ## Collect and open mutex profile in browser + @echo "Opening mutex profile..." + go tool pprof -http=:8888 "$(PPROF_URL)/debug/pprof/mutex" + +.PHONY: pprof-trace +pprof-trace: ## Collect and open execution trace in browser + @mkdir -p $(PPROF_OUTPUT_DIR) + @echo "Collecting trace for 5 seconds..." + @curl -s "$(PPROF_URL)/debug/pprof/trace?seconds=5" -o $(PPROF_OUTPUT_DIR)/trace.out + @echo "Opening trace viewer..." + go tool trace $(PPROF_OUTPUT_DIR)/trace.out + +.PHONY: loadtest-hey +loadtest-hey: ## Run load test using hey (install: go install github.com/rakyll/hey@latest) + @command -v hey >/dev/null 2>&1 || { echo "hey not found. Install with: go install github.com/rakyll/hey@latest"; exit 1; } + @echo "Running load test with hey..." 
+ @echo " URL: $(LOADTEST_URL)" + @echo " Duration: $(LOADTEST_DURATION)" + @echo " Rate: $(LOADTEST_RATE) req/s" + hey -z $(LOADTEST_DURATION) -q $(LOADTEST_RATE) -disable-keepalive $(LOADTEST_URL) + +.PHONY: loadtest-vegeta +loadtest-vegeta: ## Run load test using vegeta (install: go install github.com/tsenart/vegeta@latest) + @command -v vegeta >/dev/null 2>&1 || { echo "vegeta not found. Install with: go install github.com/tsenart/vegeta@latest"; exit 1; } + @mkdir -p $(PPROF_OUTPUT_DIR) + @echo "Running load test with vegeta..." + @echo " URL: $(LOADTEST_URL)" + @echo " Duration: $(LOADTEST_DURATION)" + @echo " Rate: $(LOADTEST_RATE) req/s" + @echo "GET $(LOADTEST_URL)" | vegeta attack -duration=$(LOADTEST_DURATION) -rate=$(LOADTEST_RATE) -insecure | \ + tee $(PPROF_OUTPUT_DIR)/vegeta-results.bin | vegeta report + @echo "" + @echo "Results saved to $(PPROF_OUTPUT_DIR)/vegeta-results.bin" + @echo "Generate HTML report: vegeta report -type=html $(PPROF_OUTPUT_DIR)/vegeta-results.bin > $(PPROF_OUTPUT_DIR)/vegeta-report.html" + +.PHONY: pprof-loadtest +pprof-loadtest: ## Run load test and collect pprof profiles simultaneously + @command -v hey >/dev/null 2>&1 || { echo "hey not found. Install with: go install github.com/rakyll/hey@latest"; exit 1; } + @mkdir -p $(PPROF_OUTPUT_DIR) + @echo "Starting load test and profile collection..." + @echo "" + @echo "Step 1: Starting CPU profile collection in background ($(PPROF_DURATION))..." + @(curl -s "$(PPROF_URL)/debug/pprof/profile?seconds=$$(echo $(PPROF_DURATION) | sed 's/s//')" -o $(PPROF_OUTPUT_DIR)/loadtest-cpu.prof && \ + echo "CPU profile saved to $(PPROF_OUTPUT_DIR)/loadtest-cpu.prof") & + @sleep 2 + @echo "Step 2: Running load test..." + hey -z $(LOADTEST_DURATION) -q $(LOADTEST_RATE) -disable-keepalive $(LOADTEST_URL) || true + @echo "" + @echo "Step 3: Collecting heap profile..." 
+ @curl -s "$(PPROF_URL)/debug/pprof/heap" -o $(PPROF_OUTPUT_DIR)/loadtest-heap.prof + @echo "Heap profile saved to $(PPROF_OUTPUT_DIR)/loadtest-heap.prof" + @echo "" + @echo "Step 4: Collecting goroutine profile..." + @curl -s "$(PPROF_URL)/debug/pprof/goroutine" -o $(PPROF_OUTPUT_DIR)/loadtest-goroutine.prof + @echo "Goroutine profile saved to $(PPROF_OUTPUT_DIR)/loadtest-goroutine.prof" + @echo "" + @echo "Load test and profiling complete!" + @echo "" + @echo "View profiles with:" + @echo " go tool pprof -http=:8888 $(PPROF_OUTPUT_DIR)/loadtest-cpu.prof" + @echo " go tool pprof -http=:8888 $(PPROF_OUTPUT_DIR)/loadtest-heap.prof" + @echo " go tool pprof -http=:8888 $(PPROF_OUTPUT_DIR)/loadtest-goroutine.prof" + +.PHONY: pprof-clean +pprof-clean: ## Clean up pprof output directory + rm -rf $(PPROF_OUTPUT_DIR) + @echo "Cleaned up $(PPROF_OUTPUT_DIR)" diff --git a/cmd/aro/const.go b/cmd/aro/const.go index 4177b5f9fb5..3d8da8b7fba 100644 --- a/cmd/aro/const.go +++ b/cmd/aro/const.go @@ -7,4 +7,18 @@ const ( envOpenShiftVersions = "OPENSHIFT_VERSIONS" envInstallerImageDigests = "INSTALLER_IMAGE_DIGESTS" envPlatformWorkloadIdentityRoleSets = "PLATFORM_WORKLOAD_IDENTITY_ROLE_SETS" + + // pprof configuration environment variables + // PPROF_ENABLED: Set to "true" or "1" to enable pprof server. + // Defaults to enabled only in development mode (RP_MODE=development). + envPprofEnabled = "PPROF_ENABLED" + + // PPROF_PORT: TCP port for the pprof HTTP server. + // Defaults to 6060. + envPprofPort = "PPROF_PORT" + + // PPROF_HOST: Host address for the pprof server. + // Restricted to localhost addresses for security. + // Defaults to 127.0.0.1. 
+	envPprofHost = "PPROF_HOST"
 )
diff --git a/cmd/aro/main.go b/cmd/aro/main.go
index 985d2d02583..c4b84f4c754 100644
--- a/cmd/aro/main.go
+++ b/cmd/aro/main.go
@@ -7,12 +7,9 @@ import (
 	"context"
 	"flag"
 	"fmt"
-	"net/http"
 	"os"
 	"strings"
 
-	_ "net/http/pprof"
-
 	"github.com/Azure/ARO-RP/pkg/env"
 	utillog "github.com/Azure/ARO-RP/pkg/util/log"
 	_ "github.com/Azure/ARO-RP/pkg/util/scheme"
@@ -42,9 +39,20 @@ func main() {
 	serviceName := serviceForCommand(flag.Arg(0))
 	log := env.LoggerForService(serviceName, utillog.GetLogger())
 
-	go func() {
-		log.Warn(http.ListenAndServe("localhost:6060", nil))
-	}()
+	// Start pprof server if enabled
+	pprofSrv, pprofErr := newPprofServer(log)
+	if pprofErr != nil {
+		log.Warnf("failed to create pprof server: %v", pprofErr)
+	} else if pprofSrv != nil {
+		if startErr := pprofSrv.Start(ctx); startErr != nil {
+			log.Warnf("failed to start pprof server: %v", startErr)
+		}
+		defer func() {
+			if stopErr := pprofSrv.Stop(ctx); stopErr != nil {
+				log.Warnf("failed to stop pprof server: %v", stopErr)
+			}
+		}()
+	}
 
 	log.Printf("starting, git commit %s", version.GitCommit)
 	log.Printf("command line: '%s'", strings.Join(os.Args, " "))
diff --git a/cmd/aro/pprof.go b/cmd/aro/pprof.go
new file mode 100644
index 00000000000..2ad99579dac
--- /dev/null
+++ b/cmd/aro/pprof.go
@@ -0,0 +1,167 @@
+package main
+
+// Copyright (c) Microsoft Corporation.
+// Licensed under the Apache License 2.0.
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"net"
+	"net/http"
+	"os"
+	"strconv"
+	"strings"
+	"time"
+
+	// Importing net/http/pprof registers the following handlers on http.DefaultServeMux:
+	//   /debug/pprof/ - index page listing available profiles
+	//   /debug/pprof/cmdline - command line invocation
+	//   /debug/pprof/profile - CPU profile (accepts ?seconds=N)
+	//   /debug/pprof/symbol - symbol lookup
+	//   /debug/pprof/trace - execution trace (accepts ?seconds=N)
+	//   /debug/pprof/heap - heap profile
+	//   /debug/pprof/goroutine - goroutine profile
+	//   /debug/pprof/allocs - allocation profile
+	//   /debug/pprof/block - block profile
+	//   /debug/pprof/mutex - mutex profile
+	//   /debug/pprof/threadcreate - thread creation profile
+	_ "net/http/pprof"
+
+	"github.com/sirupsen/logrus"
+)
+
+const (
+	defaultPprofPort  = 6060
+	defaultPprofHost  = "127.0.0.1"
+	pprofReadTimeout  = 30 * time.Second
+	pprofWriteTimeout = 60 * time.Second
+)
+
+// pprofServer provides a production-ready pprof HTTP server with:
+// - Environment variable configuration (PPROF_ENABLED, PPROF_PORT, PPROF_HOST)
+// - Localhost-only binding and request validation for security
+// - Port collision detection and graceful shutdown
+type pprofServer struct {
+	log      *logrus.Entry
+	server   *http.Server
+	listener net.Listener
+	port     int
+	host     string
+}
+
+// newPprofServer builds an unstarted pprof server from the PPROF_* environment.
+// It returns (nil, nil) when isPprofEnabled reports false; the error result is
+// currently always nil.
+func newPprofServer(log *logrus.Entry) (*pprofServer, error) {
+	if !isPprofEnabled() {
+		log.Info("pprof server disabled via environment variable")
+		return nil, nil
+	}
+
+	return &pprofServer{
+		log:  log,
+		port: getPprofPort(),
+		host: getPprofHost(),
+	}, nil
+}
+
+// isPprofEnabled reports whether the pprof server should run. A non-empty
+// PPROF_ENABLED enables it only for "true" (any case) or "1"; when it is empty,
+// pprof is enabled only if RP_MODE is "development" (any case).
+func isPprofEnabled() bool {
+	val := os.Getenv(envPprofEnabled)
+	if val == "" {
+		return strings.EqualFold(os.Getenv("RP_MODE"), "development")
+	}
+	return strings.EqualFold(val, "true") || val == "1"
+}
+
+// getPprofPort returns PPROF_PORT when it parses as an integer in 1..65535,
+// and defaultPprofPort otherwise.
+func getPprofPort() int {
+	if port, err := strconv.Atoi(os.Getenv(envPprofPort)); err == nil && port > 0 && port <= 65535 {
+		return port
+	}
+	return defaultPprofPort
+}
+
+// getPprofHost returns PPROF_HOST when isLocalhostAddr accepts it, and
+// defaultPprofHost otherwise (including when the variable is empty).
+func getPprofHost() string {
+	host := os.Getenv(envPprofHost)
+	if host == "" || !isLocalhostAddr(host) {
+		return defaultPprofHost
+	}
+	return host
+}
+
+// isLocalhostAddr reports whether addr is exactly one of the accepted loopback
+// spellings. It is a literal string comparison, not a general loopback test.
+func isLocalhostAddr(addr string) bool {
+	return addr == "127.0.0.1" || addr == "localhost" || addr == "::1" || addr == "[::1]"
+}
+
+// Start binds the listener synchronously, so a bind failure such as a port
+// collision is returned to the caller, and then serves http.DefaultServeMux
+// (wrapped in localhostOnly) on a background goroutine. Calling Start on a nil
+// receiver is a no-op.
+func (p *pprofServer) Start(ctx context.Context) error {
+	if p == nil {
+		return nil
+	}
+
+	// net.JoinHostPort brackets IPv6 literals. Brackets already present in the
+	// configured host are stripped first, so "::1" and "[::1]" both yield
+	// "[::1]:<port>" (an unbracketed "::1:<port>" is not a valid listen address).
+	addr := net.JoinHostPort(strings.Trim(p.host, "[]"), strconv.Itoa(p.port))
+	ln, err := net.Listen("tcp", addr)
+	if err != nil {
+		return fmt.Errorf("pprof: failed to listen on %s: %w", addr, err)
+	}
+	p.listener = ln
+
+	p.server = &http.Server{
+		Handler:      p.localhostOnly(http.DefaultServeMux),
+		ReadTimeout:  pprofReadTimeout,
+		WriteTimeout: pprofWriteTimeout,
+		BaseContext:  func(net.Listener) context.Context { return ctx },
+	}
+
+	p.log.Infof("pprof server listening on %s", addr)
+
+	go func() {
+		if err := p.server.Serve(p.listener); err != nil && !errors.Is(err, http.ErrServerClosed) {
+			p.log.Warnf("pprof server error: %v", err)
+		}
+	}()
+
+	return nil
+}
+
+// Stop gracefully shuts the server down, waiting at most 5 seconds (less if ctx
+// ends first). It is a no-op on a nil receiver or when Start never created the
+// server.
+func (p *pprofServer) Stop(ctx context.Context) error {
+	if p == nil || p.server == nil {
+		return nil
+	}
+	p.log.Info("stopping pprof server")
+	shutdownCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
+	defer cancel()
+	return p.server.Shutdown(shutdownCtx)
+}
+
+// localhostOnly rejects requests from non-localhost addresses with 403 Forbidden,
+// judged by the host part of r.RemoteAddr.
+func (p *pprofServer) localhostOnly(next http.Handler) http.Handler {
+	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		host, _, err := net.SplitHostPort(r.RemoteAddr)
+		if err != nil || !isLocalhostAddr(host) {
+			p.log.Warnf("pprof: rejected request from %s", r.RemoteAddr)
+			http.Error(w, "Forbidden", http.StatusForbidden)
+			return
+		}
+		next.ServeHTTP(w, r)
+	})
+}
diff --git a/cmd/aro/pprof_test.go b/cmd/aro/pprof_test.go
new file mode 100644
index 00000000000..05193c44c33
--- /dev/null
+++ b/cmd/aro/pprof_test.go
@@ -0,0 +1,356 @@
+package main
+
+// Copyright (c) Microsoft Corporation.
+// Licensed under the Apache License 2.0. + +import ( + "context" + "net/http" + "os" + "testing" + "time" + + "github.com/sirupsen/logrus" +) + +func TestIsPprofEnabled(t *testing.T) { + tests := []struct { + name string + envPprof string + envRPMode string + wantEnabled bool + }{ + { + name: "explicitly enabled", + envPprof: "true", + envRPMode: "", + wantEnabled: true, + }, + { + name: "explicitly enabled with 1", + envPprof: "1", + envRPMode: "", + wantEnabled: true, + }, + { + name: "explicitly disabled", + envPprof: "false", + envRPMode: "", + wantEnabled: false, + }, + { + name: "default in development mode", + envPprof: "", + envRPMode: "development", + wantEnabled: true, + }, + { + name: "default in production mode", + envPprof: "", + envRPMode: "", + wantEnabled: false, + }, + { + name: "case insensitive true", + envPprof: "TRUE", + envRPMode: "", + wantEnabled: true, + }, + { + name: "case insensitive development", + envPprof: "", + envRPMode: "DEVELOPMENT", + wantEnabled: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Save and restore environment + origPprof := os.Getenv(envPprofEnabled) + origRPMode := os.Getenv("RP_MODE") + defer func() { + os.Setenv(envPprofEnabled, origPprof) + os.Setenv("RP_MODE", origRPMode) + }() + + os.Setenv(envPprofEnabled, tt.envPprof) + os.Setenv("RP_MODE", tt.envRPMode) + + got := isPprofEnabled() + if got != tt.wantEnabled { + t.Errorf("isPprofEnabled() = %v, want %v", got, tt.wantEnabled) + } + }) + } +} + +func TestGetPprofPort(t *testing.T) { + tests := []struct { + name string + envValue string + wantPort int + }{ + { + name: "default port", + envValue: "", + wantPort: defaultPprofPort, + }, + { + name: "custom port", + envValue: "7070", + wantPort: 7070, + }, + { + name: "invalid port string", + envValue: "invalid", + wantPort: defaultPprofPort, + }, + { + name: "port too low", + envValue: "0", + wantPort: defaultPprofPort, + }, + { + name: "port too high", + envValue: 
"65536", + wantPort: defaultPprofPort, + }, + { + name: "negative port", + envValue: "-1", + wantPort: defaultPprofPort, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + origValue := os.Getenv(envPprofPort) + defer os.Setenv(envPprofPort, origValue) + + os.Setenv(envPprofPort, tt.envValue) + + got := getPprofPort() + if got != tt.wantPort { + t.Errorf("getPprofPort() = %v, want %v", got, tt.wantPort) + } + }) + } +} + +func TestGetPprofHost(t *testing.T) { + tests := []struct { + name string + envValue string + wantHost string + }{ + { + name: "default host", + envValue: "", + wantHost: defaultPprofHost, + }, + { + name: "localhost", + envValue: "localhost", + wantHost: "localhost", + }, + { + name: "127.0.0.1", + envValue: "127.0.0.1", + wantHost: "127.0.0.1", + }, + { + name: "::1 ipv6", + envValue: "::1", + wantHost: "::1", + }, + { + name: "non-localhost blocked", + envValue: "0.0.0.0", + wantHost: defaultPprofHost, + }, + { + name: "external IP blocked", + envValue: "192.168.1.1", + wantHost: defaultPprofHost, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + origValue := os.Getenv(envPprofHost) + defer os.Setenv(envPprofHost, origValue) + + os.Setenv(envPprofHost, tt.envValue) + + got := getPprofHost() + if got != tt.wantHost { + t.Errorf("getPprofHost() = %v, want %v", got, tt.wantHost) + } + }) + } +} + +func TestIsLocalhostAddr(t *testing.T) { + tests := []struct { + addr string + want bool + }{ + {"127.0.0.1", true}, + {"localhost", true}, + {"::1", true}, + {"[::1]", true}, + {"0.0.0.0", false}, + {"192.168.1.1", false}, + {"10.0.0.1", false}, + {"example.com", false}, + {"", false}, + } + + for _, tt := range tests { + t.Run(tt.addr, func(t *testing.T) { + got := isLocalhostAddr(tt.addr) + if got != tt.want { + t.Errorf("isLocalhostAddr(%q) = %v, want %v", tt.addr, got, tt.want) + } + }) + } +} + +func TestNewPprofServerDisabled(t *testing.T) { + // Save and restore environment + origPprof := 
os.Getenv(envPprofEnabled) + origRPMode := os.Getenv("RP_MODE") + defer func() { + os.Setenv(envPprofEnabled, origPprof) + os.Setenv("RP_MODE", origRPMode) + }() + + os.Setenv(envPprofEnabled, "false") + os.Setenv("RP_MODE", "") + + log := logrus.NewEntry(logrus.New()) + srv, err := newPprofServer(log) + + if err != nil { + t.Errorf("newPprofServer() error = %v, want nil", err) + } + if srv != nil { + t.Errorf("newPprofServer() = %v, want nil when disabled", srv) + } +} + +func TestPprofServerStartStop(t *testing.T) { + // Save and restore environment + origPprof := os.Getenv(envPprofEnabled) + origRPMode := os.Getenv("RP_MODE") + origPort := os.Getenv(envPprofPort) + defer func() { + os.Setenv(envPprofEnabled, origPprof) + os.Setenv("RP_MODE", origRPMode) + os.Setenv(envPprofPort, origPort) + }() + + os.Setenv(envPprofEnabled, "true") + os.Setenv("RP_MODE", "") + // Use a random high port to avoid conflicts + os.Setenv(envPprofPort, "16060") + + log := logrus.NewEntry(logrus.New()) + srv, err := newPprofServer(log) + if err != nil { + t.Fatalf("newPprofServer() error = %v", err) + } + if srv == nil { + t.Fatal("newPprofServer() returned nil") + } + + ctx := context.Background() + + // Start the server + if err := srv.Start(ctx); err != nil { + t.Fatalf("Start() error = %v", err) + } + + // Give the server time to start + time.Sleep(100 * time.Millisecond) + + // Verify pprof endpoints are accessible + resp, err := http.Get("http://127.0.0.1:16060/debug/pprof/") + if err != nil { + t.Errorf("Failed to access pprof index: %v", err) + } else { + resp.Body.Close() + if resp.StatusCode != http.StatusOK { + t.Errorf("pprof index returned status %d, want %d", resp.StatusCode, http.StatusOK) + } + } + + // Stop the server + if err := srv.Stop(ctx); err != nil { + t.Errorf("Stop() error = %v", err) + } + + // Give the server time to stop + time.Sleep(100 * time.Millisecond) + + // Verify the server is no longer responding + _, err = 
http.Get("http://127.0.0.1:16060/debug/pprof/") + if err == nil { + t.Error("Server should not be responding after Stop()") + } +} + +func TestPprofServerPortCollision(t *testing.T) { + // Save and restore environment + origPprof := os.Getenv(envPprofEnabled) + origRPMode := os.Getenv("RP_MODE") + origPort := os.Getenv(envPprofPort) + defer func() { + os.Setenv(envPprofEnabled, origPprof) + os.Setenv("RP_MODE", origRPMode) + os.Setenv(envPprofPort, origPort) + }() + + os.Setenv(envPprofEnabled, "true") + os.Setenv("RP_MODE", "") + os.Setenv(envPprofPort, "16061") + + log := logrus.NewEntry(logrus.New()) + ctx := context.Background() + + // Start first server + srv1, err := newPprofServer(log) + if err != nil { + t.Fatalf("newPprofServer() error = %v", err) + } + if err := srv1.Start(ctx); err != nil { + t.Fatalf("First Start() error = %v", err) + } + defer srv1.Stop(ctx) + + // Try to start second server on same port + srv2, err := newPprofServer(log) + if err != nil { + t.Fatalf("newPprofServer() error = %v", err) + } + + err = srv2.Start(ctx) + if err == nil { + srv2.Stop(ctx) + t.Error("Second Start() should have failed due to port collision") + } +} + +func TestPprofServerNilSafe(t *testing.T) { + var srv *pprofServer + ctx := context.Background() + + // These should not panic + if err := srv.Start(ctx); err != nil { + t.Errorf("Start() on nil server should return nil, got %v", err) + } + if err := srv.Stop(ctx); err != nil { + t.Errorf("Stop() on nil server should return nil, got %v", err) + } +} From 353d15c32c01c9254be33cc159319ed5ad9cae3c Mon Sep 17 00:00:00 2001 From: Alessandro Affinito Date: Tue, 23 Dec 2025 17:10:25 +0100 Subject: [PATCH 2/6] Add pprof profiling configuration to env.example --- env.example | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/env.example b/env.example index 91e03a729a0..d57c6b88d34 100644 --- a/env.example +++ b/env.example @@ -27,5 +27,12 @@ export TAG=latest export LOCAL_VPN_IMAGE=vpn export 
E2E_LABEL='!smoke&&!regressiontest' +# pprof profiling configuration +# Set to "true" to enable pprof server (automatically enabled in development mode) +export PPROF_ENABLED="${PPROF_ENABLED:-true}" +# TCP port for pprof HTTP server (default: 6060) +export PPROF_PORT="${PPROF_PORT:-6060}" +# Host address for pprof (restricted to localhost for security) +export PPROF_HOST="${PPROF_HOST:-127.0.0.1}" . secrets/env From 932a647873cb44fe048790b4c2225894bd8beb57 Mon Sep 17 00:00:00 2001 From: Alessandro Affinito Date: Mon, 29 Dec 2025 10:17:33 +0100 Subject: [PATCH 3/6] putting dockerfiles out of the pprof workflow --- Dockerfile.ci-rp | 4 ---- Dockerfile.dev-env | 10 ++++------ 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/Dockerfile.ci-rp b/Dockerfile.ci-rp index acf3c2aa76a..be881915cff 100644 --- a/Dockerfile.ci-rp +++ b/Dockerfile.ci-rp @@ -59,10 +59,6 @@ RUN go build -ldflags "-X github.com/Azure/ARO-RP/pkg/util/version.GitCommit=${A RUN go test ./test/e2e/... -tags e2e,codec.safe -c -ldflags "-X github.com/Azure/ARO-RP/pkg/util/version.GitCommit=${ARO_VERSION}" -o e2e.test -# Additional tests with pprof enabled -# PPROF_ENABLED=true enables profiling during test runs -ENV PPROF_ENABLED=true -ENV PPROF_PORT=6060 RUN gotestsum --format pkgname --junitfile report.xml -- -coverprofile=cover.out ./... 
\ && gocov convert cover.out | gocov-xml > coverage.xml diff --git a/Dockerfile.dev-env b/Dockerfile.dev-env index 110088c255f..e6490f1fda1 100644 --- a/Dockerfile.dev-env +++ b/Dockerfile.dev-env @@ -1,6 +1,5 @@ -# Use Fedora as base image since the project documentation mentions Fedora/RHEL dependencies -FROM arointsvc.azurecr.io/fedora:42 - +ARG FEDORA_REGISTRY +FROM ${FEDORA_REGISTRY}/fedora:42 # Install system dependencies RUN dnf update -y && dnf install -y \ gpgme-devel \ @@ -52,10 +51,9 @@ ENV PATH="/usr/local/go/bin:${PATH}" # Install bingo and Go tools RUN /usr/local/go/bin/go install github.com/bwplotka/bingo@latest COPY .bingo/ /workspace/.bingo/ +WORKDIR /workspace RUN export PATH="/usr/local/go/bin:/root/go/bin:$PATH" && \ - cd /workspace && \ /root/go/bin/bingo get ENV PATH="/workspace/.bingo/bin:${PATH}" -# Set up working directory -WORKDIR /workspace +USER ${USERID} \ No newline at end of file From 99933900848fad028e8558dac837d3928db2b11b Mon Sep 17 00:00:00 2001 From: Alessandro Affinito Date: Mon, 29 Dec 2025 10:18:15 +0100 Subject: [PATCH 4/6] WIP: automate all endpoints profiling leveraging the swagger --- Makefile | 101 ++++----- docs/pprof-analysis-guide.md | 332 ++++++++++++++++++++++++++++++ hack/pprof-analyze.sh | 206 +++++++++++++++++++ hack/pprof-profile-endpoint.sh | 363 +++++++++++++++++++++++++++++++++ 4 files changed, 952 insertions(+), 50 deletions(-) create mode 100644 docs/pprof-analysis-guide.md create mode 100644 hack/pprof-analyze.sh create mode 100755 hack/pprof-profile-endpoint.sh diff --git a/Makefile b/Makefile index fbbc26d9032..6ed8cc395c4 100644 --- a/Makefile +++ b/Makefile @@ -542,8 +542,8 @@ PPROF_OUTPUT_DIR ?= ./pprof-data PPROF_DURATION ?= 30s # Load test configuration -LOADTEST_URL ?= https://localhost:8443/healthz/ready -LOADTEST_DURATION ?= 30s +LOADTEST_BASE_URL ?= https://localhost:8443 +LOADTEST_DURATION ?= 20s LOADTEST_RATE ?= 100 .PHONY: pprof-check @@ -554,7 +554,7 @@ pprof-check: ## Check if pprof server 
is running (echo "✗ pprof server is not running. Start it with: make runlocal-rp" && exit 1) .PHONY: pprof-collect-all -pprof-collect-all: ## Collect all pprof profiles (CPU, heap, goroutine, etc.) +pprof-collect-all: ## Collect all pprof profile types (CPU, heap, goroutine, etc.) from the running server. Does NOT profile endpoints under load - use pprof-profile-endpoint for that. @mkdir -p $(PPROF_OUTPUT_DIR) @echo "Collecting pprof profiles from $(PPROF_URL)..." @echo "Output directory: $(PPROF_OUTPUT_DIR)" @@ -637,56 +637,57 @@ pprof-trace: ## Collect and open execution trace in browser @echo "Opening trace viewer..." go tool trace $(PPROF_OUTPUT_DIR)/trace.out -.PHONY: loadtest-hey -loadtest-hey: ## Run load test using hey (install: go install github.com/rakyll/hey@latest) - @command -v hey >/dev/null 2>&1 || { echo "hey not found. Install with: go install github.com/rakyll/hey@latest"; exit 1; } - @echo "Running load test with hey..." - @echo " URL: $(LOADTEST_URL)" - @echo " Duration: $(LOADTEST_DURATION)" - @echo " Rate: $(LOADTEST_RATE) req/s" - hey -z $(LOADTEST_DURATION) -q $(LOADTEST_RATE) -disable-keepalive $(LOADTEST_URL) - .PHONY: loadtest-vegeta -loadtest-vegeta: ## Run load test using vegeta (install: go install github.com/tsenart/vegeta@latest) +loadtest-vegeta: ## Run load test using vegeta. Usage: make loadtest-vegeta ENDPOINT=/api/v1/clusters DURATION=20s RATE=100 + @if [ -z "$(ENDPOINT)" ]; then \ + echo "Error: ENDPOINT is required. Example: make loadtest-vegeta ENDPOINT=/api/v1/clusters"; \ + exit 1; \ + fi @command -v vegeta >/dev/null 2>&1 || { echo "vegeta not found. Install with: go install github.com/tsenart/vegeta@latest"; exit 1; } - @mkdir -p $(PPROF_OUTPUT_DIR) - @echo "Running load test with vegeta..." 
- @echo " URL: $(LOADTEST_URL)" - @echo " Duration: $(LOADTEST_DURATION)" - @echo " Rate: $(LOADTEST_RATE) req/s" - @echo "GET $(LOADTEST_URL)" | vegeta attack -duration=$(LOADTEST_DURATION) -rate=$(LOADTEST_RATE) -insecure | \ - tee $(PPROF_OUTPUT_DIR)/vegeta-results.bin | vegeta report - @echo "" - @echo "Results saved to $(PPROF_OUTPUT_DIR)/vegeta-results.bin" - @echo "Generate HTML report: vegeta report -type=html $(PPROF_OUTPUT_DIR)/vegeta-results.bin > $(PPROF_OUTPUT_DIR)/vegeta-report.html" - -.PHONY: pprof-loadtest -pprof-loadtest: ## Run load test and collect pprof profiles simultaneously - @command -v hey >/dev/null 2>&1 || { echo "hey not found. Install with: go install github.com/rakyll/hey@latest"; exit 1; } - @mkdir -p $(PPROF_OUTPUT_DIR) - @echo "Starting load test and profile collection..." - @echo "" - @echo "Step 1: Starting CPU profile collection in background ($(PPROF_DURATION))..." - @(curl -s "$(PPROF_URL)/debug/pprof/profile?seconds=$$(echo $(PPROF_DURATION) | sed 's/s//')" -o $(PPROF_OUTPUT_DIR)/loadtest-cpu.prof && \ - echo "CPU profile saved to $(PPROF_OUTPUT_DIR)/loadtest-cpu.prof") & - @sleep 2 - @echo "Step 2: Running load test..." 
- hey -z $(LOADTEST_DURATION) -q $(LOADTEST_RATE) -disable-keepalive $(LOADTEST_URL) || true + @ENDPOINT_NAME=$$(echo "$(ENDPOINT)" | sed 's|^/||' | sed 's|/|-|g' | sed 's|[^a-zA-Z0-9-]|_|g' | tr '[:upper:]' '[:lower:]'); \ + if [ -z "$$ENDPOINT_NAME" ]; then ENDPOINT_NAME="endpoint"; fi; \ + TEST_URL="$(LOADTEST_BASE_URL)$(ENDPOINT)"; \ + if [ -n "$(DURATION)" ]; then TEST_DURATION="$(DURATION)"; else TEST_DURATION="$(LOADTEST_DURATION)"; fi; \ + if [ -n "$(RATE)" ]; then TEST_RATE="$(RATE)"; else TEST_RATE="$(LOADTEST_RATE)"; fi; \ + mkdir -p $(PPROF_OUTPUT_DIR); \ + echo "Running load test with vegeta..."; \ + echo " URL: $$TEST_URL"; \ + echo " Duration: $$TEST_DURATION"; \ + echo " Rate: $$TEST_RATE req/s"; \ + echo "GET $$TEST_URL" | vegeta attack -duration=$$TEST_DURATION -rate=$$TEST_RATE -insecure | \ + tee $(PPROF_OUTPUT_DIR)/$$ENDPOINT_NAME-vegeta.bin | vegeta report; \ + echo ""; \ + echo "Results saved to $(PPROF_OUTPUT_DIR)/$$ENDPOINT_NAME-vegeta.bin"; \ + echo "Generate HTML report: vegeta report -type=html $(PPROF_OUTPUT_DIR)/$$ENDPOINT_NAME-vegeta.bin > $(PPROF_OUTPUT_DIR)/$$ENDPOINT_NAME-vegeta.html" + +.PHONY: pprof-profile-endpoint +pprof-profile-endpoint: ## Profile app under load for specific endpoint(s). Usage: make pprof-profile-endpoint ENDPOINT=/api/v1/clusters DURATION=20s RATE=100. Use ENDPOINT=all to profile ALL endpoints from swagger (runs load test + collects profiles for each endpoint individually). + @echo "Note: Ensure the RP is running with pprof enabled:" + @echo " Terminal 1: make runlocal-rp" + @echo " (pprof is enabled by default in development mode)" @echo "" - @echo "Step 3: Collecting heap profile..." - @curl -s "$(PPROF_URL)/debug/pprof/heap" -o $(PPROF_OUTPUT_DIR)/loadtest-heap.prof - @echo "Heap profile saved to $(PPROF_OUTPUT_DIR)/loadtest-heap.prof" - @echo "" - @echo "Step 4: Collecting goroutine profile..." 
- @curl -s "$(PPROF_URL)/debug/pprof/goroutine" -o $(PPROF_OUTPUT_DIR)/loadtest-goroutine.prof - @echo "Goroutine profile saved to $(PPROF_OUTPUT_DIR)/loadtest-goroutine.prof" - @echo "" - @echo "Load test and profiling complete!" - @echo "" - @echo "View profiles with:" - @echo " go tool pprof -http=:8888 $(PPROF_OUTPUT_DIR)/loadtest-cpu.prof" - @echo " go tool pprof -http=:8888 $(PPROF_OUTPUT_DIR)/loadtest-heap.prof" - @echo " go tool pprof -http=:8888 $(PPROF_OUTPUT_DIR)/loadtest-goroutine.prof" + @if [ -z "$(ENDPOINT)" ]; then \ + echo "Error: ENDPOINT is required. Example: make pprof-profile-endpoint ENDPOINT=/api/v1/clusters"; \ + echo " Or use ENDPOINT=all to profile all endpoints from swagger"; \ + exit 1; \ + fi + @TEST_DURATION="$(DURATION)"; \ + if [ -z "$$TEST_DURATION" ]; then TEST_DURATION="$(LOADTEST_DURATION)"; fi; \ + TEST_RATE="$(RATE)"; \ + if [ -z "$$TEST_RATE" ]; then TEST_RATE="$(LOADTEST_RATE)"; fi; \ + ENDPOINT="$(ENDPOINT)" DURATION="$$TEST_DURATION" RATE="$$TEST_RATE" hack/pprof-profile-endpoint.sh + +.PHONY: pprof-analyze +pprof-analyze: ## Analyze a pprof profile and generate improvement suggestions.
Usage: make pprof-analyze PROFILE=pprof-data/endpoint-cpu.prof + @if [ -z "$(PROFILE)" ]; then \ + echo "Error: PROFILE is required"; \ + echo "Usage: make pprof-analyze PROFILE=pprof-data/providers-microsoft-redhatopenshift-operations-cpu.prof"; \ + echo ""; \ + echo "Available profiles:"; \ + ls -1 $(PPROF_OUTPUT_DIR)/*.prof 2>/dev/null | sed 's|^| |' | grep . || echo " No profiles found in $(PPROF_OUTPUT_DIR)"; \ + exit 1; \ + fi + @bash hack/pprof-analyze.sh "$(PROFILE)" .PHONY: pprof-clean pprof-clean: ## Clean up pprof output directory diff --git a/docs/pprof-analysis-guide.md b/docs/pprof-analysis-guide.md new file mode 100644 index 00000000000..c3f909316a7 --- /dev/null +++ b/docs/pprof-analysis-guide.md @@ -0,0 +1,332 @@ +# pprof Profile Analysis Guide + +## Overview + +This guide explains how to analyze the collected pprof profiles and create feature improvement requests based on the findings. +
 +## Profile Types and What They Tell You + +### CPU Profile (`*-cpu.prof`) +- **What it shows**: Where the program spends CPU time during execution +- **Key metrics**: + - `flat`: Time spent in the function itself + - `cum`: Cumulative time (function + its callees) +- **Use cases**: Identify CPU bottlenecks, hot paths, inefficient algorithms + +### Heap Profile (`*-heap.prof`) +- **What it shows**: Memory allocations currently in use +- **Key metrics**: + - `inuse_space`: Bytes of memory currently allocated + - `inuse_objects`: Number of objects currently allocated +- **Use cases**: Memory leaks, excessive allocations, large objects + +### Allocs Profile (`*-allocs.prof`) +- **What it shows**: Total memory allocations since program start +- **Key metrics**: + - `alloc_space`: Total bytes allocated (including freed) + - `alloc_objects`: Total number of objects allocated +- **Use cases**: Allocation hotspots, frequent allocations, GC pressure + +### Goroutine Profile (`*-goroutine.prof`) +- **What it shows**: Stack traces of all goroutines +- **Key metrics**: Number of
goroutines, their states (running, waiting, etc.) +- **Use cases**: Goroutine leaks, excessive concurrency, deadlocks + +### Block Profile (`*-block.prof`) +- **What it shows**: Time spent blocked on synchronization primitives +- **Key metrics**: Time blocked on mutexes, channels, etc. +- **Use cases**: Lock contention, blocking operations + +### Mutex Profile (`*-mutex.prof`) +- **What it shows**: Contention on mutexes +- **Key metrics**: Time other goroutines waited for locks +- **Use cases**: Lock contention, performance bottlenecks from locking + +## Analyzing a Profile + +### Expected Endpoint errors + The 400/404 responses are expected validation errors from the server, not script issues: + 400 for /openShiftVersions/{openShiftVersion}: Version "4.14.0" is not in the enabled versions cache. The server validates that the version exists before returning it. + + 400 for /platformWorkloadIdentityRoleSets/{openShiftMinorVersion}: Similar validation - the minor version "4.14" is not in the cache. + + 404 for GET /openShiftClusters/{resourceName}: Expected - the test cluster "test-cluster" doesn't exist in the database. + + 400 for PATCH/PUT /openShiftClusters/{resourceName}: Expected - these endpoints require a request body with cluster configuration. + + 400 for POST /listCredentials//listAdminCredentials: Expected - InvalidSubscriptionState means the test subscription is not registered in the environment. + + These errors indicate the server is processing requests and returning appropriate validation responses. Profiling still captures server behavior under load, which is the goal. + The script is working correctly - it's making the requests with the proper HTTP methods, and the server is responding with validation errors as expected for test data. 
+ + +### Step 1: View the Profile in Browser + +```bash +# For CPU/Heap/Allocs profiles +go tool pprof -http=:8888 pprof-data/providers-microsoft-redhatopenshift-operations-cpu.prof + +# For goroutine profile +go tool pprof -http=:8888 pprof-data/providers-microsoft-redhatopenshift-operations-goroutine.prof + +# For execution trace +go tool trace pprof-data/providers-microsoft-redhatopenshift-operations-trace.out +``` + +### Step 2: Command-Line Analysis + +```bash +# Top functions by CPU time +go tool pprof -top -cum pprof-data/providers-microsoft-redhatopenshift-operations-cpu.prof + +# Top memory allocations +go tool pprof -top -cum -alloc_space pprof-data/providers-microsoft-redhatopenshift-operations-allocs.prof + +# List all goroutines +go tool pprof -top pprof-data/providers-microsoft-redhatopenshift-operations-goroutine.prof + +# Show call graph +go tool pprof -web pprof-data/providers-microsoft-redhatopenshift-operations-cpu.prof +``` + +### Step 3: Compare Profiles + +Compare the same endpoint across different load conditions: + +```bash +# Compare two CPU profiles +go tool pprof -base=pprof-data/endpoint1-cpu.prof pprof-data/endpoint2-cpu.prof +``` + +## Example Analysis: Operations Endpoint + +### CPU Profile Analysis + +```bash +go tool pprof -top -cum pprof-data/providers-microsoft-redhatopenshift-operations-cpu.prof +``` + +**What to look for:** +- Functions with high `cum` values: These are hot paths +- Functions with high `flat` values: These are doing actual work (not just calling other functions) +- Unexpected functions in the top: May indicate inefficiencies + +**Example findings:** +``` + flat flat% sum% cum cum% + 120ms 45.0% 45.0% 120ms 45.0% runtime.mallocgc + 80ms 30.0% 75.0% 200ms 75.0% encoding/json.Marshal + 30ms 11.3% 86.3% 230ms 86.3% github.com/Azure/ARO-RP/pkg/frontend.getOperations +``` + +**Interpretation:** +- `runtime.mallocgc`: High memory allocation overhead (45% of CPU time) +- `encoding/json.Marshal`: JSON serialization is 
expensive (30% of CPU time) +- Consider: Caching, pooling, or optimizing JSON marshaling + +### Heap Profile Analysis + +```bash +go tool pprof -top -cum -inuse_space pprof-data/providers-microsoft-redhatopenshift-operations-heap.prof +``` + +**What to look for:** +- Large `inuse_space`: Memory currently allocated +- Functions allocating many objects: May indicate inefficient patterns + +**Example findings:** +``` + flat flat% sum% cum cum% + 2.50MB 50.0% 50.0% 2.50MB 50.0% encoding/json.Marshal + 1.00MB 20.0% 70.0% 1.00MB 20.0% bytes.(*Buffer).grow +``` + +**Interpretation:** +- JSON marshaling allocates significant memory +- Buffer growth suggests dynamic allocation +- Consider: Pre-allocating buffers, using object pools + +### Goroutine Profile Analysis + +```bash +go tool pprof -top pprof-data/providers-microsoft-redhatopenshift-operations-goroutine.prof +``` + +**What to look for:** +- Total number of goroutines: Should be reasonable for the load +- Goroutines stuck in certain states: May indicate leaks or deadlocks +- Stack traces showing waiting: May indicate blocking operations + +**Example findings:** +``` +1000: github.com/Azure/ARO-RP/pkg/frontend.getOperations + github.com/Azure/ARO-RP/pkg/database.OpenShiftClusters.ListByQuery + github.com/Azure/ARO-RP/pkg/database.(*openShiftClusters).listByQuery + database/sql.(*DB).QueryContext +``` + +**Interpretation:** +- Many goroutines waiting on database queries +- May indicate: Database connection pool exhaustion, slow queries, or lack of query timeout + +## Creating Feature Improvement Requests + +### 1. Identify the Problem + +From the profile analysis, identify: +- **Performance bottlenecks**: High CPU usage, slow operations +- **Memory issues**: Excessive allocations, potential leaks +- **Concurrency issues**: Too many goroutines, lock contention +- **Inefficiencies**: Repeated work, unnecessary allocations + +### 2. 
Quantify the Impact + +Document: +- **Current performance**: Response time, throughput, memory usage +- **Bottleneck location**: Specific function, package, or operation +- **Scale**: How does it behave under different loads? + +### 3. Create Improvement Request Template + +```markdown +## Performance Improvement: [Endpoint Name] + +### Problem Statement +[Describe the performance issue identified from profiling] + +### Current Behavior +- **Endpoint**: `/providers/Microsoft.RedHatOpenShift/operations` +- **Load**: 10 req/s for 10s +- **CPU Profile**: [Key finding] +- **Heap Profile**: [Key finding] +- **Goroutine Profile**: [Key finding] + +### Profile Analysis + +#### CPU Profile +``` +[Top 5 functions by CPU time] +``` + +**Interpretation**: [What this tells us] + +#### Heap Profile +``` +[Top 5 allocations by size] +``` + +**Interpretation**: [What this tells us] + +#### Goroutine Profile +``` +[Number of goroutines and key stack traces] +``` + +**Interpretation**: [What this tells us] + +### Proposed Solution +[Describe the improvement] + +### Expected Impact +- **Performance**: [Expected improvement] +- **Memory**: [Expected improvement] +- **Scalability**: [Expected improvement] + +### Implementation Notes +[Any technical considerations] + +### Priority +[High/Medium/Low based on impact] +``` + +### 4. 
Common Improvement Patterns + +#### Pattern 1: Reduce Allocations +**Finding**: High `alloc_space` in profiles +**Solution**: +- Use object pools for frequently allocated objects +- Pre-allocate slices/maps with known capacity +- Reuse buffers instead of creating new ones + +#### Pattern 2: Optimize Hot Paths +**Finding**: High CPU time in specific functions +**Solution**: +- Cache expensive computations +- Optimize algorithms (e.g., use maps instead of linear search) +- Reduce function call overhead + +#### Pattern 3: Reduce Lock Contention +**Finding**: High time in `*-block.prof` or `*-mutex.prof` +**Solution**: +- Use read-write locks where appropriate +- Reduce lock scope +- Use lock-free data structures where possible +- Shard locks for better concurrency + +#### Pattern 4: Optimize Database Queries +**Finding**: Many goroutines waiting on database +**Solution**: +- Add query timeouts +- Optimize slow queries +- Increase connection pool size +- Use connection pooling effectively + +#### Pattern 5: Reduce JSON Marshaling Overhead +**Finding**: High CPU/memory in `encoding/json.Marshal` +**Solution**: +- Cache marshaled responses where possible +- Use streaming JSON encoding for large responses +- Consider faster JSON libraries (e.g., `jsoniter`) +- Pre-allocate buffers + +## Automated Analysis Script + +Create a script to generate analysis reports: + +```bash +#!/bin/bash +# analyze-profile.sh - Generate analysis report for a profile + +PROFILE=$1 +ENDPOINT_NAME=$(basename "$PROFILE" .prof) + +echo "# Analysis Report: $ENDPOINT_NAME" +echo "" +echo "## CPU Profile Analysis" +go tool pprof -top -cum "$PROFILE" 2>&1 | head -20 +echo "" +echo "## Memory Analysis" +go tool pprof -top -cum -inuse_space "$PROFILE" 2>&1 | head -20 +``` + +## Next Steps + +1. **Run the analysis** on key endpoints (operations, cluster CRUD, etc.) +2. **Compare profiles** across different load levels +3. **Identify patterns** that appear across multiple endpoints +4. 
**Prioritize improvements** based on impact and effort +5. **Create tickets** using the template above +6. **Track improvements** by re-profiling after changes + +## Tools and Commands Reference + +```bash +# Interactive web UI +go tool pprof -http=:8888 PROFILE_FILE + +# Text output +go tool pprof -top PROFILE_FILE +go tool pprof -top -cum PROFILE_FILE +go tool pprof -list=FUNCTION_REGEX PROFILE_FILE + +# Compare profiles +go tool pprof -base=BASELINE_PROFILE PROFILE_FILE + +# Generate reports +go tool pprof -text PROFILE_FILE > report.txt +go tool pprof -svg PROFILE_FILE > report.svg + +# Execution trace +go tool trace TRACE_FILE +``` + diff --git a/hack/pprof-analyze.sh b/hack/pprof-analyze.sh new file mode 100755 index 00000000000..49ca637a93d --- /dev/null +++ b/hack/pprof-analyze.sh @@ -0,0 +1,206 @@ +#!/bin/bash +# Copyright (c) Microsoft Corporation. +# Licensed under the Apache License 2.0. +# +# Analyze pprof profiles and generate improvement suggestions. +# Usage: +# ./hack/pprof-analyze.sh PROFILE_FILE +# ./hack/pprof-analyze.sh pprof-data/providers-microsoft-redhatopenshift-operations-cpu.prof + +set -euo pipefail + +PROFILE="${1:-}" +PPROF_OUTPUT_DIR="${PPROF_OUTPUT_DIR:-./pprof-data}" + +if [ -z "$PROFILE" ]; then + echo "Usage: $0 PROFILE_FILE" + echo "" + echo "Available profiles:" + ls -1 "$PPROF_OUTPUT_DIR"/*.prof 2>/dev/null | sed 's|.*/||' | sed 's|^| |' || echo " No profiles found in $PPROF_OUTPUT_DIR" + exit 1 +fi + +if [ ! 
-f "$PROFILE" ]; then + echo "Error: Profile file not found: $PROFILE" + exit 1 +fi + +ENDPOINT_NAME=$(basename "$PROFILE" .prof) +REPORT_FILE="${PPROF_OUTPUT_DIR}/${ENDPOINT_NAME}-analysis.md" + +echo "Analyzing profile: $PROFILE" +echo "Generating report: $REPORT_FILE" +echo "" + +{ + echo "# Performance Analysis: $ENDPOINT_NAME" + echo "" + echo "Generated: $(date)" + echo "Profile: $PROFILE" + echo "" + echo "---" + echo "" + + # Determine profile type + if [[ "$PROFILE" == *"-cpu.prof" ]]; then + echo "## Profile Type: CPU" + echo "" + echo "### Top Functions by CPU Time (Cumulative)" + echo '```' + go tool pprof -top -cum "$PROFILE" 2>&1 | head -30 + echo '```' + echo "" + echo "### Top Functions by CPU Time (Flat)" + echo '```' + go tool pprof -top "$PROFILE" 2>&1 | head -30 + echo '```' + echo "" + echo "### Key Insights" + echo "" + echo "**Hot Paths**: Functions with high cumulative time are in the critical path" + echo "**Bottlenecks**: Functions with high flat time are doing actual work" + echo "" + + elif [[ "$PROFILE" == *"-heap.prof" ]]; then + echo "## Profile Type: Heap (In-Use Memory)" + echo "" + echo "### Top Allocations by Size (In-Use Space)" + echo '```' + go tool pprof -top -cum -inuse_space "$PROFILE" 2>&1 | head -30 + echo '```' + echo "" + echo "### Top Allocations by Count (In-Use Objects)" + echo '```' + go tool pprof -top -cum -inuse_objects "$PROFILE" 2>&1 | head -30 + echo '```' + echo "" + echo "### Key Insights" + echo "" + echo "**Memory Usage**: Shows currently allocated memory" + echo "**Potential Leaks**: Look for unexpected allocations that persist" + echo "" + + elif [[ "$PROFILE" == *"-allocs.prof" ]]; then + echo "## Profile Type: Allocations (Total Since Start)" + echo "" + echo "### Top Allocations by Total Space" + echo '```' + go tool pprof -top -cum -alloc_space "$PROFILE" 2>&1 | head -30 + echo '```' + echo "" + echo "### Top Allocations by Total Count" + echo '```' + go tool pprof -top -cum -alloc_objects "$PROFILE" 
2>&1 | head -30 + echo '```' + echo "" + echo "### Key Insights" + echo "" + echo "**Allocation Hotspots**: Functions that allocate frequently" + echo "**GC Pressure**: High allocation rates can cause GC pauses" + echo "" + + elif [[ "$PROFILE" == *"-goroutine.prof" ]]; then + echo "## Profile Type: Goroutines" + echo "" + echo "### Goroutine Count" + echo '```' + go tool pprof -top "$PROFILE" 2>&1 | head -5 + echo '```' + echo "" + echo "### Top Goroutine Stack Traces" + echo '```' + go tool pprof -top "$PROFILE" 2>&1 | head -50 + echo '```' + echo "" + echo "### Key Insights" + echo "" + echo "**Goroutine Leaks**: Unusually high goroutine counts" + echo "**Blocking Operations**: Goroutines stuck in waiting states" + echo "" + + elif [[ "$PROFILE" == *"-block.prof" ]]; then + echo "## Profile Type: Block (Synchronization Blocking)" + echo "" + echo "### Top Blocking Operations" + echo '```' + go tool pprof -top -cum "$PROFILE" 2>&1 | head -30 + echo '```' + echo "" + echo "### Key Insights" + echo "" + echo "**Lock Contention**: Time spent waiting on locks" + echo "**Channel Blocking**: Time spent waiting on channel operations" + echo "" + + elif [[ "$PROFILE" == *"-mutex.prof" ]]; then + echo "## Profile Type: Mutex (Lock Contention)" + echo "" + echo "### Top Mutex Contention" + echo '```' + go tool pprof -top -cum "$PROFILE" 2>&1 | head -30 + echo '```' + echo "" + echo "### Key Insights" + echo "" + echo "**Lock Contention**: Mutexes with high contention" + echo "**Performance Impact**: Time other goroutines waited for locks" + echo "" + else + echo "## Profile Type: Unknown" + echo "" + echo "### Top Functions" + echo '```' + go tool pprof -top -cum "$PROFILE" 2>&1 | head -30 + echo '```' + fi + + echo "" + echo "---" + echo "" + echo "## Improvement Suggestions" + echo "" + echo "### 1. Review Hot Paths" + echo "- Identify functions with highest CPU/memory usage" + echo "- Consider caching, optimization, or algorithm improvements" + echo "" + echo "### 2. 
Check for Allocation Patterns" + echo "- Look for frequent allocations in hot paths" + echo "- Consider object pooling or pre-allocation" + echo "" + echo "### 3. Analyze Concurrency" + echo "- Review goroutine counts and states" + echo "- Check for potential leaks or excessive concurrency" + echo "" + echo "### 4. Investigate Blocking" + echo "- Review lock contention and blocking operations" + echo "- Consider lock-free alternatives or reducing lock scope" + echo "" + echo "---" + echo "" + echo "## Next Steps" + echo "" + echo "1. Open interactive view:" + echo " \`\`\`bash" + echo " go tool pprof -http=:8888 $PROFILE" + echo " \`\`\`" + echo "" + echo "2. Compare with other profiles:" + echo " \`\`\`bash" + echo " go tool pprof -base= $PROFILE" + echo " \`\`\`" + echo "" + echo "3. Generate visualizations:" + echo " \`\`\`bash" + echo " go tool pprof -svg $PROFILE > ${ENDPOINT_NAME}-graph.svg" + echo " \`\`\`" + +} > "$REPORT_FILE" + +echo "✓ Analysis complete!" +echo "" +echo "View the report:" +echo " cat $REPORT_FILE" +echo "" +echo "Or open interactive view:" +echo " go tool pprof -http=:8888 $PROFILE" + diff --git a/hack/pprof-profile-endpoint.sh b/hack/pprof-profile-endpoint.sh new file mode 100755 index 00000000000..9871bba79cd --- /dev/null +++ b/hack/pprof-profile-endpoint.sh @@ -0,0 +1,363 @@ +#!/bin/bash +# Copyright (c) Microsoft Corporation. +# Licensed under the Apache License 2.0. +# +# Profile the ARO RP under load for specific endpoints. 
+# Usage (settings are read from environment variables, not from positional arguments): +# ENDPOINT=/api/v1/clusters DURATION=20s RATE=100 ./hack/pprof-profile-endpoint.sh +# ENDPOINT=all DURATION=10s RATE=50 ./hack/pprof-profile-endpoint.sh + +set -euo pipefail + +# Default values (each can be overridden by exporting the variable of the same name) +ENDPOINT="${ENDPOINT:-}" +DURATION="${DURATION:-20s}" +RATE="${RATE:-100}" +PPROF_HOST="${PPROF_HOST:-127.0.0.1}" +PPROF_PORT="${PPROF_PORT:-6060}" +PPROF_URL="http://${PPROF_HOST}:${PPROF_PORT}" +PPROF_OUTPUT_DIR="${PPROF_OUTPUT_DIR:-./pprof-data}" +LOADTEST_BASE_URL="${LOADTEST_BASE_URL:-https://localhost:8443}" + +# Test values for path parameters +TEST_SUBSCRIPTION_ID="${TEST_SUBSCRIPTION_ID:-00000000-0000-0000-0000-000000000000}" +TEST_RESOURCE_GROUP="${TEST_RESOURCE_GROUP:-test-rg}" +TEST_LOCATION="${TEST_LOCATION:-eastus}" +TEST_RESOURCE_NAME="${TEST_RESOURCE_NAME:-test-cluster}" +TEST_OPENSHIFT_VERSION="${TEST_OPENSHIFT_VERSION:-4.14.0}" +TEST_OPENSHIFT_MINOR_VERSION="${TEST_OPENSHIFT_MINOR_VERSION:-4.14}" +TEST_OPERATION_ID="${TEST_OPERATION_ID:-00000000-0000-0000-0000-000000000000}" +TEST_DETECTOR_ID="${TEST_DETECTOR_ID:-test-detector}" +TEST_SYNC_SET_NAME="${TEST_SYNC_SET_NAME:-test-syncset}" +TEST_MANIFEST_ID="${TEST_MANIFEST_ID:-test-manifest}" +TEST_DEPLOYMENT_NAME="${TEST_DEPLOYMENT_NAME:-test-deployment}" + +# API version to use for requests (default to latest stable) +# Valid versions: 2020-04-30, 2022-04-01, 2022-09-04, 2023-04-01, 2023-09-04, 2023-11-22, 2025-07-25 +# Preview versions: 2021-09-01-preview, 2023-07-01-preview, 2024-08-12-preview +# Admin version: admin +TEST_API_VERSION="${TEST_API_VERSION:-2025-07-25}" + +# Find the latest swagger file (SWAGGER_DIR is a relative path, so run this script from the repository root) +SWAGGER_DIR="swagger/redhatopenshift/resource-manager/Microsoft.RedHatOpenShift/openshiftclusters" +LATEST_SWAGGER=$(find "${SWAGGER_DIR}" -name "redhatopenshift.json" -type f | sort -V | tail -1) + +if [ -z "$LATEST_SWAGGER" ]; then + echo "Error: Could not find swagger file in ${SWAGGER_DIR}" + exit 1 +fi + +# Sanitize endpoint name for filesystem +sanitize_endpoint() { + local result + 
result=$(sed 's|^/||; s|/|-|g; s|[^a-zA-Z0-9-]|_|g' <<< "$1") + tr '[:upper:]' '[:lower:]' <<< "$result" | sed 's|__*|_|g' +} + +# Replace path parameters with test values +substitute_path_params() { + local path="$1" + + # Replace each parameter placeholder with its test value + path=$(sed "s|{subscriptionId}|${TEST_SUBSCRIPTION_ID}|g" <<< "$path") + path=$(sed "s|{resourceGroupName}|${TEST_RESOURCE_GROUP}|g" <<< "$path") + path=$(sed "s|{resourceProviderNamespace}|Microsoft.RedHatOpenShift|g" <<< "$path") + path=$(sed "s|{resourceType}|openShiftClusters|g" <<< "$path") + path=$(sed "s|{resourceName}|${TEST_RESOURCE_NAME}|g" <<< "$path") + path=$(sed "s|{location}|${TEST_LOCATION}|g" <<< "$path") + path=$(sed "s|{openShiftVersion}|${TEST_OPENSHIFT_VERSION}|g" <<< "$path") + path=$(sed "s|{openShiftMinorVersion}|${TEST_OPENSHIFT_MINOR_VERSION}|g" <<< "$path") + path=$(sed "s|{operationId}|${TEST_OPERATION_ID}|g" <<< "$path") + path=$(sed "s|{detectorId}|${TEST_DETECTOR_ID}|g" <<< "$path") + path=$(sed "s|{syncsetname}|${TEST_SYNC_SET_NAME}|g" <<< "$path") + path=$(sed "s|{manifestId}|${TEST_MANIFEST_ID}|g" <<< "$path") + path=$(sed "s|{deploymentName}|${TEST_DEPLOYMENT_NAME}|g" <<< "$path") + + # Remove any remaining unmatched {param} patterns and stray } characters + sed 's|{[^}]*}||g; s|}||g' <<< "$path" +} + +# Extract endpoints from swagger file +# Returns: pathmethod (e.g., "/path/to/endpointGET") +extract_endpoints_from_swagger() { + local swagger_file="$1" + + command -v jq >/dev/null 2>&1 || { + echo "Error: jq is required to parse swagger files. 
Install with: brew install jq (macOS) or apt-get install jq (Linux)" + exit 1 + } + + # Extract paths with their HTTP methods (format: pathmethod) + { + jq -r '.paths | to_entries[] | + .key as $path | + .value | + to_entries[] | + select(.key | test("^(get|post|put|patch|delete)$"; "i")) | + "\($path)\t\(.key)"' "$swagger_file" | \ + awk '{print $1 "\t" toupper($2)}' | \ + grep -v "^/admin" || true + printf '/healthz/ready\tGET\n' + } | sort -u +} + +# Profile a single endpoint +# Parameters: endpoint_pathhttp_method (e.g., "/path/to/endpointGET") +# or just endpoint_path (defaults to GET for backward compatibility) +profile_endpoint() { + local endpoint_input="$1" + local endpoint + local http_method + + # Parse endpoint and method (format: pathmethod) + if [[ "$endpoint_input" == *$'\t'* ]]; then + IFS=$'\t' read -r endpoint http_method <<< "$endpoint_input" + # Trim whitespace from method + http_method=$(tr -d '[:space:]' <<< "$http_method") + else + # Backward compatibility: if no method specified, default to GET + endpoint="$endpoint_input" + http_method="GET" + fi + + # Normalize method to uppercase + http_method=$(tr '[:lower:]' '[:upper:]' <<< "$http_method") + + local endpoint_name + endpoint_name=$(sanitize_endpoint "$endpoint") + [ -z "$endpoint_name" ] && endpoint_name="endpoint" + + local substituted_path + substituted_path=$(substitute_path_params "$endpoint") + + # Add api-version query parameter (required by ARM API) + local test_url + if [[ "$substituted_path" == *"?"* ]]; then + test_url="${LOADTEST_BASE_URL}${substituted_path}&api-version=${TEST_API_VERSION}" + else + test_url="${LOADTEST_BASE_URL}${substituted_path}?api-version=${TEST_API_VERSION}" + fi + + local seconds="${DURATION//s/}" + + echo "==========================================" + echo "Profiling endpoint: $endpoint" + echo "HTTP Method: $http_method" + echo "Substituted path: $substituted_path" + echo "API version: $TEST_API_VERSION" + echo "URL: $test_url" + echo "Duration: 
$DURATION" + echo "Rate: $RATE req/s" + echo "Profile prefix: $endpoint_name" + echo "==========================================" + echo "" + + # Check if pprof server is running + if ! curl -s -o /dev/null -w "%{http_code}" "${PPROF_URL}/debug/pprof/" | grep -q "200"; then + echo "Warning: pprof server is not running at ${PPROF_URL}" + echo "Start it with: make runlocal-rp (with PPROF_ENABLED=true)" + echo "" + fi + + # Check if vegeta is available + command -v vegeta >/dev/null 2>&1 || { + echo "Error: vegeta not found. Install with: go install github.com/tsenart/vegeta@latest" + exit 1 + } + + mkdir -p "$PPROF_OUTPUT_DIR" + + echo "Starting vegeta attack in background..." + echo "Note: Most endpoints require authentication (mutual TLS or MISE) and valid resources." + echo " Expected errors (these are normal and indicate the server is processing requests):" + echo " - 400 Bad Request:" + echo " * Resource validation (e.g., OpenShift version not in enabled versions cache)" + echo " * InvalidSubscriptionState (subscription not registered in test environment)" + echo " * Missing request body for PUT/PATCH/POST endpoints" + echo " - 403 Forbidden: Authentication required (mutual TLS or MISE)" + echo " - 404 Not Found: Resource doesn't exist (expected for test data like test-cluster)" + echo " - 405 Method Not Allowed: Wrong HTTP method (should be fixed now)" + echo " Profiling will still capture server behavior under load regardless of response codes." + echo "" + + # Build vegeta target with correct HTTP method + # Format: METHOD URL (vegeta expects this format) + # Pipe directly to vegeta - it reads from stdin + { + printf '%s %s\n' "$http_method" "$test_url" + } | vegeta attack -duration="$DURATION" -rate="$RATE" -insecure > "${PPROF_OUTPUT_DIR}/${endpoint_name}-vegeta.bin" & + local vegeta_pid=$! + echo "Vegeta PID: $vegeta_pid" + sleep 1 + echo "" + + echo "Collecting profiles during load test..." + + # CPU profile + echo " → CPU profile (${seconds} seconds)..." 
+ if curl -s "${PPROF_URL}/debug/pprof/profile?seconds=${seconds}" -o "${PPROF_OUTPUT_DIR}/${endpoint_name}-cpu.prof" 2>/dev/null; then + echo " ✓ CPU: ${PPROF_OUTPUT_DIR}/${endpoint_name}-cpu.prof" + else + echo " ✗ Failed to collect CPU profile" + fi + + # Heap profile + echo " → Heap profile..." + if curl -s "${PPROF_URL}/debug/pprof/heap" -o "${PPROF_OUTPUT_DIR}/${endpoint_name}-heap.prof" 2>/dev/null; then + echo " ✓ Heap: ${PPROF_OUTPUT_DIR}/${endpoint_name}-heap.prof" + else + echo " ✗ Failed to collect heap profile" + fi + + # Allocs profile + echo " → Allocs profile..." + if curl -s "${PPROF_URL}/debug/pprof/allocs" -o "${PPROF_OUTPUT_DIR}/${endpoint_name}-allocs.prof" 2>/dev/null; then + echo " ✓ Allocs: ${PPROF_OUTPUT_DIR}/${endpoint_name}-allocs.prof" + else + echo " ✗ Failed to collect allocs profile" + fi + + # Goroutine profile + echo " → Goroutine profile..." + if curl -s "${PPROF_URL}/debug/pprof/goroutine" -o "${PPROF_OUTPUT_DIR}/${endpoint_name}-goroutine.prof" 2>/dev/null; then + echo " ✓ Goroutine: ${PPROF_OUTPUT_DIR}/${endpoint_name}-goroutine.prof" + else + echo " ✗ Failed to collect goroutine profile" + fi + + # Block profile + echo " → Block profile..." + if curl -s "${PPROF_URL}/debug/pprof/block" -o "${PPROF_OUTPUT_DIR}/${endpoint_name}-block.prof" 2>/dev/null; then + echo " ✓ Block: ${PPROF_OUTPUT_DIR}/${endpoint_name}-block.prof" + else + echo " ✗ Failed to collect block profile" + fi + + # Mutex profile + echo " → Mutex profile..." + if curl -s "${PPROF_URL}/debug/pprof/mutex" -o "${PPROF_OUTPUT_DIR}/${endpoint_name}-mutex.prof" 2>/dev/null; then + echo " ✓ Mutex: ${PPROF_OUTPUT_DIR}/${endpoint_name}-mutex.prof" + else + echo " ✗ Failed to collect mutex profile" + fi + + # Threadcreate profile + echo " → Threadcreate profile..." 
+ if curl -s "${PPROF_URL}/debug/pprof/threadcreate" -o "${PPROF_OUTPUT_DIR}/${endpoint_name}-threadcreate.prof" 2>/dev/null; then + echo " ✓ Threadcreate: ${PPROF_OUTPUT_DIR}/${endpoint_name}-threadcreate.prof" + else + echo " ✗ Failed to collect threadcreate profile" + fi + + # Execution trace + echo " → Execution trace (5s)..." + if curl -s "${PPROF_URL}/debug/pprof/trace?seconds=5" -o "${PPROF_OUTPUT_DIR}/${endpoint_name}-trace.out" 2>/dev/null; then + echo " ✓ Trace: ${PPROF_OUTPUT_DIR}/${endpoint_name}-trace.out" + else + echo " ✗ Failed to collect trace" + fi + + echo "" + echo "Waiting for vegeta to finish..." + wait "$vegeta_pid" 2>/dev/null || true + + echo "" + echo "Vegeta report:" + vegeta report "${PPROF_OUTPUT_DIR}/${endpoint_name}-vegeta.bin" || true + + echo "" + echo "==========================================" + echo "Profile collection complete for: $endpoint" + echo "==========================================" + echo "" +} + +# Main execution +main() { + if [ -z "$ENDPOINT" ]; then + echo "Error: ENDPOINT is required" + echo "Usage: $0 ENDPOINT=/api/v1/clusters [DURATION=20s] [RATE=100]" + echo " $0 ENDPOINT=all [DURATION=10s] [RATE=50]" + exit 1 + fi + + if [ "$ENDPOINT" = "all" ]; then + echo "Extracting endpoints from swagger: $LATEST_SWAGGER" + echo "" + + local endpoints + endpoints=$(extract_endpoints_from_swagger "$LATEST_SWAGGER") + + if [ -z "$endpoints" ]; then + echo "Error: No endpoints found in swagger file" + exit 1 + fi + + local count + count=$(echo "$endpoints" | wc -l | tr -d ' ') + echo "Found $count endpoints to profile" + echo "" + echo "Note: After running 'make runlocal-rp', the RP frontend server is available." 
+ echo " However, many endpoints require:" + echo " - Authentication (mutual TLS or MISE)" + echo " - Existing resources (subscriptions, clusters, etc.)" + echo " - Valid API versions" + echo " - Request bodies for PUT/PATCH/POST endpoints" + echo "" + echo " Expected response codes (these are normal and indicate server processing):" + echo " - 400: Validation errors, missing resources, unregistered subscriptions" + echo " - 403: Authentication required" + echo " - 404: Resources don't exist (expected for test data)" + echo " - 405: Wrong HTTP method (should be fixed)" + echo "" + echo " The profiling will still capture the server's behavior under load." + echo "" + read -p "Continue? (y/N) " -n 1 -r + echo + if [[ ! $REPLY =~ ^[Yy]$ ]]; then + exit 0 + fi + + while IFS=$'\t' read -r endpoint method; do + [ -z "$endpoint" ] && continue + # Trim whitespace from method + method=$(tr -d '[:space:]' <<< "${method:-GET}") + echo "" + echo ">>> Profiling: $endpoint ($method) <<<" + echo "" + profile_endpoint "${endpoint}$(printf '\t')${method}" + echo "" + echo "---" + echo "" + sleep 2 # Small delay between endpoints + done <<< "$endpoints" + + echo "" + echo "==========================================" + echo "All endpoints profiled!" 
+ echo "==========================================" + echo "" + echo "View profiles in: $PPROF_OUTPUT_DIR" + echo "" + echo "Example commands:" + echo " go tool pprof -http=:8888 ${PPROF_OUTPUT_DIR}/-cpu.prof" + echo " go tool pprof -http=:8888 ${PPROF_OUTPUT_DIR}/-heap.prof" + echo " go tool trace ${PPROF_OUTPUT_DIR}/-trace.out" + else + profile_endpoint "$ENDPOINT" + + local endpoint_name + endpoint_name=$(sanitize_endpoint "$ENDPOINT") + echo "View profiles:" + echo " go tool pprof -http=:8888 ${PPROF_OUTPUT_DIR}/${endpoint_name}-cpu.prof" + echo " go tool pprof -http=:8888 ${PPROF_OUTPUT_DIR}/${endpoint_name}-heap.prof" + echo " go tool pprof -http=:8888 ${PPROF_OUTPUT_DIR}/${endpoint_name}-goroutine.prof" + echo " go tool trace ${PPROF_OUTPUT_DIR}/${endpoint_name}-trace.out" + echo "" + echo "Generate vegeta HTML report:" + echo " vegeta report -type=html ${PPROF_OUTPUT_DIR}/${endpoint_name}-vegeta.bin > ${PPROF_OUTPUT_DIR}/${endpoint_name}-vegeta.html" + fi +} + +main "$@" + From 9bdba149ce91689d1bff333529b1832fcadc5d90 Mon Sep 17 00:00:00 2001 From: Alessandro Affinito Date: Mon, 29 Dec 2025 17:24:45 +0100 Subject: [PATCH 5/6] Enhance pprof analysis guide with custom endpoint profiling instructions --- docs/pprof-analysis-guide.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/pprof-analysis-guide.md b/docs/pprof-analysis-guide.md index c3f909316a7..f8ed6c0a1cb 100644 --- a/docs/pprof-analysis-guide.md +++ b/docs/pprof-analysis-guide.md @@ -45,6 +45,8 @@ This guide explains how to analyze the collected pprof profiles and create featu ## Analyzing a Profile ### Expected Endpoint errors +> For profiling custom data flows, configure your resources, then use `make pprof-profile-endpoint ENDPOINT=/api/v1/yourEndpoint` + The 400/404 responses are expected validation errors from the server, not script issues: 400 for /openShiftVersions/{openShiftVersion}: Version "4.14.0" is not in the enabled versions cache. 
The server validates that the version exists before returning it. @@ -57,7 +59,6 @@ This guide explains how to analyze the collected pprof profiles and create featu 400 for POST /listCredentials//listAdminCredentials: Expected - InvalidSubscriptionState means the test subscription is not registered in the environment. These errors indicate the server is processing requests and returning appropriate validation responses. Profiling still captures server behavior under load, which is the goal. - The script is working correctly - it's making the requests with the proper HTTP methods, and the server is responding with validation errors as expected for test data. ### Step 1: View the Profile in Browser From 068868a5b3831b46fe77892f441365bca53ac5eb Mon Sep 17 00:00:00 2001 From: Alessandro Affinito Date: Mon, 29 Dec 2025 17:24:54 +0100 Subject: [PATCH 6/6] Refactor endpoint name sanitization in pprof profile script - Updated the endpoint name sanitization to use the substituted path instead of the original endpoint. - Ensured that the endpoint name defaults to "endpoint" if sanitization results in an empty value. 
--- hack/pprof-profile-endpoint.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/hack/pprof-profile-endpoint.sh b/hack/pprof-profile-endpoint.sh index 9871bba79cd..426613d82f4 100755 --- a/hack/pprof-profile-endpoint.sh +++ b/hack/pprof-profile-endpoint.sh @@ -123,13 +123,14 @@ profile_endpoint() { # Normalize method to uppercase http_method=$(tr '[:lower:]' '[:upper:]' <<< "$http_method") - local endpoint_name - endpoint_name=$(sanitize_endpoint "$endpoint") - [ -z "$endpoint_name" ] && endpoint_name="endpoint" - local substituted_path substituted_path=$(substitute_path_params "$endpoint") + # Sanitize the substituted path for use as filename (not the original with placeholders) + local endpoint_name + endpoint_name=$(sanitize_endpoint "$substituted_path") + [ -z "$endpoint_name" ] && endpoint_name="endpoint" + # Add api-version query parameter (required by ARM API) local test_url if [[ "$substituted_path" == *"?"* ]]; then