From e9036eec7f197cfd6b874e29f924f54a9867e091 Mon Sep 17 00:00:00 2001 From: Allen Zhong Date: Fri, 10 Oct 2025 16:51:10 +0900 Subject: [PATCH 1/2] cluster: support retention by size for prometheus --- embed/templates/scripts/run_prometheus.sh.tpl | 7 ++- pkg/cluster/spec/monitoring.go | 24 +++++++- pkg/cluster/spec/monitoring_test.go | 56 +++++++++++++++---- pkg/cluster/template/scripts/monitoring.go | 3 +- .../template/scripts/monitoring_test.go | 3 +- 5 files changed, 77 insertions(+), 16 deletions(-) diff --git a/embed/templates/scripts/run_prometheus.sh.tpl b/embed/templates/scripts/run_prometheus.sh.tpl index b9d65b516b..3cc9cc591e 100644 --- a/embed/templates/scripts/run_prometheus.sh.tpl +++ b/embed/templates/scripts/run_prometheus.sh.tpl @@ -55,5 +55,10 @@ exec bin/prometheus/prometheus \ {{- end}} {{- end}} {{- if not .EnablePromAgentMode}} - --storage.tsdb.retention="{{.Retention}}" +{{- if .RetentionSize}} + --storage.tsdb.retention.size="{{.RetentionSize}}" +{{- end}} +{{- if .RetentionTime}} + --storage.tsdb.retention.time="{{.RetentionTime}}" +{{- end}} {{- end}} diff --git a/pkg/cluster/spec/monitoring.go b/pkg/cluster/spec/monitoring.go index e295dfa1e3..8a81e11c16 100644 --- a/pkg/cluster/spec/monitoring.go +++ b/pkg/cluster/spec/monitoring.go @@ -54,7 +54,9 @@ type PrometheusSpec struct { RemoteConfig Remote `yaml:"remote_config,omitempty" validate:"remote_config:ignore"` ExternalAlertmanagers []ExternalAlertmanager `yaml:"external_alertmanagers" validate:"external_alertmanagers:ignore"` PushgatewayAddrs []string `yaml:"pushgateway_addrs,omitempty" validate:"pushgateway_addrs:ignore"` - Retention string `yaml:"storage_retention,omitempty" validate:"storage_retention:editable"` + Retention string `yaml:"storage_retention,omitempty" validate:"storage_retention:editable"` // deprecated + RetentionSize string `yaml:"storage_retention_size,omitempty" validate:"storage_retention_size:editable"` + RetentionTime string 
`yaml:"storage_retention_time,omitempty" validate:"storage_retention_time:editable"` ResourceControl meta.ResourceControl `yaml:"resource_control,omitempty" validate:"resource_control:editable"` Arch string `yaml:"arch,omitempty"` OS string `yaml:"os,omitempty"` @@ -270,7 +272,6 @@ func (i *MonitorInstance) InitConfig( cfg := &scripts.PrometheusScript{ Port: spec.Port, WebExternalURL: fmt.Sprintf("http://%s", utils.JoinHostPort(spec.Host, spec.Port)), - Retention: getRetention(spec.Retention), EnableNG: spec.NgPort > 0, EnablePromAgentMode: spec.EnablePromAgentMode, // Get from spec directly @@ -282,6 +283,13 @@ func (i *MonitorInstance) InitConfig( AdditionalArgs: spec.AdditionalArgs, } + // Set retention policy + if spec.RetentionTime == "" { + cfg.RetentionTime = getRetentionTime(spec.Retention) + } else { + cfg.RetentionTime = getRetentionTime(spec.RetentionTime) + } + cfg.RetentionSize = getRetentionSize(spec.RetentionSize) // Check if agent mode is enabled in additional arguments if !cfg.EnablePromAgentMode { @@ -675,7 +683,17 @@ func mergeAdditionalScrapeConf(source string, addition map[string]any) error { return utils.WriteFile(source, bytes, 0644) } -func getRetention(retention string) string { +func getRetentionSize(retention string) string { + retention = strings.ToUpper(strings.TrimSpace(retention)) + valid, _ := regexp.MatchString("^[1-9]\\d*(B|KB|MB|GB|TB|PB|EB)$", retention) + if retention == "" || !valid { + return "" + } + return retention +} + +func getRetentionTime(retention string) string { + retention = strings.TrimSpace(retention) valid, _ := regexp.MatchString("^[1-9]\\d*d$", retention) if retention == "" || !valid { return "30d" diff --git a/pkg/cluster/spec/monitoring_test.go b/pkg/cluster/spec/monitoring_test.go index 6d3a33addf..779fe2516f 100644 --- a/pkg/cluster/spec/monitoring_test.go +++ b/pkg/cluster/spec/monitoring_test.go @@ -229,32 +229,68 @@ scrape_configs: func TestGetRetention(t *testing.T) { var val string - val = 
getRetention("-1d") + val = getRetentionTime("-1d") assert.EqualValues(t, "30d", val) - val = getRetention("0d") + val = getRetentionTime("0d") assert.EqualValues(t, "30d", val) - val = getRetention("01d") + val = getRetentionTime("01d") assert.EqualValues(t, "30d", val) - val = getRetention("1dd") + val = getRetentionTime("1dd") assert.EqualValues(t, "30d", val) - val = getRetention("*1d") + val = getRetentionTime("*1d") assert.EqualValues(t, "30d", val) - val = getRetention("1d ") - assert.EqualValues(t, "30d", val) + val = getRetentionTime("1d ") + assert.EqualValues(t, "1d", val) + + val = getRetentionTime(" 1d") + assert.EqualValues(t, "1d", val) - val = getRetention("ddd") + val = getRetentionTime("ddd") assert.EqualValues(t, "30d", val) - val = getRetention("60d") + val = getRetentionTime("60d") assert.EqualValues(t, "60d", val) - val = getRetention("999d") + val = getRetentionTime("999d") assert.EqualValues(t, "999d", val) + + val = getRetentionSize("-1MB") + assert.EqualValues(t, "", val) + + val = getRetentionSize("30d") + assert.EqualValues(t, "", val) + + val = getRetentionSize("1k") + assert.EqualValues(t, "", val) + + val = getRetentionSize("01G") + assert.EqualValues(t, "", val) + + val = getRetentionSize("233mb") + assert.EqualValues(t, "233MB", val) + + val = getRetentionSize("*1GB") + assert.EqualValues(t, "", val) + + val = getRetentionSize("20GB ") + assert.EqualValues(t, "20GB", val) + + val = getRetentionSize(" 20GB") + assert.EqualValues(t, "20GB", val) + + val = getRetentionSize("3TB") + assert.EqualValues(t, "3TB", val) + + val = getRetentionSize("30GB") + assert.EqualValues(t, "30GB", val) + + val = getRetentionSize("1EB") + assert.EqualValues(t, "1EB", val) } // TestHandleRemoteWrite verifies that remote write configurations are properly handled diff --git a/pkg/cluster/template/scripts/monitoring.go b/pkg/cluster/template/scripts/monitoring.go index 2e40e38bd4..3bdec0f439 100644 --- a/pkg/cluster/template/scripts/monitoring.go +++ 
b/pkg/cluster/template/scripts/monitoring.go @@ -26,7 +26,8 @@ import ( type PrometheusScript struct { Port int WebExternalURL string - Retention string + RetentionSize string + RetentionTime string EnableNG bool EnablePromAgentMode bool diff --git a/pkg/cluster/template/scripts/monitoring_test.go b/pkg/cluster/template/scripts/monitoring_test.go index c2b2f664b2..f11e8df6d4 100644 --- a/pkg/cluster/template/scripts/monitoring_test.go +++ b/pkg/cluster/template/scripts/monitoring_test.go @@ -32,7 +32,8 @@ func TestPrometheusScriptWithAgentMode(t *testing.T) { script := &PrometheusScript{ Port: 9090, WebExternalURL: "http://localhost:9090", - Retention: "30d", + RetentionTime: "30d", + RetentionSize: "100GB", EnableNG: false, EnablePromAgentMode: true, DeployDir: "/deploy", From b2b2026933682d44f40c3943eb0773bb84149a91 Mon Sep 17 00:00:00 2001 From: Allen Zhong Date: Tue, 18 Nov 2025 15:11:50 +0900 Subject: [PATCH 2/2] cluster: log warning when retention values are invalid --- pkg/cluster/spec/monitoring.go | 20 +++++++++----- pkg/cluster/spec/monitoring_test.go | 42 ++++++++++++++--------------- 2 files changed, 35 insertions(+), 27 deletions(-) diff --git a/pkg/cluster/spec/monitoring.go b/pkg/cluster/spec/monitoring.go index 8a81e11c16..9386344ca9 100644 --- a/pkg/cluster/spec/monitoring.go +++ b/pkg/cluster/spec/monitoring.go @@ -30,6 +30,7 @@ import ( "github.com/pingcap/tiup/pkg/cluster/ctxt" "github.com/pingcap/tiup/pkg/cluster/template/config" "github.com/pingcap/tiup/pkg/cluster/template/scripts" + logprinter "github.com/pingcap/tiup/pkg/logger/printer" "github.com/pingcap/tiup/pkg/meta" "github.com/pingcap/tiup/pkg/set" "github.com/pingcap/tiup/pkg/utils" @@ -284,12 +285,13 @@ func (i *MonitorInstance) InitConfig( AdditionalArgs: spec.AdditionalArgs, } // Set retention policy - if spec.RetentionTime == "" { - cfg.RetentionTime = getRetentionTime(spec.Retention) + logPtr, _ := ctx.Value(logprinter.ContextKeyLogger).(*logprinter.Logger) // comma-ok: nil logger (helpers tolerate it) instead of a panic when ctx has none + if spec.RetentionTime 
== "" { // keep backward compatibility + cfg.RetentionTime = getRetentionTime(logPtr, spec.Retention) } else { - cfg.RetentionTime = getRetentionTime(spec.RetentionTime) + cfg.RetentionTime = getRetentionTime(logPtr, spec.RetentionTime) } - cfg.RetentionSize = getRetentionSize(spec.RetentionSize) + cfg.RetentionSize = getRetentionSize(logPtr, spec.RetentionSize) // Check if agent mode is enabled in additional arguments if !cfg.EnablePromAgentMode { @@ -683,19 +685,25 @@ func mergeAdditionalScrapeConf(source string, addition map[string]any) error { return utils.WriteFile(source, bytes, 0644) } -func getRetentionSize(retention string) string { +func getRetentionSize(l *logprinter.Logger, retention string) string { retention = strings.ToUpper(strings.TrimSpace(retention)) valid, _ := regexp.MatchString("^[1-9]\\d*(B|KB|MB|GB|TB|PB|EB)$", retention) if retention == "" || !valid { + if retention != "" && l != nil { // an unset value is not an error: warn only on a non-empty, malformed one + l.Warnf("invalid retention size %s, ignored.", retention) + } return "" } return retention } -func getRetentionTime(retention string) string { +func getRetentionTime(l *logprinter.Logger, retention string) string { retention = strings.TrimSpace(retention) valid, _ := regexp.MatchString("^[1-9]\\d*d$", retention) if retention == "" || !valid { + if retention != "" && l != nil { // an unset value silently takes the default: warn only on a non-empty, malformed one + l.Warnf("invalid retention time %s, using 30d as default", retention) + } return "30d" } return retention diff --git a/pkg/cluster/spec/monitoring_test.go b/pkg/cluster/spec/monitoring_test.go index 779fe2516f..ce8b34e355 100644 --- a/pkg/cluster/spec/monitoring_test.go +++ b/pkg/cluster/spec/monitoring_test.go @@ -229,67 +229,67 @@ scrape_configs: func TestGetRetention(t *testing.T) { var val string - val = getRetentionTime("-1d") + val = getRetentionTime(nil, "-1d") assert.EqualValues(t, "30d", val) - val = getRetentionTime("0d") + val = getRetentionTime(nil, "0d") assert.EqualValues(t, "30d", val) - val = getRetentionTime("01d") + val = getRetentionTime(nil, "01d") assert.EqualValues(t, "30d", 
val) - val = getRetentionTime("1dd") + val = getRetentionTime(nil, "1dd") assert.EqualValues(t, "30d", val) - val = getRetentionTime("*1d") + val = getRetentionTime(nil, "*1d") assert.EqualValues(t, "30d", val) - val = getRetentionTime("1d ") + val = getRetentionTime(nil, "1d ") assert.EqualValues(t, "1d", val) - val = getRetentionTime(" 1d") + val = getRetentionTime(nil, " 1d") assert.EqualValues(t, "1d", val) - val = getRetentionTime("ddd") + val = getRetentionTime(nil, "ddd") assert.EqualValues(t, "30d", val) - val = getRetentionTime("60d") + val = getRetentionTime(nil, "60d") assert.EqualValues(t, "60d", val) - val = getRetentionTime("999d") + val = getRetentionTime(nil, "999d") assert.EqualValues(t, "999d", val) - val = getRetentionSize("-1MB") + val = getRetentionSize(nil, "-1MB") assert.EqualValues(t, "", val) - val = getRetentionSize("30d") + val = getRetentionSize(nil, "30d") assert.EqualValues(t, "", val) - val = getRetentionSize("1k") + val = getRetentionSize(nil, "1k") assert.EqualValues(t, "", val) - val = getRetentionSize("01G") + val = getRetentionSize(nil, "01G") assert.EqualValues(t, "", val) - val = getRetentionSize("233mb") + val = getRetentionSize(nil, "233mb") assert.EqualValues(t, "233MB", val) - val = getRetentionSize("*1GB") + val = getRetentionSize(nil, "*1GB") assert.EqualValues(t, "", val) - val = getRetentionSize("20GB ") + val = getRetentionSize(nil, "20GB ") assert.EqualValues(t, "20GB", val) - val = getRetentionSize(" 20GB") + val = getRetentionSize(nil, " 20GB") assert.EqualValues(t, "20GB", val) - val = getRetentionSize("3TB") + val = getRetentionSize(nil, "3TB") assert.EqualValues(t, "3TB", val) - val = getRetentionSize("30GB") + val = getRetentionSize(nil, "30GB") assert.EqualValues(t, "30GB", val) - val = getRetentionSize("1EB") + val = getRetentionSize(nil, "1EB") assert.EqualValues(t, "1EB", val) }