diff --git a/braintrust/README.md b/braintrust/README.md
index da4bbb6..89c1d8a 100644
--- a/braintrust/README.md
+++ b/braintrust/README.md
@@ -18,6 +18,32 @@ The `braintrust-secrets` secret must contain the following keys:
 | `GCS_ACCESS_KEY_ID` | Google HMAC Access ID string | Valid S3 API Key Id (only required if `cloud` is `google`) |
 | `GCS_SECRET_ACCESS_KEY` | Google HMAC Secret string | Valid S3 Secret string (only required if `cloud` is `google`) |
 
+## Scheduled Restarts
+
+By default, the chart creates a CronJob that performs rolling restarts of the API,
+Brainstore reader, and Brainstore writer Deployments once per hour using
+`kubectl rollout restart`. This keeps restarts graceful and leverages the
+Deployment rolling update strategy.
+
+You can customize or disable the schedules:
+
+```yaml
+scheduledRestart:
+  enabled: true
+  schedule: "0 * * * *"
+  targets:
+    brainstoreWriter: false # Opt out of writer restarts
+  image:
+    repository: "chainguard/kubectl"
+    tag: "latest" # Optional: pin a specific version
+```
+
+Defaults to `chainguard/kubectl:latest` from Docker Hub.
+
+If you already manage RBAC or service accounts, set
+`scheduledRestart.serviceAccount.create` and `scheduledRestart.rbac.create` to
+false and provide a `scheduledRestart.serviceAccount.name`.
+
 ## Azure Key Vault Driver Integration
 
 If you're using Azure, the Azure Key Vault CSI driver is default enabled and will automatically sync secrets from Azure Key Vault into Kubernetes. This eliminates the need to manually create and manage the `braintrust-secrets` Kubernetes secret.
diff --git a/braintrust/templates/scheduled-restart.yaml b/braintrust/templates/scheduled-restart.yaml
new file mode 100644
index 0000000..4d5b5d3
--- /dev/null
+++ b/braintrust/templates/scheduled-restart.yaml
@@ -0,0 +1,133 @@
+{{- /*
+Scheduled rolling restarts.
+
+When scheduledRestart.enabled is true this template renders:
+  - a ServiceAccount (if scheduledRestart.serviceAccount.create),
+  - a namespaced Role and RoleBinding (if scheduledRestart.rbac.create),
+  - a CronJob running `kubectl rollout restart` on every enabled target.
+*/}}
+{{- if .Values.scheduledRestart.enabled }}
+{{- $cfg := .Values.scheduledRestart }}
+{{- $namespace := include "braintrust.namespace" . }}
+{{- $saName := default "braintrust-restart" $cfg.serviceAccount.name }}
+{{- /*
+Sprig's merge mutates its first argument, so merge into a fresh dict to keep
+.Values.global.labels untouched for templates rendered after this one.
+Global labels still take precedence over scheduledRestart.labels.
+*/}}
+{{- $labels := merge (dict) (.Values.global.labels | default (dict)) ($cfg.labels | default (dict)) }}
+{{- if $cfg.serviceAccount.create }}
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: {{ $saName }}
+  namespace: {{ $namespace }}
+  {{- with $cfg.serviceAccount.annotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+  {{- with $labels }}
+  labels:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+{{- end }}
+{{- if $cfg.rbac.create }}
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: {{ $saName }}
+  namespace: {{ $namespace }}
+  {{- with $labels }}
+  labels:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+rules:
+  - apiGroups: ["apps"]
+    resources: ["deployments"]
+    verbs: ["get", "list", "patch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: {{ $saName }}
+  namespace: {{ $namespace }}
+  {{- with $labels }}
+  labels:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+subjects:
+  - kind: ServiceAccount
+    name: {{ $saName }}
+    namespace: {{ $namespace }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: {{ $saName }}
+{{- end }}
+{{- /* Skip the CronJob when no target is enabled: kubectl would have nothing to restart. */}}
+{{- $hasTargets := or $cfg.targets.api $cfg.targets.brainstoreReader $cfg.targets.brainstoreWriter }}
+{{- if $hasTargets }}
+---
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: {{ .Values.api.name }}-restart
+  namespace: {{ $namespace }}
+  {{- with $labels }}
+  labels:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+  {{- with $cfg.annotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+spec:
+  schedule: {{ required "scheduledRestart.schedule is required" $cfg.schedule | quote }}
+  concurrencyPolicy: {{ $cfg.concurrencyPolicy | default "Forbid" }}
+  {{- /* Nil-check with kindIs "invalid" instead of `default`, which would discard an explicit 0. */}}
+  startingDeadlineSeconds: {{ ternary 600 $cfg.startingDeadlineSeconds (kindIs "invalid" $cfg.startingDeadlineSeconds) }}
+  successfulJobsHistoryLimit: {{ ternary 1 $cfg.successfulJobsHistoryLimit (kindIs "invalid" $cfg.successfulJobsHistoryLimit) }}
+  failedJobsHistoryLimit: {{ ternary 3 $cfg.failedJobsHistoryLimit (kindIs "invalid" $cfg.failedJobsHistoryLimit) }}
+  jobTemplate:
+    spec:
+      backoffLimit: {{ ternary 1 $cfg.backoffLimit (kindIs "invalid" $cfg.backoffLimit) }}
+      {{- $ttl := $cfg.ttlSecondsAfterFinished }}
+      {{- if not (or (kindIs "invalid" $ttl) (eq (toString $ttl) "")) }}
+      ttlSecondsAfterFinished: {{ $ttl }}
+      {{- end }}
+      template:
+        metadata:
+          {{- with $labels }}
+          labels:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+        spec:
+          serviceAccountName: {{ $saName }}
+          restartPolicy: Never
+          containers:
+            - name: restart
+              image: "{{ $cfg.image.repository }}:{{ default "latest" $cfg.image.tag }}"
+              imagePullPolicy: {{ $cfg.image.pullPolicy | default "IfNotPresent" }}
+              command:
+                - kubectl
+              args:
+                - rollout
+                - restart
+                {{- if $cfg.targets.api }}
+                - deployment/{{ .Values.api.name }}
+                {{- end }}
+                {{- if $cfg.targets.brainstoreReader }}
+                - deployment/{{ .Values.brainstore.reader.name }}
+                {{- end }}
+                {{- if $cfg.targets.brainstoreWriter }}
+                - deployment/{{ .Values.brainstore.writer.name }}
+                {{- end }}
+                - --namespace
+                - {{ $namespace | quote }}
+              {{- with $cfg.resources }}
+              resources:
+                {{- toYaml . | nindent 16 }}
+              {{- end }}
+{{- end }}
+{{- end }}
diff --git a/braintrust/values.yaml b/braintrust/values.yaml
index e3847d8..3860849 100644
--- a/braintrust/values.yaml
+++ b/braintrust/values.yaml
@@ -68,6 +68,35 @@ objectStorage:
   # Single API bucket with paths for responses and code bundles
   apiBucket: ""
 
+scheduledRestart:
+  # Perform rolling restarts on a schedule using kubectl rollout restart.
+  enabled: true
+  targets:
+    api: true
+    brainstoreReader: true
+    brainstoreWriter: true
+  schedule: "0 * * * *"
+  image:
+    repository: "chainguard/kubectl"
+    tag: "latest"
+    pullPolicy: IfNotPresent
+  serviceAccount:
+    create: true
+    name: "braintrust-restart"
+    annotations: {}
+  rbac:
+    create: true
+  concurrencyPolicy: "Forbid"
+  startingDeadlineSeconds: 600
+  successfulJobsHistoryLimit: 1
+  failedJobsHistoryLimit: 3
+  backoffLimit: 1
+  # Seconds to retain finished Jobs before automatic cleanup; leave empty to disable.
+  ttlSecondsAfterFinished: ""
+  resources: {}
+  labels: {}
+  annotations: {}
+
 api:
   name: "braintrust-api"
   labels: {}