From f1292caf1f8f909265f9ea0aeb4ffcdec35ec9eb Mon Sep 17 00:00:00 2001
From: Oded David
Date: Wed, 3 Dec 2025 12:57:32 +0200
Subject: [PATCH 1/2] feat(helm): add ServiceMonitor and PrometheusRule support

- Add serviceMonitor configuration to values.yaml with interval and scrapeTimeout options
- Add prometheusRule configuration with RDSControllerSyncErrors alert (severity: critical)
- Create service-monitor.yaml template for Prometheus Operator ServiceMonitor
- Create prometheus-rule.yaml template for Prometheus Operator PrometheusRule
- Update values.schema.json with schema definitions for new options
---
 helm/templates/prometheus-rule.yaml | 25 ++++++++++++++
 helm/templates/service-monitor.yaml | 36 ++++++++++++++++++++
 helm/values.schema.json             | 53 +++++++++++++++++++++++++++++
 helm/values.yaml                    | 26 ++++++++++++++
 4 files changed, 140 insertions(+)
 create mode 100644 helm/templates/prometheus-rule.yaml
 create mode 100644 helm/templates/service-monitor.yaml

diff --git a/helm/templates/prometheus-rule.yaml b/helm/templates/prometheus-rule.yaml
new file mode 100644
index 0000000..9fd6010
--- /dev/null
+++ b/helm/templates/prometheus-rule.yaml
@@ -0,0 +1,25 @@
+{{- if .Values.metrics.prometheusRule.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: {{ .Chart.Name | trimSuffix "-chart" | trunc 44 }}-controller-rules
+  namespace: {{ .Release.Namespace }}
+  labels:
+    app.kubernetes.io/name: {{ include "ack-rds-controller.app.name" . }}
+    app.kubernetes.io/instance: {{ .Release.Name }}
+    app.kubernetes.io/managed-by: Helm
+    app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
+    k8s-app: {{ include "ack-rds-controller.app.name" . }}
+    helm.sh/chart: {{ include "ack-rds-controller.chart.name-version" . }}
+{{- with .Values.metrics.prometheusRule.additionalLabels }}
+{{ toYaml . | indent 4 }}
+{{- end }}
+spec:
+  groups:
+    - name: {{ include "ack-rds-controller.app.name" . }}
+      rules:
+{{- with .Values.metrics.prometheusRule.rules }}
+{{ toYaml . | indent 8 }}
+{{- end }}
+{{- end }}
+
diff --git a/helm/templates/service-monitor.yaml b/helm/templates/service-monitor.yaml
new file mode 100644
index 0000000..4c82a5a
--- /dev/null
+++ b/helm/templates/service-monitor.yaml
@@ -0,0 +1,36 @@
+{{- if .Values.metrics.serviceMonitor.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: {{ .Chart.Name | trimSuffix "-chart" | trunc 44 }}-controller-metrics
+  namespace: {{ .Release.Namespace }}
+  labels:
+    app.kubernetes.io/name: {{ include "ack-rds-controller.app.name" . }}
+    app.kubernetes.io/instance: {{ .Release.Name }}
+    app.kubernetes.io/managed-by: Helm
+    app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
+    k8s-app: {{ include "ack-rds-controller.app.name" . }}
+    helm.sh/chart: {{ include "ack-rds-controller.chart.name-version" . }}
+{{- with .Values.metrics.serviceMonitor.additionalLabels }}
+{{ toYaml . | indent 4 }}
+{{- end }}
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: {{ include "ack-rds-controller.app.name" . }}
+      app.kubernetes.io/instance: {{ .Release.Name }}
+      app.kubernetes.io/managed-by: Helm
+      k8s-app: {{ include "ack-rds-controller.app.name" . }}
+  namespaceSelector:
+    matchNames:
+      - {{ .Release.Namespace }}
+  endpoints:
+    - port: metricsport
+      {{- with .Values.metrics.serviceMonitor.interval }}
+      interval: {{ . }}
+      {{- end }}
+      {{- with .Values.metrics.serviceMonitor.scrapeTimeout }}
+      scrapeTimeout: {{ . }}
+      {{- end }}
+{{- end }}
+
diff --git a/helm/values.schema.json b/helm/values.schema.json
index c3f56a0..e186962 100644
--- a/helm/values.schema.json
+++ b/helm/values.schema.json
@@ -104,6 +104,59 @@
             "type"
           ],
           "type": "object"
+        },
+        "serviceMonitor": {
+          "description": "Prometheus ServiceMonitor settings",
+          "properties": {
+            "enabled": {
+              "type": "boolean"
+            },
+            "additionalLabels": {
+              "type": "object"
+            },
+            "interval": {
+              "type": "string"
+            },
+            "scrapeTimeout": {
+              "type": "string"
+            }
+          },
+          "type": "object"
+        },
+        "prometheusRule": {
+          "description": "Prometheus PrometheusRule settings",
+          "properties": {
+            "enabled": {
+              "type": "boolean"
+            },
+            "additionalLabels": {
+              "type": "object"
+            },
+            "rules": {
+              "type": "array",
+              "items": {
+                "type": "object",
+                "properties": {
+                  "alert": {
+                    "type": "string"
+                  },
+                  "expr": {
+                    "type": "string"
+                  },
+                  "for": {
+                    "type": "string"
+                  },
+                  "labels": {
+                    "type": "object"
+                  },
+                  "annotations": {
+                    "type": "object"
+                  }
+                }
+              }
+            }
+          },
+          "type": "object"
         }
       },
       "required": [
diff --git a/helm/values.yaml b/helm/values.yaml
index 8f2aea3..994e73c 100644
--- a/helm/values.yaml
+++ b/helm/values.yaml
@@ -77,6 +77,32 @@ metrics:
     # Which Type to use for the Kubernetes Service?
     # See: https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types
     type: "ClusterIP"
+  serviceMonitor:
+    # Set to true to automatically create a Prometheus ServiceMonitor resource
+    # Requires the Prometheus Operator CRDs to be installed
+    enabled: false
+    # Additional labels for the ServiceMonitor (e.g., for Prometheus selector)
+    additionalLabels: {}
+    # Scrape interval
+    interval: 30s
+    # Scrape timeout
+    scrapeTimeout: 10s
+  prometheusRule:
+    # Set to true to automatically create a Prometheus PrometheusRule resource
+    # Requires the Prometheus Operator CRDs to be installed
+    enabled: false
+    # Additional labels for the PrometheusRule (e.g., for Prometheus selector)
+    additionalLabels: {}
+    # Alert rules (can be overridden or extended)
+    rules:
+      - alert: RDSControllerSyncErrors
+        expr: sum by (controller) (rate(controller_runtime_reconcile_errors_total{job="rds-controller-controller-metrics"}[10m])) > 0.5
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          description: RDS controller having sync errors in the last 10 minutes for controller {{ $labels.controller }}
+          summary: RDS controller having sync errors with one or more objects
 
 resources:
   requests:

From 8010ab549495097b30abc62c9d16488a604fcae3 Mon Sep 17 00:00:00 2001
From: Oded David
Date: Wed, 3 Dec 2025 12:59:22 +0200
Subject: [PATCH 2/2] fix(helm): correct job label in PrometheusRule alert expression

The alert expression was referencing
job='rds-controller-controller-metrics' but the ServiceMonitor generates
job='rds-controller-metrics'. Fixed the mismatch so the alert will
correctly match scraped metrics.
---
 helm/values.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/helm/values.yaml b/helm/values.yaml
index 994e73c..fef14bc 100644
--- a/helm/values.yaml
+++ b/helm/values.yaml
@@ -96,7 +96,7 @@ metrics:
     # Alert rules (can be overridden or extended)
     rules:
       - alert: RDSControllerSyncErrors
-        expr: sum by (controller) (rate(controller_runtime_reconcile_errors_total{job="rds-controller-controller-metrics"}[10m])) > 0.5
+        expr: sum by (controller) (rate(controller_runtime_reconcile_errors_total{job="rds-controller-metrics"}[10m])) > 0.5
         for: 5m
         labels:
           severity: critical