From aa991e115e0fab3504a96f8e84d1b0f19a2ab8e8 Mon Sep 17 00:00:00 2001 From: Vladimir Videlov Date: Wed, 11 Mar 2026 15:29:21 +0100 Subject: [PATCH 1/2] refactor: per-alert service and support_group labels --- charts/controlplane-operations/Chart.yaml | 2 +- .../alerts/controlplane-bond.yaml | 10 ++++++---- .../alerts/controlplane-node.yaml | 5 +++-- .../alerts/controlplane-pvc.yaml | 10 ++++++---- .../alerts/controlplane-remote.yaml | 20 +++++++++++-------- .../plugindefinition.yaml | 4 ++-- .../templates/_helpers.tpl | 18 +++-------------- charts/controlplane-operations/values.yaml | 13 ++++++++---- 8 files changed, 42 insertions(+), 40 deletions(-) diff --git a/charts/controlplane-operations/Chart.yaml b/charts/controlplane-operations/Chart.yaml index 8001ce1..6f21490 100644 --- a/charts/controlplane-operations/Chart.yaml +++ b/charts/controlplane-operations/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v2 name: controlplane-operations -version: 1.0.12 +version: 1.0.13 description: A set of Plutono dashboards and Prometheus alerting rules combined with playbooks to ensure effective operations of Controlplane clusters. maintainers: - name: Vladimir Videlov (d051408) diff --git a/charts/controlplane-operations/alerts/controlplane-bond.yaml b/charts/controlplane-operations/alerts/controlplane-bond.yaml index 3ec416f..c39f18f 100644 --- a/charts/controlplane-operations/alerts/controlplane-bond.yaml +++ b/charts/controlplane-operations/alerts/controlplane-bond.yaml @@ -6,10 +6,11 @@ groups: expr: sum(node_bonding_active) by (master, node) < 2 for: {{ dig "NodeBondDegradedMain" "for" "15m" .Values.prometheusRules }} labels: + {{ include "controlplane-operations.additionalRuleLabels" . }} severity: {{ dig "NodeBondDegradedMain" "severity" "warning" .Values.prometheusRules }} playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/NodeBondDegradedMain.md - {{ include "controlplane-operations.additionalRuleLabels.common" . 
}} - {{ include "controlplane-operations.additionalRuleLabels.supportGroupContainers" . }} + service: {{ dig "NodeBondDegradedMain" "service" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} + support_group: {{ dig "NodeBondDegradedMain" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} annotations: description: Bond `{{`{{ $labels.master }}`}}` on `{{`{{ $labels.node }}`}}` is degraded. Imminent network outage for this node. summary: Bond `{{`{{ $labels.master }}`}}` is degraded. Node network connectivity is not HA. Switch failover or upgrade will cause an outage! @@ -20,10 +21,11 @@ groups: expr: sum(node_network_up{device=~"bond.*|vlan.*"} == 0) by (node, device) for: {{ dig "NodeVirtualInterfaceDown" "for" "15m" .Values.prometheusRules }} labels: + {{ include "controlplane-operations.additionalRuleLabels" . }} severity: {{ dig "NodeVirtualInterfaceDown" "severity" "warning" .Values.prometheusRules }} playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/NodeVirtualInterfaceDown.md - {{ include "controlplane-operations.additionalRuleLabels.common" . }} - {{ include "controlplane-operations.additionalRuleLabels.supportGroupContainers" . }} + service: {{ dig "NodeVirtualInterfaceDown" "service" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} + support_group: {{ dig "NodeVirtualInterfaceDown" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} annotations: description: Interface `{{`{{ $labels.device }}`}}` on `{{`{{ $labels.node }}`}}` is down. Tenant network outage for this node. summary: Interface `{{`{{ $labels.device }}`}}` is down. Node network connectivity is degraded. 
diff --git a/charts/controlplane-operations/alerts/controlplane-node.yaml b/charts/controlplane-operations/alerts/controlplane-node.yaml index 96f733c..8ee03c8 100644 --- a/charts/controlplane-operations/alerts/controlplane-node.yaml +++ b/charts/controlplane-operations/alerts/controlplane-node.yaml @@ -6,10 +6,11 @@ groups: expr: kube_node_status_condition{condition="BridgeFilterVLANTagged", status="true"} == 1 for: {{ dig "KubernetesNodeBridgeFilterVLANTagged" "for" "15m" .Values.prometheusRules }} labels: + {{ include "controlplane-operations.additionalRuleLabels" . }} severity: {{ dig "KubernetesNodeBridgeFilterVLANTagged" "severity" "info" .Values.prometheusRules }} playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/KubernetesNodeBridgeFilterVLANTagged.md - {{ include "controlplane-operations.additionalRuleLabels.common" . }} - {{ include "controlplane-operations.additionalRuleLabels.supportGroupContainers" . }} + service: {{ dig "KubernetesNodeBridgeFilterVLANTagged" "service" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} + support_group: {{ dig "KubernetesNodeBridgeFilterVLANTagged" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} annotations: description: VLAN-tagged ARP/IP traffic is filtered by ARPtables/IPtables on `{{`{{ $labels.node }}`}}`. Network datapath threatened! summary: Bridged VLAN-tagged traffic is filtered by IPtables. diff --git a/charts/controlplane-operations/alerts/controlplane-pvc.yaml b/charts/controlplane-operations/alerts/controlplane-pvc.yaml index e219c72..d5f08bc 100644 --- a/charts/controlplane-operations/alerts/controlplane-pvc.yaml +++ b/charts/controlplane-operations/alerts/controlplane-pvc.yaml @@ -11,10 +11,11 @@ groups: expr: kubelet_volume_stats_available_percent < 10 for: {{ dig "KubernetesPVCNoSpaceLeft" "for" "10m" .Values.prometheusRules }} labels: + {{ include "controlplane-operations.additionalRuleLabels" . 
}} severity: {{ dig "KubernetesPVCNoSpaceLeft" "severity" "info" .Values.prometheusRules }} playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/KubernetesPVCNoSpaceLeft.md - {{ include "controlplane-operations.additionalRuleLabels.common" . }} - {{ include "controlplane-operations.additionalRuleLabels.supportGroupContainers" . }} + service: {{ dig "KubernetesPVCNoSpaceLeft" "service" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} + support_group: {{ dig "KubernetesPVCNoSpaceLeft" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} annotations: description: "The PVC `{{`{{ $labels.namespace }}`}}`/`{{`{{ $labels.persistentvolumeclaim }}`}}` is almost full. Increase or delete files." summary: "PVC `{{`{{ $labels.namespace }}`}}`/`{{`{{ $labels.persistentvolumeclaim }}`}}` free space is less than 10%." @@ -25,10 +26,11 @@ groups: expr: kubelet_volume_stats_available_percent < 2 for: {{ dig "KubernetesPVCNoSpaceLeft" "for" "10m" .Values.prometheusRules }} labels: + {{ include "controlplane-operations.additionalRuleLabels" . }} severity: {{ dig "KubernetesPVCNoSpaceLeft" "severity" "warning" .Values.prometheusRules }} playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/KubernetesPVCNoSpaceLeft.md - {{ include "controlplane-operations.additionalRuleLabels.common" . }} - {{ include "controlplane-operations.additionalRuleLabels.supportGroupContainers" . }} + service: {{ dig "KubernetesPVCNoSpaceLeft" "service" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} + support_group: {{ dig "KubernetesPVCNoSpaceLeft" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} annotations: description: "The PVC `{{`{{ $labels.namespace }}`}}`/`{{`{{ $labels.persistentvolumeclaim }}`}}` is full. Programs will stop working if relying upon free storage." 
summary: "PVC `{{`{{ $labels.namespace }}`}}`/`{{`{{ $labels.persistentvolumeclaim }}`}}` usage is over 98%." diff --git a/charts/controlplane-operations/alerts/controlplane-remote.yaml b/charts/controlplane-operations/alerts/controlplane-remote.yaml index 42f13a5..2127b6e 100644 --- a/charts/controlplane-operations/alerts/controlplane-remote.yaml +++ b/charts/controlplane-operations/alerts/controlplane-remote.yaml @@ -8,10 +8,11 @@ groups: == 1 for: {{ dig "ArgoraUpdateInError" "for" "10m" .Values.prometheusRules }} labels: + {{ include "controlplane-operations.additionalRuleLabels" . }} severity: {{ dig "ArgoraUpdateInError" "severity" "warning" .Values.prometheusRules }} playbook: https://operations.global.cloud.sap/docs/support/playbook/argora-update-in-error/ #TODO: add playbook - {{ include "controlplane-operations.additionalRuleLabels.common" . }} - {{ include "controlplane-operations.additionalRuleLabels.supportGroupContainers" . }} + service: {{ dig "ArgoraUpdateInError" "service" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} + support_group: {{ dig "ArgoraUpdateInError" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} annotations: description: "Argora Update CR status is in Error state for more than 10 minutes." summary: "Update CR in Error state." @@ -24,10 +25,11 @@ groups: == 1 for: {{ dig "ArgoraClusterImportInError" "for" "10m" .Values.prometheusRules }} labels: + {{ include "controlplane-operations.additionalRuleLabels" . }} severity: {{ dig "ArgoraClusterImportInError" "severity" "warning" .Values.prometheusRules }} playbook: https://operations.global.cloud.sap/docs/support/playbook/argora-clusterimport-in-error/ #TODO: add playbook - {{ include "controlplane-operations.additionalRuleLabels.common" . }} - {{ include "controlplane-operations.additionalRuleLabels.supportGroupContainers" . 
}} + service: {{ dig "ArgoraClusterImportInError" "service" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} + support_group: {{ dig "ArgoraClusterImportInError" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} annotations: description: "Argora ClusterImport CR status is in Error state for more than 10 minutes." summary: "ClusterImport CR in Error state." @@ -39,10 +41,11 @@ groups: kube_pod_status_ready{pod=~"argora-controller-manager-.+",condition="true"} == 0 for: {{ dig "ArgoraPodNotReadyError" "for" "5m" .Values.prometheusRules }} labels: + {{ include "controlplane-operations.additionalRuleLabels" . }} severity: {{ dig "ArgoraPodNotReadyError" "severity" "warning" .Values.prometheusRules }} playbook: https://operations.global.cloud.sap/docs/support/playbook/argora-pod-not-ready/ #TODO: add playbook - {{ include "controlplane-operations.additionalRuleLabels.common" . }} - {{ include "controlplane-operations.additionalRuleLabels.supportGroupContainers" . }} + service: {{ dig "ArgoraPodNotReadyError" "service" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} + support_group: {{ dig "ArgoraPodNotReadyError" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} annotations: description: "Argora Pod is not ready for more than 5 minutes." summary: "Pod not ready." @@ -55,10 +58,11 @@ groups: == 1 for: {{ dig "ServerStuckInDiscovery" "for" "15m" .Values.prometheusRules }} labels: + {{ include "controlplane-operations.additionalRuleLabels" . }} severity: {{ dig "ServerStuckInDiscovery" "severity" "warning" .Values.prometheusRules }} playbook: https://operations.global.cloud.sap/docs/support/playbook/server-stuck-in-discovery/ #TODO: add playbook - {{ include "controlplane-operations.additionalRuleLabels.common" . }} - {{ include "controlplane-operations.additionalRuleLabels.supportGroupFoundation" . 
}} + service: {{ dig "ServerStuckInDiscovery" "service" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} + support_group: {{ dig "ServerStuckInDiscovery" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} annotations: description: "Server is stuck in Discovery for more than 15 minutes." summary: "Server stuck in Discovery." diff --git a/charts/controlplane-operations/plugindefinition.yaml b/charts/controlplane-operations/plugindefinition.yaml index 123efab..c7fdfd8 100644 --- a/charts/controlplane-operations/plugindefinition.yaml +++ b/charts/controlplane-operations/plugindefinition.yaml @@ -3,7 +3,7 @@ kind: PluginDefinition metadata: name: controlplane-operations spec: - version: 1.0.12 + version: 1.0.13 displayName: Controlplane operations bundle description: Operations bundle for Controlane clusters docMarkDownUrl: https://raw.githubusercontent.com/cloudoperators/controlplane-operations/main/README.md @@ -11,7 +11,7 @@ spec: helmChart: name: controlplane-operations repository: oci://ghcr.io/cloudoperators/controlplane-operations/charts - version: 1.0.12 + version: 1.0.13 options: - name: prometheusRules.create description: Create Prometheus rules diff --git a/charts/controlplane-operations/templates/_helpers.tpl b/charts/controlplane-operations/templates/_helpers.tpl index bf77ab4..555a753 100644 --- a/charts/controlplane-operations/templates/_helpers.tpl +++ b/charts/controlplane-operations/templates/_helpers.tpl @@ -19,27 +19,15 @@ app.kubernetes.io/part-of: {{ $root.Release.Name }} {{- end }} {{- end }} -{{- define "controlplane-operations.additionalRuleLabels.common" }} -{{- if .Values.prometheusRules.additionalRuleLabels.common }} -{{- toYaml .Values.prometheusRules.additionalRuleLabels.common | nindent 6 }} +{{- define "controlplane-operations.additionalRuleLabels" }} +{{- if .Values.prometheusRules.additionalRuleLabels }} +{{- toYaml .Values.prometheusRules.additionalRuleLabels | nindent 6 }} {{- 
end }} {{- if .Values.global.commonLabels }} {{ tpl (toYaml .Values.global.commonLabels) . }} {{- end }} {{- end }} -{{- define "controlplane-operations.additionalRuleLabels.supportGroupContainers" }} -{{- if .Values.prometheusRules.additionalRuleLabels.supportGroupContainers }} -{{- toYaml .Values.prometheusRules.additionalRuleLabels.supportGroupContainers | nindent 6 }} -{{- end }} -{{- end }} - -{{- define "controlplane-operations.additionalRuleLabels.supportGroupFoundation" }} -{{- if .Values.prometheusRules.additionalRuleLabels.supportGroupFoundation }} -{{- toYaml .Values.prometheusRules.additionalRuleLabels.supportGroupFoundation | nindent 6 }} -{{- end }} -{{- end }} - {{- define "controlplane-operations.dashboardSelectorLabels" }} {{- $path := index . 0 -}} {{- $root := index . 1 -}} diff --git a/charts/controlplane-operations/values.yaml b/charts/controlplane-operations/values.yaml index 887a3f9..ed330ee 100644 --- a/charts/controlplane-operations/values.yaml +++ b/charts/controlplane-operations/values.yaml @@ -26,12 +26,12 @@ prometheusRules: ## Annotations for PrometheusRules annotations: {} + ## Default support group for alerts. This is used if no support group label is defined for an alert. 
+ defaultSupportGroup: "" + ## Additional labels for PrometheusRule alerts ## This is useful for adding additional labels of alerts to each rule - additionalRuleLabels: - common: {} - supportGroupContainers: {} - supportGroupFoundation: {} + additionalRuleLabels: {} ## Additional annotations for PrometheusRule alerts additionalRuleAnnotations: {} @@ -41,6 +41,11 @@ prometheusRules: # KubernetesApiServerDown: true # KubeletDown: true + # NodeVirtualInterfaceDown: + # service: "cc-cp" + # supportGroup: "containers" + # for: "15m" + # severity: "warning" ## Create default dashboards for monitoring the cluster ## From 70625d41aaf5bd2476b514e0a28f0d139a7db368 Mon Sep 17 00:00:00 2001 From: Vladimir Videlov Date: Wed, 11 Mar 2026 15:34:37 +0100 Subject: [PATCH 2/2] add defaultService for the service label --- .../controlplane-operations/alerts/controlplane-bond.yaml | 4 ++-- .../controlplane-operations/alerts/controlplane-node.yaml | 2 +- .../controlplane-operations/alerts/controlplane-pvc.yaml | 4 ++-- .../alerts/controlplane-remote.yaml | 8 ++++---- charts/controlplane-operations/values.yaml | 3 +++ 5 files changed, 12 insertions(+), 9 deletions(-) diff --git a/charts/controlplane-operations/alerts/controlplane-bond.yaml b/charts/controlplane-operations/alerts/controlplane-bond.yaml index c39f18f..152c648 100644 --- a/charts/controlplane-operations/alerts/controlplane-bond.yaml +++ b/charts/controlplane-operations/alerts/controlplane-bond.yaml @@ -9,7 +9,7 @@ groups: {{ include "controlplane-operations.additionalRuleLabels" . 
}} severity: {{ dig "NodeBondDegradedMain" "severity" "warning" .Values.prometheusRules }} playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/NodeBondDegradedMain.md - service: {{ dig "NodeBondDegradedMain" "service" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} + service: {{ dig "NodeBondDegradedMain" "service" .Values.prometheusRules.defaultService .Values.prometheusRules | quote }} support_group: {{ dig "NodeBondDegradedMain" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} annotations: description: Bond `{{`{{ $labels.master }}`}}` on `{{`{{ $labels.node }}`}}` is degraded. Imminent network outage for this node. @@ -24,7 +24,7 @@ groups: {{ include "controlplane-operations.additionalRuleLabels" . }} severity: {{ dig "NodeVirtualInterfaceDown" "severity" "warning" .Values.prometheusRules }} playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/NodeVirtualInterfaceDown.md - service: {{ dig "NodeVirtualInterfaceDown" "service" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} + service: {{ dig "NodeVirtualInterfaceDown" "service" .Values.prometheusRules.defaultService .Values.prometheusRules | quote }} support_group: {{ dig "NodeVirtualInterfaceDown" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} annotations: description: Interface `{{`{{ $labels.device }}`}}` on `{{`{{ $labels.node }}`}}` is down. Tenant network outage for this node. diff --git a/charts/controlplane-operations/alerts/controlplane-node.yaml b/charts/controlplane-operations/alerts/controlplane-node.yaml index 8ee03c8..75e2d59 100644 --- a/charts/controlplane-operations/alerts/controlplane-node.yaml +++ b/charts/controlplane-operations/alerts/controlplane-node.yaml @@ -9,7 +9,7 @@ groups: {{ include "controlplane-operations.additionalRuleLabels" . 
}} severity: {{ dig "KubernetesNodeBridgeFilterVLANTagged" "severity" "info" .Values.prometheusRules }} playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/KubernetesNodeBridgeFilterVLANTagged.md - service: {{ dig "KubernetesNodeBridgeFilterVLANTagged" "service" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} + service: {{ dig "KubernetesNodeBridgeFilterVLANTagged" "service" .Values.prometheusRules.defaultService .Values.prometheusRules | quote }} support_group: {{ dig "KubernetesNodeBridgeFilterVLANTagged" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} annotations: description: VLAN-tagged ARP/IP traffic is filtered by ARPtables/IPtables on `{{`{{ $labels.node }}`}}`. Network datapath threatened! diff --git a/charts/controlplane-operations/alerts/controlplane-pvc.yaml b/charts/controlplane-operations/alerts/controlplane-pvc.yaml index d5f08bc..c31cef6 100644 --- a/charts/controlplane-operations/alerts/controlplane-pvc.yaml +++ b/charts/controlplane-operations/alerts/controlplane-pvc.yaml @@ -14,7 +14,7 @@ groups: {{ include "controlplane-operations.additionalRuleLabels" . }} severity: {{ dig "KubernetesPVCNoSpaceLeft" "severity" "info" .Values.prometheusRules }} playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/KubernetesPVCNoSpaceLeft.md - service: {{ dig "KubernetesPVCNoSpaceLeft" "service" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} + service: {{ dig "KubernetesPVCNoSpaceLeft" "service" .Values.prometheusRules.defaultService .Values.prometheusRules | quote }} support_group: {{ dig "KubernetesPVCNoSpaceLeft" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} annotations: description: "The PVC `{{`{{ $labels.namespace }}`}}`/`{{`{{ $labels.persistentvolumeclaim }}`}}` is almost full. Increase or delete files." 
@@ -29,7 +29,7 @@ groups: {{ include "controlplane-operations.additionalRuleLabels" . 
}} severity: {{ dig "KubernetesPVCNoSpaceLeft" "severity" "warning" .Values.prometheusRules }} playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/KubernetesPVCNoSpaceLeft.md - service: {{ dig "KubernetesPVCNoSpaceLeft" "service" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} + service: {{ dig "KubernetesPVCNoSpaceLeft" "service" .Values.prometheusRules.defaultService .Values.prometheusRules | quote }} support_group: {{ dig "KubernetesPVCNoSpaceLeft" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} annotations: description: "The PVC `{{`{{ $labels.namespace }}`}}`/`{{`{{ $labels.persistentvolumeclaim }}`}}` is full. Programs will stop working if relying upon free storage." diff --git a/charts/controlplane-operations/alerts/controlplane-remote.yaml b/charts/controlplane-operations/alerts/controlplane-remote.yaml index 2127b6e..fa51e0e 100644 --- a/charts/controlplane-operations/alerts/controlplane-remote.yaml +++ b/charts/controlplane-operations/alerts/controlplane-remote.yaml @@ -11,7 +11,7 @@ groups: {{ include "controlplane-operations.additionalRuleLabels" . }} severity: {{ dig "ArgoraUpdateInError" "severity" "warning" .Values.prometheusRules }} playbook: https://operations.global.cloud.sap/docs/support/playbook/argora-update-in-error/ #TODO: add playbook - service: {{ dig "ArgoraUpdateInError" "service" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} + service: {{ dig "ArgoraUpdateInError" "service" .Values.prometheusRules.defaultService .Values.prometheusRules | quote }} support_group: {{ dig "ArgoraUpdateInError" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} annotations: description: "Argora Update CR status is in Error state for more than 10 minutes." @@ -28,7 +28,7 @@ groups: {{ include "controlplane-operations.additionalRuleLabels" . 
}} severity: {{ dig "ArgoraClusterImportInError" "severity" "warning" .Values.prometheusRules }} playbook: https://operations.global.cloud.sap/docs/support/playbook/argora-clusterimport-in-error/ #TODO: add playbook - service: {{ dig "ArgoraClusterImportInError" "service" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} + service: {{ dig "ArgoraClusterImportInError" "service" .Values.prometheusRules.defaultService .Values.prometheusRules | quote }} support_group: {{ dig "ArgoraClusterImportInError" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} annotations: description: "Argora ClusterImport CR status is in Error state for more than 10 minutes." @@ -44,7 +44,7 @@ groups: {{ include "controlplane-operations.additionalRuleLabels" . }} severity: {{ dig "ArgoraPodNotReadyError" "severity" "warning" .Values.prometheusRules }} playbook: https://operations.global.cloud.sap/docs/support/playbook/argora-pod-not-ready/ #TODO: add playbook - service: {{ dig "ArgoraPodNotReadyError" "service" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} + service: {{ dig "ArgoraPodNotReadyError" "service" .Values.prometheusRules.defaultService .Values.prometheusRules | quote }} support_group: {{ dig "ArgoraPodNotReadyError" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} annotations: description: "Argora Pod is not ready for more than 5 minutes." @@ -61,7 +61,7 @@ groups: {{ include "controlplane-operations.additionalRuleLabels" . 
}} severity: {{ dig "ServerStuckInDiscovery" "severity" "warning" .Values.prometheusRules }} playbook: https://operations.global.cloud.sap/docs/support/playbook/server-stuck-in-discovery/ #TODO: add playbook - service: {{ dig "ServerStuckInDiscovery" "service" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} + service: {{ dig "ServerStuckInDiscovery" "service" .Values.prometheusRules.defaultService .Values.prometheusRules | quote }} support_group: {{ dig "ServerStuckInDiscovery" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules | quote }} annotations: description: "Server is stuck in Discovery for more than 15 minutes." diff --git a/charts/controlplane-operations/values.yaml b/charts/controlplane-operations/values.yaml index ed330ee..978d5fc 100644 --- a/charts/controlplane-operations/values.yaml +++ b/charts/controlplane-operations/values.yaml @@ -26,6 +26,9 @@ prometheusRules: ## Annotations for PrometheusRules annotations: {} + ## Default service label for alerts. This is used if no service label is defined for an alert. + defaultService: "" + ## Default support group for alerts. This is used if no support group label is defined for an alert. defaultSupportGroup: ""