diff --git a/charts/controlplane-operations/Chart.yaml b/charts/controlplane-operations/Chart.yaml index 8001ce1..6f21490 100644 --- a/charts/controlplane-operations/Chart.yaml +++ b/charts/controlplane-operations/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v2 name: controlplane-operations -version: 1.0.12 +version: 1.0.13 description: A set of Plutono dashboards and Prometheus alerting rules combined with playbooks to ensure effective operations of Controlplane clusters. maintainers: - name: Vladimir Videlov (d051408) diff --git a/charts/controlplane-operations/alerts/controlplane-bond.yaml b/charts/controlplane-operations/alerts/controlplane-bond.yaml index 3ec416f..152c648 100644 --- a/charts/controlplane-operations/alerts/controlplane-bond.yaml +++ b/charts/controlplane-operations/alerts/controlplane-bond.yaml @@ -6,10 +6,11 @@ groups: expr: sum(node_bonding_active) by (master, node) < 2 for: {{ dig "NodeBondDegradedMain" "for" "15m" .Values.prometheusRules }} labels: + {{ include "controlplane-operations.additionalRuleLabels" . }} severity: {{ dig "NodeBondDegradedMain" "severity" "warning" .Values.prometheusRules }} playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/NodeBondDegradedMain.md - {{ include "controlplane-operations.additionalRuleLabels.common" . }} - {{ include "controlplane-operations.additionalRuleLabels.supportGroupContainers" . }} + service: {{ dig "NodeBondDegradedMain" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} + support_group: {{ dig "NodeBondDegradedMain" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: description: Bond `{{`{{ $labels.master }}`}}` on `{{`{{ $labels.node }}`}}` is degraded. Imminent network outage for this node. summary: Bond `{{`{{ $labels.master }}`}}` is degraded. Node network connectivity is not HA. Switch failover or upgrade will cause an outage! @@ -20,10 +21,11 @@ groups: expr: sum(node_network_up{device=~"bond.*|vlan.*"} == 0) by (node, device) for: {{ dig "NodeVirtualInterfaceDown" "for" "15m" .Values.prometheusRules }} labels: + {{ include "controlplane-operations.additionalRuleLabels" . }} severity: {{ dig "NodeVirtualInterfaceDown" "severity" "warning" .Values.prometheusRules }} playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/NodeVirtualInterfaceDown.md - {{ include "controlplane-operations.additionalRuleLabels.common" . }} - {{ include "controlplane-operations.additionalRuleLabels.supportGroupContainers" . }} + service: {{ dig "NodeVirtualInterfaceDown" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} + support_group: {{ dig "NodeVirtualInterfaceDown" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: description: Interface `{{`{{ $labels.device }}`}}` on `{{`{{ $labels.node }}`}}` is down. Tenant network outage for this node. summary: Interface `{{`{{ $labels.device }}`}}` is down. Node network connectivity is degraded. diff --git a/charts/controlplane-operations/alerts/controlplane-node.yaml b/charts/controlplane-operations/alerts/controlplane-node.yaml index 96f733c..75e2d59 100644 --- a/charts/controlplane-operations/alerts/controlplane-node.yaml +++ b/charts/controlplane-operations/alerts/controlplane-node.yaml @@ -6,10 +6,11 @@ groups: expr: kube_node_status_condition{condition="BridgeFilterVLANTagged", status="true"} == 1 for: {{ dig "KubernetesNodeBridgeFilterVLANTagged" "for" "15m" .Values.prometheusRules }} labels: + {{ include "controlplane-operations.additionalRuleLabels" . }} severity: {{ dig "KubernetesNodeBridgeFilterVLANTagged" "severity" "info" .Values.prometheusRules }} playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/KubernetesNodeBridgeFilterVLANTagged.md - {{ include "controlplane-operations.additionalRuleLabels.common" . }} - {{ include "controlplane-operations.additionalRuleLabels.supportGroupContainers" . }} + service: {{ dig "KubernetesNodeBridgeFilterVLANTagged" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} + support_group: {{ dig "KubernetesNodeBridgeFilterVLANTagged" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: description: VLAN-tagged ARP/IP traffic is filtered by ARPtables/IPtables on `{{`{{ $labels.node }}`}}`. Network datapath threatened! summary: Bridged VLAN-tagged traffic is filtered by IPtables. diff --git a/charts/controlplane-operations/alerts/controlplane-pvc.yaml b/charts/controlplane-operations/alerts/controlplane-pvc.yaml index e219c72..c31cef6 100644 --- a/charts/controlplane-operations/alerts/controlplane-pvc.yaml +++ b/charts/controlplane-operations/alerts/controlplane-pvc.yaml @@ -11,10 +11,11 @@ groups: expr: kubelet_volume_stats_available_percent < 10 for: {{ dig "KubernetesPVCNoSpaceLeft" "for" "10m" .Values.prometheusRules }} labels: + {{ include "controlplane-operations.additionalRuleLabels" . }} severity: {{ dig "KubernetesPVCNoSpaceLeft" "severity" "info" .Values.prometheusRules }} playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/KubernetesPVCNoSpaceLeft.md - {{ include "controlplane-operations.additionalRuleLabels.common" . }} - {{ include "controlplane-operations.additionalRuleLabels.supportGroupContainers" . }} + service: {{ dig "KubernetesPVCNoSpaceLeft" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} + support_group: {{ dig "KubernetesPVCNoSpaceLeft" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: description: "The PVC `{{`{{ $labels.namespace }}`}}`/`{{`{{ $labels.persistentvolumeclaim }}`}}` is almost full. Increase or delete files." summary: "PVC `{{`{{ $labels.namespace }}`}}`/`{{`{{ $labels.persistentvolumeclaim }}`}}` free space is less than 10%." @@ -25,10 +26,11 @@ groups: expr: kubelet_volume_stats_available_percent < 2 for: {{ dig "KubernetesPVCNoSpaceLeft" "for" "10m" .Values.prometheusRules }} labels: + {{ include "controlplane-operations.additionalRuleLabels" . }} severity: {{ dig "KubernetesPVCNoSpaceLeft" "severity" "warning" .Values.prometheusRules }} playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/KubernetesPVCNoSpaceLeft.md - {{ include "controlplane-operations.additionalRuleLabels.common" . }} - {{ include "controlplane-operations.additionalRuleLabels.supportGroupContainers" . }} + service: {{ dig "KubernetesPVCNoSpaceLeft" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} + support_group: {{ dig "KubernetesPVCNoSpaceLeft" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: description: "The PVC `{{`{{ $labels.namespace }}`}}`/`{{`{{ $labels.persistentvolumeclaim }}`}}` is full. Programs will stop working if relying upon free storage." summary: "PVC `{{`{{ $labels.namespace }}`}}`/`{{`{{ $labels.persistentvolumeclaim }}`}}` usage is over 98%." diff --git a/charts/controlplane-operations/alerts/controlplane-remote.yaml b/charts/controlplane-operations/alerts/controlplane-remote.yaml index 42f13a5..fa51e0e 100644 --- a/charts/controlplane-operations/alerts/controlplane-remote.yaml +++ b/charts/controlplane-operations/alerts/controlplane-remote.yaml @@ -8,10 +8,11 @@ groups: == 1 for: {{ dig "ArgoraUpdateInError" "for" "10m" .Values.prometheusRules }} labels: + {{ include "controlplane-operations.additionalRuleLabels" . }} severity: {{ dig "ArgoraUpdateInError" "severity" "warning" .Values.prometheusRules }} playbook: https://operations.global.cloud.sap/docs/support/playbook/argora-update-in-error/ #TODO: add playbook - {{ include "controlplane-operations.additionalRuleLabels.common" . }} - {{ include "controlplane-operations.additionalRuleLabels.supportGroupContainers" . }} + service: {{ dig "ArgoraUpdateInError" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} + support_group: {{ dig "ArgoraUpdateInError" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: description: "Argora Update CR status is in Error state for more than 10 minutes." summary: "Update CR in Error state." @@ -24,10 +25,11 @@ groups: == 1 for: {{ dig "ArgoraClusterImportInError" "for" "10m" .Values.prometheusRules }} labels: + {{ include "controlplane-operations.additionalRuleLabels" . }} severity: {{ dig "ArgoraClusterImportInError" "severity" "warning" .Values.prometheusRules }} playbook: https://operations.global.cloud.sap/docs/support/playbook/argora-clusterimport-in-error/ #TODO: add playbook - {{ include "controlplane-operations.additionalRuleLabels.common" . }} - {{ include "controlplane-operations.additionalRuleLabels.supportGroupContainers" . }} + service: {{ dig "ArgoraClusterImportInError" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} + support_group: {{ dig "ArgoraClusterImportInError" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: description: "Argora ClusterImport CR status is in Error state for more than 10 minutes." summary: "ClusterImport CR in Error state." @@ -39,10 +41,11 @@ groups: kube_pod_status_ready{pod=~"argora-controller-manager-.+",condition="true"} == 0 for: {{ dig "ArgoraPodNotReadyError" "for" "5m" .Values.prometheusRules }} labels: + {{ include "controlplane-operations.additionalRuleLabels" . }} severity: {{ dig "ArgoraPodNotReadyError" "severity" "warning" .Values.prometheusRules }} playbook: https://operations.global.cloud.sap/docs/support/playbook/argora-pod-not-ready/ #TODO: add playbook - {{ include "controlplane-operations.additionalRuleLabels.common" . }} - {{ include "controlplane-operations.additionalRuleLabels.supportGroupContainers" . }} + service: {{ dig "ArgoraPodNotReadyError" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} + support_group: {{ dig "ArgoraPodNotReadyError" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: description: "Argora Pod is not ready for more than 5 minutes." summary: "Pod not ready." @@ -55,10 +58,11 @@ groups: == 1 for: {{ dig "ServerStuckInDiscovery" "for" "15m" .Values.prometheusRules }} labels: + {{ include "controlplane-operations.additionalRuleLabels" . }} severity: {{ dig "ServerStuckInDiscovery" "severity" "warning" .Values.prometheusRules }} playbook: https://operations.global.cloud.sap/docs/support/playbook/server-stuck-in-discovery/ #TODO: add playbook - {{ include "controlplane-operations.additionalRuleLabels.common" . }} - {{ include "controlplane-operations.additionalRuleLabels.supportGroupFoundation" . }} + service: {{ dig "ServerStuckInDiscovery" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} + support_group: {{ dig "ServerStuckInDiscovery" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: description: "Server is stuck in Discovery for more than 15 minutes." summary: "Server stuck in Discovery." diff --git a/charts/controlplane-operations/plugindefinition.yaml b/charts/controlplane-operations/plugindefinition.yaml index 123efab..c7fdfd8 100644 --- a/charts/controlplane-operations/plugindefinition.yaml +++ b/charts/controlplane-operations/plugindefinition.yaml @@ -3,7 +3,7 @@ kind: PluginDefinition metadata: name: controlplane-operations spec: - version: 1.0.12 + version: 1.0.13 displayName: Controlplane operations bundle description: Operations bundle for Controlane clusters docMarkDownUrl: https://raw.githubusercontent.com/cloudoperators/controlplane-operations/main/README.md @@ -11,7 +11,7 @@ spec: helmChart: name: controlplane-operations repository: oci://ghcr.io/cloudoperators/controlplane-operations/charts - version: 1.0.12 + version: 1.0.13 options: - name: prometheusRules.create description: Create Prometheus rules diff --git a/charts/controlplane-operations/templates/_helpers.tpl b/charts/controlplane-operations/templates/_helpers.tpl index bf77ab4..555a753 100644 --- a/charts/controlplane-operations/templates/_helpers.tpl +++ b/charts/controlplane-operations/templates/_helpers.tpl @@ -19,27 +19,15 @@ app.kubernetes.io/part-of: {{ $root.Release.Name }} {{- end }} {{- end }} -{{- define "controlplane-operations.additionalRuleLabels.common" }} -{{- if .Values.prometheusRules.additionalRuleLabels.common }} -{{- toYaml .Values.prometheusRules.additionalRuleLabels.common | nindent 6 }} +{{- define "controlplane-operations.additionalRuleLabels" }} +{{- if .Values.prometheusRules.additionalRuleLabels }} +{{- toYaml .Values.prometheusRules.additionalRuleLabels | nindent 6 }} {{- end }} {{- if .Values.global.commonLabels }} {{ tpl (toYaml .Values.global.commonLabels) . }} {{- end }} {{- end }} -{{- define "controlplane-operations.additionalRuleLabels.supportGroupContainers" }} -{{- if .Values.prometheusRules.additionalRuleLabels.supportGroupContainers }} -{{- toYaml .Values.prometheusRules.additionalRuleLabels.supportGroupContainers | nindent 6 }} -{{- end }} -{{- end }} - -{{- define "controlplane-operations.additionalRuleLabels.supportGroupFoundation" }} -{{- if .Values.prometheusRules.additionalRuleLabels.supportGroupFoundation }} -{{- toYaml .Values.prometheusRules.additionalRuleLabels.supportGroupFoundation | nindent 6 }} -{{- end }} -{{- end }} - {{- define "controlplane-operations.dashboardSelectorLabels" }} {{- $path := index . 0 -}} {{- $root := index . 1 -}} diff --git a/charts/controlplane-operations/values.yaml b/charts/controlplane-operations/values.yaml index 887a3f9..978d5fc 100644 --- a/charts/controlplane-operations/values.yaml +++ b/charts/controlplane-operations/values.yaml @@ -26,12 +26,15 @@ prometheusRules: ## Annotations for PrometheusRules annotations: {} + ## Default service label for alerts. This is used if no service label is defined for an alert. + defaultService: "" + + ## Default support group for alerts. This is used if no support group label is defined for an alert. + defaultSupportGroup: "" + ## Additional labels for PrometheusRule alerts ## This is useful for adding additional labels of alerts to each rule - additionalRuleLabels: - common: {} - supportGroupContainers: {} - supportGroupFoundation: {} + additionalRuleLabels: {} ## Additional annotations for PrometheusRule alerts additionalRuleAnnotations: {} @@ -41,6 +44,11 @@ prometheusRules: # KubernetesApiServerDown: true # KubeletDown: true + # NodeVirtualInterfaceDown: + # service: "cc-cp" + # supportGroup: "containers" + # for: "15m" + # severity: "warning" ## Create default dashboards for monitoring the cluster ##