Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion charts/controlplane-operations/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
apiVersion: v2
name: controlplane-operations
version: 1.0.12
version: 1.0.13
description: A set of Plutono dashboards and Prometheus alerting rules combined with playbooks to ensure effective operations of Controlplane clusters.
maintainers:
- name: Vladimir Videlov (d051408)
Expand Down
10 changes: 6 additions & 4 deletions charts/controlplane-operations/alerts/controlplane-bond.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@ groups:
expr: sum(node_bonding_active) by (master, node) < 2
for: {{ dig "NodeBondDegradedMain" "for" "15m" .Values.prometheusRules }}
labels:
{{ include "controlplane-operations.additionalRuleLabels" . }}
severity: {{ dig "NodeBondDegradedMain" "severity" "warning" .Values.prometheusRules }}
playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/NodeBondDegradedMain.md
{{ include "controlplane-operations.additionalRuleLabels.common" . }}
{{ include "controlplane-operations.additionalRuleLabels.supportGroupContainers" . }}
service: {{ dig "NodeBondDegradedMain" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
support_group: {{ dig "NodeBondDegradedMain" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
annotations:
description: Bond `{{`{{ $labels.master }}`}}` on `{{`{{ $labels.node }}`}}` is degraded. Imminent network outage for this node.
summary: Bond `{{`{{ $labels.master }}`}}` is degraded. Node network connectivity is not HA. Switch failover or upgrade will cause an outage!
Expand All @@ -20,10 +21,11 @@ groups:
expr: sum(node_network_up{device=~"bond.*|vlan.*"} == 0) by (node, device)
for: {{ dig "NodeVirtualInterfaceDown" "for" "15m" .Values.prometheusRules }}
labels:
{{ include "controlplane-operations.additionalRuleLabels" . }}
severity: {{ dig "NodeVirtualInterfaceDown" "severity" "warning" .Values.prometheusRules }}
playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/NodeVirtualInterfaceDown.md
{{ include "controlplane-operations.additionalRuleLabels.common" . }}
{{ include "controlplane-operations.additionalRuleLabels.supportGroupContainers" . }}
service: {{ dig "NodeVirtualInterfaceDown" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
support_group: {{ dig "NodeVirtualInterfaceDown" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
annotations:
description: Interface `{{`{{ $labels.device }}`}}` on `{{`{{ $labels.node }}`}}` is down. Tenant network outage for this node.
summary: Interface `{{`{{ $labels.device }}`}}` is down. Node network connectivity is degraded.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@ groups:
expr: kube_node_status_condition{condition="BridgeFilterVLANTagged", status="true"} == 1
for: {{ dig "KubernetesNodeBridgeFilterVLANTagged" "for" "15m" .Values.prometheusRules }}
labels:
{{ include "controlplane-operations.additionalRuleLabels" . }}
severity: {{ dig "KubernetesNodeBridgeFilterVLANTagged" "severity" "info" .Values.prometheusRules }}
playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/KubernetesNodeBridgeFilterVLANTagged.md
{{ include "controlplane-operations.additionalRuleLabels.common" . }}
{{ include "controlplane-operations.additionalRuleLabels.supportGroupContainers" . }}
service: {{ dig "KubernetesNodeBridgeFilterVLANTagged" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
support_group: {{ dig "KubernetesNodeBridgeFilterVLANTagged" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
annotations:
description: VLAN-tagged ARP/IP traffic is filtered by ARPtables/IPtables on `{{`{{ $labels.node }}`}}`. Network datapath threatened!
summary: Bridged VLAN-tagged traffic is filtered by IPtables.
Expand Down
10 changes: 6 additions & 4 deletions charts/controlplane-operations/alerts/controlplane-pvc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,11 @@ groups:
expr: kubelet_volume_stats_available_percent < 10
for: {{ dig "KubernetesPVCNoSpaceLeft" "for" "10m" .Values.prometheusRules }}
labels:
{{ include "controlplane-operations.additionalRuleLabels" . }}
severity: {{ dig "KubernetesPVCNoSpaceLeft" "severity" "info" .Values.prometheusRules }}
playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/KubernetesPVCNoSpaceLeft.md
{{ include "controlplane-operations.additionalRuleLabels.common" . }}
{{ include "controlplane-operations.additionalRuleLabels.supportGroupContainers" . }}
service: {{ dig "KubernetesPVCNoSpaceLeft" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
support_group: {{ dig "KubernetesPVCNoSpaceLeft" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
annotations:
description: "The PVC `{{`{{ $labels.namespace }}`}}`/`{{`{{ $labels.persistentvolumeclaim }}`}}` is almost full. Increase or delete files."
summary: "PVC `{{`{{ $labels.namespace }}`}}`/`{{`{{ $labels.persistentvolumeclaim }}`}}` free space is less than 10%."
Expand All @@ -25,10 +26,11 @@ groups:
expr: kubelet_volume_stats_available_percent < 2
for: {{ dig "KubernetesPVCNoSpaceLeft" "for" "10m" .Values.prometheusRules }}
labels:
{{ include "controlplane-operations.additionalRuleLabels" . }}
severity: {{ dig "KubernetesPVCNoSpaceLeft" "severity" "warning" .Values.prometheusRules }}
playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/KubernetesPVCNoSpaceLeft.md
{{ include "controlplane-operations.additionalRuleLabels.common" . }}
{{ include "controlplane-operations.additionalRuleLabels.supportGroupContainers" . }}
service: {{ dig "KubernetesPVCNoSpaceLeft" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
support_group: {{ dig "KubernetesPVCNoSpaceLeft" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
annotations:
description: "The PVC `{{`{{ $labels.namespace }}`}}`/`{{`{{ $labels.persistentvolumeclaim }}`}}` is full. Programs will stop working if relying upon free storage."
summary: "PVC `{{`{{ $labels.namespace }}`}}`/`{{`{{ $labels.persistentvolumeclaim }}`}}` usage is over 98%."
Expand Down
20 changes: 12 additions & 8 deletions charts/controlplane-operations/alerts/controlplane-remote.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@ groups:
== 1
for: {{ dig "ArgoraUpdateInError" "for" "10m" .Values.prometheusRules }}
labels:
{{ include "controlplane-operations.additionalRuleLabels" . }}
severity: {{ dig "ArgoraUpdateInError" "severity" "warning" .Values.prometheusRules }}
playbook: https://operations.global.cloud.sap/docs/support/playbook/argora-update-in-error/ #TODO: add playbook
{{ include "controlplane-operations.additionalRuleLabels.common" . }}
{{ include "controlplane-operations.additionalRuleLabels.supportGroupContainers" . }}
service: {{ dig "ArgoraUpdateInError" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
support_group: {{ dig "ArgoraUpdateInError" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
annotations:
description: "Argora Update CR status is in Error state for more than 10 minutes."
summary: "Update CR in Error state."
Expand All @@ -24,10 +25,11 @@ groups:
== 1
for: {{ dig "ArgoraClusterImportInError" "for" "10m" .Values.prometheusRules }}
labels:
{{ include "controlplane-operations.additionalRuleLabels" . }}
severity: {{ dig "ArgoraClusterImportInError" "severity" "warning" .Values.prometheusRules }}
playbook: https://operations.global.cloud.sap/docs/support/playbook/argora-clusterimport-in-error/ #TODO: add playbook
{{ include "controlplane-operations.additionalRuleLabels.common" . }}
{{ include "controlplane-operations.additionalRuleLabels.supportGroupContainers" . }}
service: {{ dig "ArgoraClusterImportInError" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
support_group: {{ dig "ArgoraClusterImportInError" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
annotations:
description: "Argora ClusterImport CR status is in Error state for more than 10 minutes."
summary: "ClusterImport CR in Error state."
Expand All @@ -39,10 +41,11 @@ groups:
kube_pod_status_ready{pod=~"argora-controller-manager-.+",condition="true"} == 0
for: {{ dig "ArgoraPodNotReadyError" "for" "5m" .Values.prometheusRules }}
labels:
{{ include "controlplane-operations.additionalRuleLabels" . }}
severity: {{ dig "ArgoraPodNotReadyError" "severity" "warning" .Values.prometheusRules }}
playbook: https://operations.global.cloud.sap/docs/support/playbook/argora-pod-not-ready/ #TODO: add playbook
{{ include "controlplane-operations.additionalRuleLabels.common" . }}
{{ include "controlplane-operations.additionalRuleLabels.supportGroupContainers" . }}
service: {{ dig "ArgoraPodNotReadyError" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
support_group: {{ dig "ArgoraPodNotReadyError" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
annotations:
description: "Argora Pod is not ready for more than 5 minutes."
summary: "Pod not ready."
Expand All @@ -55,10 +58,11 @@ groups:
== 1
for: {{ dig "ServerStuckInDiscovery" "for" "15m" .Values.prometheusRules }}
labels:
{{ include "controlplane-operations.additionalRuleLabels" . }}
severity: {{ dig "ServerStuckInDiscovery" "severity" "warning" .Values.prometheusRules }}
playbook: https://operations.global.cloud.sap/docs/support/playbook/server-stuck-in-discovery/ #TODO: add playbook
{{ include "controlplane-operations.additionalRuleLabels.common" . }}
{{ include "controlplane-operations.additionalRuleLabels.supportGroupFoundation" . }}
service: {{ dig "ServerStuckInDiscovery" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
support_group: {{ dig "ServerStuckInDiscovery" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
annotations:
description: "Server is stuck in Discovery for more than 15 minutes."
summary: "Server stuck in Discovery."
Expand Down
4 changes: 2 additions & 2 deletions charts/controlplane-operations/plugindefinition.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@ kind: PluginDefinition
metadata:
name: controlplane-operations
spec:
version: 1.0.12
version: 1.0.13
displayName: Controlplane operations bundle
description: Operations bundle for Controlplane clusters
docMarkDownUrl: https://raw.githubusercontent.com/cloudoperators/controlplane-operations/main/README.md
icon: https://raw.githubusercontent.com/cloudoperators/controlplane-operations/main/charts/controlplane-operations/kubernetes-logo.png
helmChart:
name: controlplane-operations
repository: oci://ghcr.io/cloudoperators/controlplane-operations/charts
version: 1.0.12
version: 1.0.13
options:
- name: prometheusRules.create
description: Create Prometheus rules
Expand Down
18 changes: 3 additions & 15 deletions charts/controlplane-operations/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -19,27 +19,15 @@ app.kubernetes.io/part-of: {{ $root.Release.Name }}
{{- end }}
{{- end }}

{{- define "controlplane-operations.additionalRuleLabels.common" }}
{{- if .Values.prometheusRules.additionalRuleLabels.common }}
{{- toYaml .Values.prometheusRules.additionalRuleLabels.common | nindent 6 }}
{{- define "controlplane-operations.additionalRuleLabels" }}
{{- if .Values.prometheusRules.additionalRuleLabels }}
{{- toYaml .Values.prometheusRules.additionalRuleLabels | nindent 6 }}
{{- end }}
{{- if .Values.global.commonLabels }}
{{ tpl (toYaml .Values.global.commonLabels) . }}
{{- end }}
{{- end }}

{{- define "controlplane-operations.additionalRuleLabels.supportGroupContainers" }}
{{- if .Values.prometheusRules.additionalRuleLabels.supportGroupContainers }}
{{- toYaml .Values.prometheusRules.additionalRuleLabels.supportGroupContainers | nindent 6 }}
{{- end }}
{{- end }}

{{- define "controlplane-operations.additionalRuleLabels.supportGroupFoundation" }}
{{- if .Values.prometheusRules.additionalRuleLabels.supportGroupFoundation }}
{{- toYaml .Values.prometheusRules.additionalRuleLabels.supportGroupFoundation | nindent 6 }}
{{- end }}
{{- end }}

{{- define "controlplane-operations.dashboardSelectorLabels" }}
{{- $path := index . 0 -}}
{{- $root := index . 1 -}}
Expand Down
16 changes: 12 additions & 4 deletions charts/controlplane-operations/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,15 @@ prometheusRules:
## Annotations for PrometheusRules
annotations: {}

## Default service label for alerts. This is used if no service label is defined for an alert.
defaultService: ""

## Default support group for alerts. This is used if no support group label is defined for an alert.
defaultSupportGroup: ""

## Additional labels for PrometheusRule alerts
## Use this to attach extra labels to every alert rule.
additionalRuleLabels:
common: {}
supportGroupContainers: {}
supportGroupFoundation: {}
additionalRuleLabels: {}

## Additional annotations for PrometheusRule alerts
additionalRuleAnnotations: {}
Expand All @@ -41,6 +44,11 @@ prometheusRules:
# KubernetesApiServerDown: true
# KubeletDown: true

# NodeVirtualInterfaceDown:
# service: "cc-cp"
# supportGroup: "containers"
# for: "15m"
# severity: "warning"

## Create default dashboards for monitoring the cluster
##
Expand Down
Loading