From 40b0d16d68a68d33c0f5b2527c86bac5e42ab39f Mon Sep 17 00:00:00 2001 From: Tariq Ibrahim Date: Thu, 26 Feb 2026 19:45:11 -0800 Subject: [PATCH] [dcgm][dcgm-exporter] add liveness and readiness probes This commit adds liveness and readiness probes to the dcgm and dcgm-exporter operands. Adding probes to the DCGM pods ensures that these pods aren't marked as "Ready" until the DCGM is actually ready to serve traffic. The DCGM-Exporter probes have been taken from the default probes configured in the Helm chart of the NVIDIA/dcgm-exporter project. Signed-off-by: Tariq Ibrahim --- assets/state-dcgm-exporter/0800_daemonset.yaml | 11 +++++++++++ assets/state-dcgm/0400_dcgm.yml | 8 ++++++++ 2 files changed, 19 insertions(+) diff --git a/assets/state-dcgm-exporter/0800_daemonset.yaml b/assets/state-dcgm-exporter/0800_daemonset.yaml index 30e65f103..c826aafbb 100644 --- a/assets/state-dcgm-exporter/0800_daemonset.yaml +++ b/assets/state-dcgm-exporter/0800_daemonset.yaml @@ -52,6 +52,17 @@ spec: ports: - name: "metrics" containerPort: 9400 + livenessProbe: + httpGet: + port: 9400 + path: /health + initialDelaySeconds: 45 + periodSeconds: 5 + readinessProbe: + httpGet: + port: 9400 + path: /health + initialDelaySeconds: 45 volumeMounts: - name: "pod-gpu-resources" readOnly: true diff --git a/assets/state-dcgm/0400_dcgm.yml b/assets/state-dcgm/0400_dcgm.yml index 14fea317a..0c27fd4e6 100644 --- a/assets/state-dcgm/0400_dcgm.yml +++ b/assets/state-dcgm/0400_dcgm.yml @@ -43,6 +43,14 @@ spec: ports: - name: "dcgm" containerPort: 5555 + livenessProbe: + tcpSocket: + port: 5555 + initialDelaySeconds: 15 + readinessProbe: + tcpSocket: + port: 5555 + initialDelaySeconds: 15 volumes: - name: run-nvidia hostPath: