diff --git a/assets/state-dcgm-exporter/0800_daemonset.yaml b/assets/state-dcgm-exporter/0800_daemonset.yaml
index 30e65f103..c826aafbb 100644
--- a/assets/state-dcgm-exporter/0800_daemonset.yaml
+++ b/assets/state-dcgm-exporter/0800_daemonset.yaml
@@ -52,6 +52,21 @@ spec:
         ports:
         - name: "metrics"
           containerPort: 9400
+        # Liveness: HTTP GET /health on the named "metrics" port every 5s,
+        # starting 45s after the container starts.
+        livenessProbe:
+          httpGet:
+            port: "metrics"
+            path: /health
+          initialDelaySeconds: 45
+          periodSeconds: 5
+        # Readiness: same /health endpoint on the named "metrics" port,
+        # starting 45s after the container starts (default period).
+        readinessProbe:
+          httpGet:
+            port: "metrics"
+            path: /health
+          initialDelaySeconds: 45
         volumeMounts:
         - name: "pod-gpu-resources"
           readOnly: true
diff --git a/assets/state-dcgm/0400_dcgm.yml b/assets/state-dcgm/0400_dcgm.yml
index 14fea317a..0c27fd4e6 100644
--- a/assets/state-dcgm/0400_dcgm.yml
+++ b/assets/state-dcgm/0400_dcgm.yml
@@ -43,6 +43,18 @@ spec:
         ports:
         - name: "dcgm"
           containerPort: 5555
+        # Liveness: TCP connect to the named "dcgm" port, starting 15s
+        # after the container starts.
+        livenessProbe:
+          tcpSocket:
+            port: "dcgm"
+          initialDelaySeconds: 15
+        # Readiness: TCP connect to the named "dcgm" port, starting 15s
+        # after the container starts.
+        readinessProbe:
+          tcpSocket:
+            port: "dcgm"
+          initialDelaySeconds: 15
       volumes:
       - name: run-nvidia
         hostPath: