NVIDIA · pimlock · Mar 31, 2026 · Mar 31, 2026 · Mar 31, 2026 · Mar 31, 2026
@@ -169,6 +169,21 @@ const FAILURE_PATTERNS: &[FailurePattern] = &[
         match_mode: MatchMode::Any,
         diagnose: diagnose_docker_not_running,
     },
+    // CDI specs missing — Docker daemon has CDI configured but no spec files exist
+    // or the requested device ID (nvidia.com/gpu=all) is not in any spec.
+    // Matches errors from Docker 25+ and containerd CDI injection paths.
+    FailurePattern {
+        matchers: &[
+            "CDI device not found",
+            "unknown CDI device",
+            "failed to inject CDI devices",
+            "no CDI devices found",
+            "CDI device injection failed",
+            "unresolvable CDI devices",
+        ],
+        match_mode: MatchMode::Any,
+        diagnose: diagnose_cdi_specs_missing,
+    },
 ];
 
 fn diagnose_corrupted_state(gateway_name: &str) -> GatewayFailureDiagnosis {
@@ -396,6 +411,29 @@ fn diagnose_certificate_issue(gateway_name: &str) -> GatewayFailureDiagnosis {
     }
 }
 
+fn diagnose_cdi_specs_missing(_gateway_name: &str) -> GatewayFailureDiagnosis {
+    GatewayFailureDiagnosis {
+        summary: "CDI specs not found on host".to_string(),
+        explanation: "GPU passthrough via CDI was selected (your Docker daemon has CDI spec \
+            directories configured) but no CDI device specs were found on the host. \
+            Specs must be pre-generated before OpenShell can inject the GPU into the \
+            cluster container."
+            .to_string(),
+        recovery_steps: vec![
+            RecoveryStep::with_command(
+                "Generate CDI specs on the host (nvidia-ctk creates /etc/cdi/ if it does not exist)",
+                "sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml",
+            ),
+            RecoveryStep::with_command(
+                "Verify the specs were generated and include nvidia.com/gpu entries",
+                "nvidia-ctk cdi list",
+            ),
+            RecoveryStep::new("Then retry: openshell gateway start --gpu"),
+        ],
+        retryable: false,
+    }
+}
+
 fn diagnose_docker_not_running(_gateway_name: &str) -> GatewayFailureDiagnosis {
     GatewayFailureDiagnosis {
         summary: "Docker is not running".to_string(),
@@ -925,4 +963,76 @@ mod tests {
             "commands should include gateway name, got: {all_commands}"
         );
     }
+
+    #[test]
+    fn test_diagnose_cdi_device_not_found() {
+        let diagnosis = diagnose_failure(
+            "test",
+            "could not run container: CDI device not found: nvidia.com/gpu=all",
+            None,
+        );
+        assert!(diagnosis.is_some());
+        let d = diagnosis.unwrap();
+        assert!(
+            d.summary.contains("CDI"),
+            "expected CDI diagnosis, got: {}",
+            d.summary
+        );
+        assert!(!d.retryable);
+    }
+
+    #[test]
+    fn test_diagnose_cdi_injection_failed_unresolvable() {
+        // Exact error observed from Docker 500 response
+        let diagnosis = diagnose_failure(
+            "test",
+            "Docker responded with status code 500: CDI device injection failed: unresolvable CDI devices nvidia.com/gpu=all",
+            None,
+        );
+        assert!(diagnosis.is_some());
+        let d = diagnosis.unwrap();
+        assert!(
+            d.summary.contains("CDI"),
+            "expected CDI diagnosis, got: {}",
+            d.summary
+        );
+        assert!(!d.retryable);
+    }
+
+    #[test]
+    fn test_diagnose_unknown_cdi_device() {
+        // containerd error path
+        let diagnosis = diagnose_failure(
+            "test",
+            "unknown CDI device requested: nvidia.com/gpu=all",
+            None,
+        );
+        assert!(diagnosis.is_some());
+        let d = diagnosis.unwrap();
+        assert!(
+            d.summary.contains("CDI"),
+            "expected CDI diagnosis, got: {}",
+            d.summary
+        );
+    }
+
+    #[test]
+    fn test_diagnose_cdi_recovery_mentions_nvidia_ctk() {
+        let d = diagnose_failure("test", "CDI device not found", None)
+            .expect("should match CDI pattern");
+        let all_steps: String = d
+            .recovery_steps
+            .iter()
+            .map(|s| format!("{} {}", s.description, s.command.as_deref().unwrap_or("")))
+            .collect::<Vec<_>>()
+            .join("\n");
+        assert!(
+            all_steps.contains("nvidia-ctk cdi generate"),
+            "recovery steps should mention nvidia-ctk cdi generate, got: {all_steps}"
+        );
+        assert!(
+            all_steps.contains("/etc/cdi/"),
+            "recovery steps should mention /etc/cdi/, got: {all_steps}"
+        );
+    }
 }
@@ -877,13 +877,97 @@ pub(crate) fn spawn_route_refresh(
 
 /// Minimum read-only paths required for a proxy-mode sandbox child process to
 /// function: dynamic linker, shared libraries, DNS resolution, CA certs,
-/// Python venv, and openshell logs.
-const PROXY_BASELINE_READ_ONLY: &[&str] = &["/usr", "/lib", "/etc", "/app", "/var/log"];
+/// Python venv, openshell logs, process info, and random bytes.
+///
+/// `/proc` and `/dev/urandom` are included here for the same reasons they
+/// appear in `restrictive_default_policy()`: virtually every process needs
+/// them.  Before the Landlock per-path fix (#677) these were effectively free
+/// because a missing path silently disabled the entire ruleset; now they must
+/// be explicit.
+const PROXY_BASELINE_READ_ONLY: &[&str] = &[
+    "/usr",
+    "/lib",
+    "/etc",
+    "/app",
+    "/var/log",
+    "/proc",
+    "/dev/urandom",
+];
 
 /// Minimum read-write paths required for a proxy-mode sandbox child process:
 /// user working directory and temporary files.
 const PROXY_BASELINE_READ_WRITE: &[&str] = &["/sandbox", "/tmp"];
 
+/// Fixed read-only paths required when a GPU is present.
+///
+/// `/run/nvidia-persistenced`: NVML tries to connect to the persistenced
+/// socket at init time.  If the directory exists but landlock denies traversal
+/// (EACCES vs ECONNREFUSED), NVML returns NVML_ERROR_INSUFFICIENT_PERMISSIONS
+/// even though the daemon is optional.  Only read/traversal access is needed —
+/// NVML connects to the existing socket but does not create or modify files.
+const GPU_BASELINE_READ_ONLY_FIXED: &[&str] = &["/run/nvidia-persistenced"];
+
+/// Fixed read-write paths required when a GPU is present.
+///
+/// `/dev/nvidiactl`, `/dev/nvidia-uvm`, `/dev/nvidia-uvm-tools`,
+/// `/dev/nvidia-modeset`: control and UVM devices injected by CDI.
+/// Landlock READ_FILE/WRITE_FILE restricts open(2) on device files even
+/// when DAC permissions would otherwise allow it.  Device nodes need
+/// read-write because NVML/CUDA opens them with O_RDWR.
+///
+/// `/proc`: CUDA writes to `/proc/<pid>/task/<tid>/comm` during `cuInit()`
+/// to set thread names.  Without write access, `cuInit()` returns error 304
+/// (`cudaErrorOperatingSystem`).  We must use `/proc` (not `/proc/self/task`)
+/// because Landlock rules bind to inodes: `/proc/self/task` in the pre_exec
+/// hook resolves to the shell's PID, but child processes (python, etc.)
+/// have different PIDs and thus different procfs inodes.  Security impact
+/// is limited — writable proc entries (`oom_score_adj`, etc.) are already
+/// kernel-restricted for non-root users; Landlock is defense-in-depth.
+///
+/// Per-GPU device files (`/dev/nvidia0`, `/dev/nvidia1`, …) are enumerated
+/// at runtime by `gpu_baseline_read_write_paths()` since the count varies.
+const GPU_BASELINE_READ_WRITE_FIXED: &[&str] = &[
+    "/dev/nvidiactl",
+    "/dev/nvidia-uvm",
+    "/dev/nvidia-uvm-tools",
+    "/dev/nvidia-modeset",
+    "/proc",
+];
+
+/// Returns true if GPU devices are present in the container.
+fn has_gpu_devices() -> bool {
+    std::path::Path::new("/dev/nvidiactl").exists()
+}
+
+/// Collect all GPU read-only paths (currently just the persistenced directory).
+fn gpu_baseline_read_only_paths() -> Vec<std::path::PathBuf> {
+    GPU_BASELINE_READ_ONLY_FIXED
+        .iter()
+        .map(|p| std::path::PathBuf::from(p))
+        .collect()
+}
+
+/// Collect all GPU read-write paths: fixed devices + per-GPU `/dev/nvidiaX`.
+fn gpu_baseline_read_write_paths() -> Vec<std::path::PathBuf> {
+    let mut paths: Vec<std::path::PathBuf> = GPU_BASELINE_READ_WRITE_FIXED
+        .iter()
+        .map(|p| std::path::PathBuf::from(p))
+        .collect();
+
+    // Enumerate per-GPU device nodes injected by CDI (nvidia0, nvidia1, …).
+    if let Ok(entries) = std::fs::read_dir("/dev") {
+        for entry in entries.flatten() {
+            let name = entry.file_name();
+            let name = name.to_string_lossy();
+            if name.starts_with("nvidia") && name[6..].chars().all(|c| c.is_ascii_digit()) {
+                paths.push(entry.path());
+            }
+        }
+    }
+
+    paths
+}
+
 /// Ensure a proto `SandboxPolicy` includes the baseline filesystem paths
 /// required for proxy-mode sandboxes.  Paths are only added if missing;
 /// user-specified paths are never removed.
@@ -935,6 +1019,46 @@ fn enrich_proto_baseline_paths(proto: &mut openshell_core::proto::SandboxPolicy)
         }
     }
 
+    if has_gpu_devices() {
+        for path in gpu_baseline_read_only_paths() {
+            let path_str = path.to_string_lossy();
+            if !fs.read_only.iter().any(|p| p.as_str() == path_str.as_ref())
+                && !fs
+                    .read_write
+                    .iter()
+                    .any(|p| p.as_str() == path_str.as_ref())
+            {
+                if !path.exists() {
+                    debug!(
+                        path = %path.display(),
+                        "GPU baseline read-only path does not exist, skipping enrichment"
+                    );
+                    continue;
+                }
+                fs.read_only.push(path_str.into_owned());
+                modified = true;
+            }
+        }
+        for path in gpu_baseline_read_write_paths() {
+            let path_str = path.to_string_lossy();
+            if !fs
+                .read_write
+                .iter()
+                .any(|p| p.as_str() == path_str.as_ref())
+            {
+                if !path.exists() {
+                    debug!(
+                        path = %path.display(),
+                        "GPU baseline read-write path does not exist, skipping enrichment"
+                    );
+                    continue;
+                }
+                fs.read_write.push(path_str.into_owned());
+                modified = true;
+            }
+        }
+    }
+
     if modified {
         info!("Enriched policy with baseline filesystem paths for proxy mode");
     }
@@ -982,6 +1106,37 @@ fn enrich_sandbox_baseline_paths(policy: &mut SandboxPolicy) {
         }
     }
 
+    if has_gpu_devices() {
+        for p in gpu_baseline_read_only_paths() {
+            if !policy.filesystem.read_only.contains(&p)
+                && !policy.filesystem.read_write.contains(&p)
+            {
+                if !p.exists() {
+                    debug!(
+                        path = %p.display(),
+                        "GPU baseline read-only path does not exist, skipping enrichment"
+                    );
+                    continue;
+                }
+                policy.filesystem.read_only.push(p);
+                modified = true;
+            }
+        }
+        for p in gpu_baseline_read_write_paths() {
+            if !policy.filesystem.read_write.contains(&p) {
+                if !p.exists() {
+                    debug!(
+                        path = %p.display(),
+                        "GPU baseline read-write path does not exist, skipping enrichment"
+                    );
+                    continue;
+                }
+                policy.filesystem.read_write.push(p);
+                modified = true;
+            }
+        }
+    }
+
     if modified {
         info!("Enriched policy with baseline filesystem paths for proxy mode");
     }