diff --git a/crates/openshell-bootstrap/src/errors.rs b/crates/openshell-bootstrap/src/errors.rs index b487c94a..9e385c68 100644 --- a/crates/openshell-bootstrap/src/errors.rs +++ b/crates/openshell-bootstrap/src/errors.rs @@ -169,6 +169,21 @@ const FAILURE_PATTERNS: &[FailurePattern] = &[ match_mode: MatchMode::Any, diagnose: diagnose_docker_not_running, }, + // CDI specs missing — Docker daemon has CDI configured but no spec files exist + // or the requested device ID (nvidia.com/gpu=all) is not in any spec. + // Matches errors from Docker 25+ and containerd CDI injection paths. + FailurePattern { + matchers: &[ + "CDI device not found", + "unknown CDI device", + "failed to inject CDI devices", + "no CDI devices found", + "CDI device injection failed", + "unresolvable CDI devices", + ], + match_mode: MatchMode::Any, + diagnose: diagnose_cdi_specs_missing, + }, ]; fn diagnose_corrupted_state(gateway_name: &str) -> GatewayFailureDiagnosis { @@ -396,6 +411,29 @@ fn diagnose_certificate_issue(gateway_name: &str) -> GatewayFailureDiagnosis { } } +fn diagnose_cdi_specs_missing(_gateway_name: &str) -> GatewayFailureDiagnosis { + GatewayFailureDiagnosis { + summary: "CDI specs not found on host".to_string(), + explanation: "GPU passthrough via CDI was selected (your Docker daemon has CDI spec \ + directories configured) but no CDI device specs were found on the host. \ + Specs must be pre-generated before OpenShell can inject the GPU into the \ + cluster container." 
+ .to_string(), + recovery_steps: vec![ + RecoveryStep::with_command( + "Generate CDI specs on the host (nvidia-ctk creates /etc/cdi/ if it does not exist)", + "sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml", + ), + RecoveryStep::with_command( + "Verify the specs were generated and include nvidia.com/gpu entries", + "nvidia-ctk cdi list", + ), + RecoveryStep::new("Then retry: openshell gateway start --gpu"), + ], + retryable: false, + } +} + fn diagnose_docker_not_running(_gateway_name: &str) -> GatewayFailureDiagnosis { GatewayFailureDiagnosis { summary: "Docker is not running".to_string(), @@ -925,4 +963,76 @@ mod tests { "commands should include gateway name, got: {all_commands}" ); } + + #[test] + fn test_diagnose_cdi_device_not_found() { + let diagnosis = diagnose_failure( + "test", + "could not run container: CDI device not found: nvidia.com/gpu=all", + None, + ); + assert!(diagnosis.is_some()); + let d = diagnosis.unwrap(); + assert!( + d.summary.contains("CDI"), + "expected CDI diagnosis, got: {}", + d.summary + ); + assert!(!d.retryable); + } + + #[test] + fn test_diagnose_cdi_injection_failed_unresolvable() { + // Exact error observed from Docker 500 response + let diagnosis = diagnose_failure( + "test", + "Docker responded with status code 500: CDI device injection failed: unresolvable CDI devices nvidia.com/gpu=all", + None, + ); + assert!(diagnosis.is_some()); + let d = diagnosis.unwrap(); + assert!( + d.summary.contains("CDI"), + "expected CDI diagnosis, got: {}", + d.summary + ); + assert!(!d.retryable); + } + + #[test] + fn test_diagnose_unknown_cdi_device() { + // containerd error path + let diagnosis = diagnose_failure( + "test", + "unknown CDI device requested: nvidia.com/gpu=all", + None, + ); + assert!(diagnosis.is_some()); + let d = diagnosis.unwrap(); + assert!( + d.summary.contains("CDI"), + "expected CDI diagnosis, got: {}", + d.summary + ); + } + + #[test] + fn test_diagnose_cdi_recovery_mentions_nvidia_ctk() { + let d = 
diagnose_failure("test", "CDI device not found", None) .expect("should match CDI pattern"); + let all_steps: String = d + .recovery_steps + .iter() + .map(|s| format!("{} {}", s.description, s.command.as_deref().unwrap_or(""))) + .collect::<Vec<_>>() + .join("\n"); + assert!( + all_steps.contains("nvidia-ctk cdi generate"), + "recovery steps should mention nvidia-ctk cdi generate, got: {all_steps}" + ); + assert!( + all_steps.contains("/etc/cdi/"), + "recovery steps should mention /etc/cdi/, got: {all_steps}" + ); + } } diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 14963244..6d676805 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -877,13 +877,97 @@ pub(crate) fn spawn_route_refresh( /// Minimum read-only paths required for a proxy-mode sandbox child process to /// function: dynamic linker, shared libraries, DNS resolution, CA certs, -/// Python venv, and openshell logs. -const PROXY_BASELINE_READ_ONLY: &[&str] = &["/usr", "/lib", "/etc", "/app", "/var/log"]; +/// Python venv, openshell logs, process info, and random bytes. +/// +/// `/proc` and `/dev/urandom` are included here for the same reasons they +/// appear in `restrictive_default_policy()`: virtually every process needs +/// them. Before the Landlock per-path fix (#677) these were effectively free +/// because a missing path silently disabled the entire ruleset; now they must +/// be explicit. +const PROXY_BASELINE_READ_ONLY: &[&str] = &[ + "/usr", + "/lib", + "/etc", + "/app", + "/var/log", + "/proc", + "/dev/urandom", +]; /// Minimum read-write paths required for a proxy-mode sandbox child process: /// user working directory and temporary files. const PROXY_BASELINE_READ_WRITE: &[&str] = &["/sandbox", "/tmp"]; +/// Fixed read-only paths required when a GPU is present. +/// +/// `/run/nvidia-persistenced`: NVML tries to connect to the persistenced +/// socket at init time. 
If the directory exists but landlock denies traversal +/// (EACCES vs ECONNREFUSED), NVML returns NVML_ERROR_INSUFFICIENT_PERMISSIONS +/// even though the daemon is optional. Only read/traversal access is needed — +/// NVML connects to the existing socket but does not create or modify files. +const GPU_BASELINE_READ_ONLY_FIXED: &[&str] = &["/run/nvidia-persistenced"]; + +/// Fixed read-write paths required when a GPU is present. +/// +/// `/dev/nvidiactl`, `/dev/nvidia-uvm`, `/dev/nvidia-uvm-tools`, +/// `/dev/nvidia-modeset`: control and UVM devices injected by CDI. +/// Landlock READ_FILE/WRITE_FILE restricts open(2) on device files even +/// when DAC permissions would otherwise allow it. Device nodes need +/// read-write because NVML/CUDA opens them with O_RDWR. +/// +/// `/proc`: CUDA writes to `/proc/<pid>/task/<tid>/comm` during `cuInit()` +/// to set thread names. Without write access, `cuInit()` returns error 304 +/// (`cudaErrorOperatingSystem`). We must use `/proc` (not `/proc/self/task`) +/// because Landlock rules bind to inodes: `/proc/self/task` in the pre_exec +/// hook resolves to the shell's PID, but child processes (python, etc.) +/// have different PIDs and thus different procfs inodes. Security impact +/// is limited — writable proc entries (`oom_score_adj`, etc.) are already +/// kernel-restricted for non-root users; Landlock is defense-in-depth. +/// +/// Per-GPU device files (`/dev/nvidia0`, `/dev/nvidia1`, …) are enumerated +/// at runtime by `gpu_baseline_read_write_paths()` since the count varies. +const GPU_BASELINE_READ_WRITE_FIXED: &[&str] = &[ + "/dev/nvidiactl", + "/dev/nvidia-uvm", + "/dev/nvidia-uvm-tools", + "/dev/nvidia-modeset", + "/proc", +]; + +/// Returns true if GPU devices are present in the container. +fn has_gpu_devices() -> bool { + std::path::Path::new("/dev/nvidiactl").exists() +} + +/// Collect all GPU read-only paths (currently just the persistenced directory). 
+fn gpu_baseline_read_only_paths() -> Vec<std::path::PathBuf> { + GPU_BASELINE_READ_ONLY_FIXED + .iter() + .map(|p| std::path::PathBuf::from(p)) + .collect() +} + +/// Collect all GPU read-write paths: fixed devices + per-GPU `/dev/nvidiaX`. +fn gpu_baseline_read_write_paths() -> Vec<std::path::PathBuf> { + let mut paths: Vec<std::path::PathBuf> = GPU_BASELINE_READ_WRITE_FIXED + .iter() + .map(|p| std::path::PathBuf::from(p)) + .collect(); + + // Enumerate per-GPU device nodes injected by CDI (nvidia0, nvidia1, …). + if let Ok(entries) = std::fs::read_dir("/dev") { + for entry in entries.flatten() { + let name = entry.file_name(); + let name = name.to_string_lossy(); + if name.starts_with("nvidia") && name[6..].chars().all(|c| c.is_ascii_digit()) { + paths.push(entry.path()); + } + } + } + + paths +} + /// Ensure a proto `SandboxPolicy` includes the baseline filesystem paths /// required for proxy-mode sandboxes. Paths are only added if missing; /// user-specified paths are never removed. @@ -935,6 +1019,46 @@ fn enrich_proto_baseline_paths(proto: &mut openshell_core::proto::SandboxPolicy) } } + if has_gpu_devices() { + for path in gpu_baseline_read_only_paths() { + let path_str = path.to_string_lossy(); + if !fs.read_only.iter().any(|p| p.as_str() == path_str.as_ref()) + && !fs + .read_write + .iter() + .any(|p| p.as_str() == path_str.as_ref()) + { + if !path.exists() { + debug!( + path = %path.display(), + "GPU baseline read-only path does not exist, skipping enrichment" + ); + continue; + } + fs.read_only.push(path_str.into_owned()); + modified = true; + } + } + for path in gpu_baseline_read_write_paths() { + let path_str = path.to_string_lossy(); + if !fs + .read_write + .iter() + .any(|p| p.as_str() == path_str.as_ref()) + { + if !path.exists() { + debug!( + path = %path.display(), + "GPU baseline read-write path does not exist, skipping enrichment" + ); + continue; + } + fs.read_write.push(path_str.into_owned()); + modified = true; + } + } + } + if modified { info!("Enriched policy with baseline filesystem paths for 
proxy mode"); } @@ -982,6 +1106,37 @@ fn enrich_sandbox_baseline_paths(policy: &mut SandboxPolicy) { } } + if has_gpu_devices() { + for p in gpu_baseline_read_only_paths() { + if !policy.filesystem.read_only.contains(&p) + && !policy.filesystem.read_write.contains(&p) + { + if !p.exists() { + debug!( + path = %p.display(), + "GPU baseline read-only path does not exist, skipping enrichment" + ); + continue; + } + policy.filesystem.read_only.push(p); + modified = true; + } + } + for p in gpu_baseline_read_write_paths() { + if !policy.filesystem.read_write.contains(&p) { + if !p.exists() { + debug!( + path = %p.display(), + "GPU baseline read-write path does not exist, skipping enrichment" + ); + continue; + } + policy.filesystem.read_write.push(p); + modified = true; + } + } + } + if modified { info!("Enriched policy with baseline filesystem paths for proxy mode"); }