Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 110 additions & 0 deletions crates/openshell-bootstrap/src/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,21 @@ const FAILURE_PATTERNS: &[FailurePattern] = &[
match_mode: MatchMode::Any,
diagnose: diagnose_docker_not_running,
},
// CDI specs missing — Docker daemon has CDI configured but no spec files exist
// or the requested device ID (nvidia.com/gpu=all) is not in any spec.
// Matches errors from Docker 25+ and containerd CDI injection paths.
FailurePattern {
matchers: &[
"CDI device not found",
"unknown CDI device",
"failed to inject CDI devices",
"no CDI devices found",
"CDI device injection failed",
"unresolvable CDI devices",
],
match_mode: MatchMode::Any,
diagnose: diagnose_cdi_specs_missing,
},
];

fn diagnose_corrupted_state(gateway_name: &str) -> GatewayFailureDiagnosis {
Expand Down Expand Up @@ -396,6 +411,29 @@ fn diagnose_certificate_issue(gateway_name: &str) -> GatewayFailureDiagnosis {
}
}

fn diagnose_cdi_specs_missing(_gateway_name: &str) -> GatewayFailureDiagnosis {
GatewayFailureDiagnosis {
summary: "CDI specs not found on host".to_string(),
explanation: "GPU passthrough via CDI was selected (your Docker daemon has CDI spec \
directories configured) but no CDI device specs were found on the host. \
Specs must be pre-generated before OpenShell can inject the GPU into the \
cluster container."
.to_string(),
recovery_steps: vec![
RecoveryStep::with_command(
"Generate CDI specs on the host (nvidia-ctk creates /etc/cdi/ if it does not exist)",
"sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml",
),
RecoveryStep::with_command(
"Verify the specs were generated and include nvidia.com/gpu entries",
"nvidia-ctk cdi list",
),
RecoveryStep::new("Then retry: openshell gateway start --gpu"),
],
retryable: false,
}
}

fn diagnose_docker_not_running(_gateway_name: &str) -> GatewayFailureDiagnosis {
GatewayFailureDiagnosis {
summary: "Docker is not running".to_string(),
Expand Down Expand Up @@ -925,4 +963,76 @@ mod tests {
"commands should include gateway name, got: {all_commands}"
);
}

#[test]
fn test_diagnose_cdi_device_not_found() {
let diagnosis = diagnose_failure(
"test",
"could not run container: CDI device not found: nvidia.com/gpu=all",
None,
);
assert!(diagnosis.is_some());
let d = diagnosis.unwrap();
assert!(
d.summary.contains("CDI"),
"expected CDI diagnosis, got: {}",
d.summary
);
assert!(!d.retryable);
}

#[test]
fn test_diagnose_cdi_injection_failed_unresolvable() {
// Exact error observed from Docker 500 response
let diagnosis = diagnose_failure(
"test",
"Docker responded with status code 500: CDI device injection failed: unresolvable CDI devices nvidia.com/gpu=all",
None,
);
assert!(diagnosis.is_some());
let d = diagnosis.unwrap();
assert!(
d.summary.contains("CDI"),
"expected CDI diagnosis, got: {}",
d.summary
);
assert!(!d.retryable);
}

#[test]
fn test_diagnose_unknown_cdi_device() {
// containerd error path
let diagnosis = diagnose_failure(
"test",
"unknown CDI device requested: nvidia.com/gpu=all",
None,
);
assert!(diagnosis.is_some());
let d = diagnosis.unwrap();
assert!(
d.summary.contains("CDI"),
"expected CDI diagnosis, got: {}",
d.summary
);
}

#[test]
fn test_diagnose_cdi_recovery_mentions_nvidia_ctk() {
let d = diagnose_failure("test", "CDI device not found", None)
.expect("should match CDI pattern");
let all_steps: String = d
.recovery_steps
.iter()
.map(|s| format!("{} {}", s.description, s.command.as_deref().unwrap_or("")))
.collect::<Vec<_>>()
.join("\n");
assert!(
all_steps.contains("nvidia-ctk cdi generate"),
"recovery steps should mention nvidia-ctk cdi generate, got: {all_steps}"
);
assert!(
all_steps.contains("/etc/cdi/"),
"recovery steps should mention /etc/cdi/, got: {all_steps}"
);
}
}
159 changes: 157 additions & 2 deletions crates/openshell-sandbox/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -877,13 +877,97 @@ pub(crate) fn spawn_route_refresh(

/// Minimum read-only paths required for a proxy-mode sandbox child process to
/// function: dynamic linker, shared libraries, DNS resolution, CA certs,
/// Python venv, and openshell logs.
const PROXY_BASELINE_READ_ONLY: &[&str] = &["/usr", "/lib", "/etc", "/app", "/var/log"];
/// Python venv, openshell logs, process info, and random bytes.
///
/// `/proc` and `/dev/urandom` are included here for the same reasons they
/// appear in `restrictive_default_policy()`: virtually every process needs
/// them. Before the Landlock per-path fix (#677) these were effectively free
/// because a missing path silently disabled the entire ruleset; now they must
/// be explicit.
const PROXY_BASELINE_READ_ONLY: &[&str] = &[
"/usr",
"/lib",
"/etc",
"/app",
"/var/log",
"/proc",
"/dev/urandom",
Comment on lines +893 to +894
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

];

/// Minimum read-write paths required for a proxy-mode sandbox child process:
/// user working directory and temporary files.
const PROXY_BASELINE_READ_WRITE: &[&str] = &["/sandbox", "/tmp"];

/// Fixed read-only paths required when a GPU is present.
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This whole section is similar to the baseline stuff, but activated when the GPU passthrough is on.

This could benefit from the mechanism we will have for providers bringing in their policies, though this is different as it applies sandbox wide and is not limited by binaries.

///
/// `/run/nvidia-persistenced`: NVML tries to connect to the persistenced
/// socket at init time. If the directory exists but landlock denies traversal
/// (EACCES vs ECONNREFUSED), NVML returns NVML_ERROR_INSUFFICIENT_PERMISSIONS
/// even though the daemon is optional. Only read/traversal access is needed —
/// NVML connects to the existing socket but does not create or modify files.
const GPU_BASELINE_READ_ONLY_FIXED: &[&str] = &["/run/nvidia-persistenced"];

/// Fixed read-write paths required when a GPU is present.
///
/// `/dev/nvidiactl`, `/dev/nvidia-uvm`, `/dev/nvidia-uvm-tools`,
/// `/dev/nvidia-modeset`: control and UVM devices injected by CDI.
/// Landlock READ_FILE/WRITE_FILE restricts open(2) on device files even
/// when DAC permissions would otherwise allow it. Device nodes need
/// read-write because NVML/CUDA opens them with O_RDWR.
///
/// `/proc`: CUDA writes to `/proc/<pid>/task/<tid>/comm` during `cuInit()`
/// to set thread names. Without write access, `cuInit()` returns error 304
/// (`cudaErrorOperatingSystem`). We must use `/proc` (not `/proc/self/task`)
/// because Landlock rules bind to inodes: `/proc/self/task` in the pre_exec
/// hook resolves to the shell's PID, but child processes (python, etc.)
/// have different PIDs and thus different procfs inodes. Security impact
/// is limited — writable proc entries (`oom_score_adj`, etc.) are already
/// kernel-restricted for non-root users; Landlock is defense-in-depth.
///
/// Per-GPU device files (`/dev/nvidia0`, `/dev/nvidia1`, …) are enumerated
/// at runtime by `gpu_baseline_read_write_paths()` since the count varies.
const GPU_BASELINE_READ_WRITE_FIXED: &[&str] = &[
"/dev/nvidiactl",
"/dev/nvidia-uvm",
"/dev/nvidia-uvm-tools",
"/dev/nvidia-modeset",
"/proc",
];

/// Returns true if GPU devices are present in the container.
fn has_gpu_devices() -> bool {
std::path::Path::new("/dev/nvidiactl").exists()
}

/// Collect all GPU read-only paths (currently just the persistenced directory).
fn gpu_baseline_read_only_paths() -> Vec<std::path::PathBuf> {
GPU_BASELINE_READ_ONLY_FIXED
.iter()
.map(|p| std::path::PathBuf::from(p))
.collect()
}

/// Collect all GPU read-write paths: fixed devices + per-GPU `/dev/nvidiaX`.
fn gpu_baseline_read_write_paths() -> Vec<std::path::PathBuf> {
let mut paths: Vec<std::path::PathBuf> = GPU_BASELINE_READ_WRITE_FIXED
.iter()
.map(|p| std::path::PathBuf::from(p))
.collect();

// Enumerate per-GPU device nodes injected by CDI (nvidia0, nvidia1, …).
if let Ok(entries) = std::fs::read_dir("/dev") {
for entry in entries.flatten() {
let name = entry.file_name();
let name = name.to_string_lossy();
if name.starts_with("nvidia") && name[6..].chars().all(|c| c.is_ascii_digit()) {
paths.push(entry.path());
}
}
}

paths
}

/// Ensure a proto `SandboxPolicy` includes the baseline filesystem paths
/// required for proxy-mode sandboxes. Paths are only added if missing;
/// user-specified paths are never removed.
Expand Down Expand Up @@ -935,6 +1019,46 @@ fn enrich_proto_baseline_paths(proto: &mut openshell_core::proto::SandboxPolicy)
}
}

if has_gpu_devices() {
for path in gpu_baseline_read_only_paths() {
let path_str = path.to_string_lossy();
if !fs.read_only.iter().any(|p| p.as_str() == path_str.as_ref())
&& !fs
.read_write
.iter()
.any(|p| p.as_str() == path_str.as_ref())
{
if !path.exists() {
debug!(
path = %path.display(),
"GPU baseline read-only path does not exist, skipping enrichment"
);
continue;
}
fs.read_only.push(path_str.into_owned());
modified = true;
}
}
for path in gpu_baseline_read_write_paths() {
let path_str = path.to_string_lossy();
if !fs
.read_write
.iter()
.any(|p| p.as_str() == path_str.as_ref())
{
if !path.exists() {
debug!(
path = %path.display(),
"GPU baseline read-write path does not exist, skipping enrichment"
);
continue;
}
fs.read_write.push(path_str.into_owned());
modified = true;
}
}
}

if modified {
info!("Enriched policy with baseline filesystem paths for proxy mode");
}
Expand Down Expand Up @@ -982,6 +1106,37 @@ fn enrich_sandbox_baseline_paths(policy: &mut SandboxPolicy) {
}
}

if has_gpu_devices() {
for p in gpu_baseline_read_only_paths() {
if !policy.filesystem.read_only.contains(&p)
&& !policy.filesystem.read_write.contains(&p)
{
if !p.exists() {
debug!(
path = %p.display(),
"GPU baseline read-only path does not exist, skipping enrichment"
);
continue;
}
policy.filesystem.read_only.push(p);
modified = true;
}
}
for p in gpu_baseline_read_write_paths() {
if !policy.filesystem.read_write.contains(&p) {
if !p.exists() {
debug!(
path = %p.display(),
"GPU baseline read-write path does not exist, skipping enrichment"
);
continue;
}
policy.filesystem.read_write.push(p);
modified = true;
}
}
}

if modified {
info!("Enriched policy with baseline filesystem paths for proxy mode");
}
Expand Down
Loading