diff --git a/src/remote/proxyCommandRetry.ts b/src/remote/proxyCommandRetry.ts new file mode 100644 index 00000000..83ecce72 --- /dev/null +++ b/src/remote/proxyCommandRetry.ts @@ -0,0 +1,71 @@ +import * as fs from "node:fs/promises"; +import * as path from "node:path"; + +import { renameWithRetry, tempFilePath } from "../util"; + +import type { Logger } from "../logging/logger"; + +/** + * POSIX shell script that wraps `coder ssh` with retry logic for the SSH + * ProxyCommand. After sleep/wake, DNS failures cause `coder ssh` to exit + * instantly, producing unparsable output that the Remote SSH extension + * treats as a permanent error ("Reload Window"). Retrying with a delay + * allows DNS to recover, including across system suspend/resume. + * + * Only retries when the command exits quickly (< CODER_RETRY_MIN_RUNTIME). + */ +const RETRY_SCRIPT = `#!/bin/sh +# Coder SSH ProxyCommand retry wrapper. +# Written by the Coder VS Code extension; do not edit. +max_retries=\${CODER_RETRY_MAX_RETRIES:-10} +retry_sleep=\${CODER_RETRY_SLEEP:-5} +min_runtime=\${CODER_RETRY_MIN_RUNTIME:-10} +n=0 +while [ $n -lt $max_retries ]; do + start=$(date +%s) + "$@" + rc=$? + elapsed=$(($(date +%s) - start)) + [ $elapsed -ge $min_runtime ] && exit $rc + [ $rc -eq 0 ] && exit 0 + n=$((n + 1)) + echo "coder-retry: attempt $n/$max_retries failed (rc=$rc, elapsed=\${elapsed}s)" >&2 + [ $n -lt $max_retries ] && sleep $retry_sleep +done +exit "$rc" +`; + +const SCRIPT_NAME = "coder-ssh-retry.sh"; + +/** + * Ensure the retry wrapper script exists on disk and return its path. + */ +export async function ensureRetryScript( + baseDir: string, + logger: Logger, +): Promise { + await fs.mkdir(baseDir, { recursive: true }); + const scriptPath = path.join(baseDir, SCRIPT_NAME); + + // Atomic write: temp file + rename to avoid races between concurrent + // VS Code windows writing the same script simultaneously. + const tmpPath = tempFilePath(scriptPath, "tmp"); + await fs.writeFile(tmpPath, RETRY_SCRIPT, { mode: 0o755 }); + try { + await renameWithRetry( + (src, dest) => fs.rename(src, dest), + tmpPath, + scriptPath, + ); + } catch (error) { + await fs.unlink(tmpPath).catch((unlinkErr: NodeJS.ErrnoException) => { + if (unlinkErr.code !== "ENOENT") { + logger.warn("Failed to clean up temp retry script", tmpPath, unlinkErr); + } + }); + throw new Error(`Failed to write retry script to ${scriptPath}`, { + cause: error, + }); + } + return scriptPath; +} diff --git a/src/remote/remote.ts b/src/remote/remote.ts index e482725a..06d5d31c 100644 --- a/src/remote/remote.ts +++ b/src/remote/remote.ts @@ -51,6 +51,7 @@ import { import { vscodeProposed } from "../vscodeProposed"; import { WorkspaceMonitor } from "../workspace/workspaceMonitor"; +import { ensureRetryScript } from "./proxyCommandRetry"; import { SshConfig, type SSHValues, @@ -660,8 +661,8 @@ export class Remote { /** * Builds the ProxyCommand for SSH connections to Coder workspaces. - * Uses `coder ssh` for modern deployments with wildcard support, - * or falls back to `coder vscodessh` for older deployments. + * On macOS/Linux, wraps with a retry script to survive transient DNS + * failures after sleep/wake (see proxyCommandRetry.ts). */ private async buildProxyCommand( binaryPath: string, @@ -670,6 +671,41 @@ export class Remote { logDir: string, useWildcardSSH: boolean, cliAuth: CliAuth, + ): Promise { + const coderCommand = await this.buildCoderCommand( + binaryPath, + label, + hostPrefix, + logDir, + useWildcardSSH, + cliAuth, + ); + + if (os.platform() === "win32") { + return coderCommand; + } + try { + const retryScript = await ensureRetryScript( + this.pathResolver.getGlobalConfigDir(""), + this.logger, + ); + return `${escapeCommandArg(retryScript)} ${coderCommand}`; + } catch (error) { + this.logger.warn("Failed to write retry wrapper, skipping", error); + return coderCommand; + } + } + + /** + * Builds the raw `coder ssh` or `coder vscodessh` command string. + */ + private async buildCoderCommand( + binaryPath: string, + label: string, + hostPrefix: string, + logDir: string, + useWildcardSSH: boolean, + cliAuth: CliAuth, ): Promise { const vscodeConfig = vscode.workspace.getConfiguration(); @@ -678,8 +714,7 @@ export class Remote { const logArgs = await this.getLogArgs(logDir); if (useWildcardSSH) { - // User SSH flags are included first; internally-managed flags - // are appended last so they take precedence. + // User SSH flags first; internal flags last so they take precedence. const userSshFlags = getSshFlags(vscodeConfig); // Make sure to update the `coder.sshFlags` description if we add more internal flags here! const internalFlags = [ @@ -695,28 +730,28 @@ export class Remote { const allFlags = [...userSshFlags, ...internalFlags]; return `${escapedBinaryPath} ${globalConfig.join(" ")} ssh ${allFlags.join(" ")}`; - } else { - const networkInfoDir = escapeCommandArg( - this.pathResolver.getNetworkInfoPath(), - ); - const sessionTokenFile = escapeCommandArg( - this.pathResolver.getSessionTokenPath(label), - ); - const urlFile = escapeCommandArg(this.pathResolver.getUrlPath(label)); + } - const sshFlags = [ - "--network-info-dir", - networkInfoDir, - ...logArgs, - "--session-token-file", - sessionTokenFile, - "--url-file", - urlFile, - "%h", - ]; + const networkInfoDir = escapeCommandArg( + this.pathResolver.getNetworkInfoPath(), + ); + const sessionTokenFile = escapeCommandArg( + this.pathResolver.getSessionTokenPath(label), + ); + const urlFile = escapeCommandArg(this.pathResolver.getUrlPath(label)); + + const sshFlags = [ + "--network-info-dir", + networkInfoDir, + ...logArgs, + "--session-token-file", + sessionTokenFile, + "--url-file", + urlFile, + "%h", + ]; - return `${escapedBinaryPath} ${globalConfig.join(" ")} vscodessh ${sshFlags.join(" ")}`; - } + return `${escapedBinaryPath} ${globalConfig.join(" ")} vscodessh ${sshFlags.join(" ")}`; } /** diff --git a/test/unit/remote/proxyCommandRetry.test.ts b/test/unit/remote/proxyCommandRetry.test.ts new file mode 100644 index 00000000..bc28350c --- /dev/null +++ b/test/unit/remote/proxyCommandRetry.test.ts @@ -0,0 +1,109 @@ +import { execFile } from "node:child_process"; +import * as fs from "node:fs/promises"; +import * as os from "node:os"; +import * as path from "node:path"; +import { promisify } from "node:util"; +import { afterEach, beforeEach, describe, expect, it } from "vitest"; + +import { ensureRetryScript } from "@/remote/proxyCommandRetry"; + +import { createMockLogger } from "../../mocks/testHelpers"; +import { isWindows } from "../../utils/platform"; + +const execFileAsync = promisify(execFile); + +/** Run the retry script with fast defaults (0s sleep). */ +function run( + script: string, + args: string[], + env: Record = {}, + timeout?: number, +) { + return execFileAsync(script, args, { + timeout, + env: { + ...process.env, + CODER_RETRY_SLEEP: "0", + CODER_RETRY_MAX_RETRIES: "10", + CODER_RETRY_MIN_RUNTIME: "10", + ...env, + }, + }); +} + +describe.skipIf(isWindows())("proxyCommandRetry", () => { + let tmpDir: string; + let script: string; + + beforeEach(async () => { + tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "coder-retry-test-")); + script = await ensureRetryScript(tmpDir, createMockLogger()); + }); + + afterEach(async () => { + await fs.rm(tmpDir, { recursive: true, force: true }); + }); + + it("passes through on success", async () => { + const { stdout } = await run(script, ["echo", "hello"]); + expect(stdout.trim()).toBe("hello"); + }); + + it("retries on quick failure then succeeds", async () => { + // Fails until marker file exists, then succeeds. + const marker = path.join(tmpDir, "marker"); + const helper = path.join(tmpDir, "h.sh"); + await fs.writeFile( + helper, + `#!/bin/sh\n[ -f "${marker}" ] && echo ok && exit 0\ntouch "${marker}"\nexit 1\n`, + { mode: 0o755 }, + ); + + const { stdout } = await run(script, [helper]); + expect(stdout.trim()).toBe("ok"); + }); + + it("gives up after max retries and logs each attempt", async () => { + try { + await run(script, ["sh", "-c", "exit 42"], { + CODER_RETRY_MAX_RETRIES: "3", + }); + expect.fail("should have thrown"); + } catch (err: unknown) { + const e = err as { code: number; stderr: string }; + expect(e.code).toBe(42); + expect(e.stderr).toContain("attempt 1/3 failed"); + expect(e.stderr).toContain("attempt 3/3 failed"); + } + }); + + it("skips retry when command ran longer than min runtime", async () => { + const marker = path.join(tmpDir, "ran"); + const helper = path.join(tmpDir, "slow.sh"); + await fs.writeFile( + helper, + `#!/bin/sh\n[ -f "${marker}" ] && exit 99\ntouch "${marker}"\nsleep 2\nexit 1\n`, + { mode: 0o755 }, + ); + + try { + await run(script, [helper], { CODER_RETRY_MIN_RUNTIME: "1" }, 10000); + expect.fail("should have thrown"); + } catch (err: unknown) { + // Exit code 1 (not 99) proves it ran once and didn't retry. + expect((err as { code: number }).code).toBe(1); + } + }); + + it("preserves arguments with spaces", async () => { + const { stdout } = await run(script, [ + "sh", + "-c", + 'echo "$1 $2"', + "--", + "hello world", + "it's fine", + ]); + expect(stdout.trim()).toBe("hello world it's fine"); + }); +});