Unsupervisedcom · nhorton · Feb 12, 2026 · Feb 10, 2026 · Feb 10, 2026 · Feb 10, 2026
diff --git a/.mcp.json b/.mcp.json
@@ -5,7 +5,9 @@
       "args": [
         "serve",
         "--path",
-        "."
+        ".",
+        "--external-runner",
+        "claude"
       ]
     }
   }

diff --git a/src/deepwork/cli/serve.py b/src/deepwork/cli/serve.py
@@ -64,33 +64,47 @@ def _load_config(project_path: Path) -> dict:
     default=8000,
     help="Port for SSE transport (default: 8000)",
 )
+@click.option(
+    "--external-runner",
+    type=click.Choice(["claude"]),
+    default=None,
+    help="External runner for quality gate reviews. 'claude' uses Claude CLI subprocess. Default: None (agent self-review).",
+)
 def serve(
     path: Path,
     no_quality_gate: bool,
     transport: str,
     port: int,
+    external_runner: str | None,
 ) -> None:
     """Start the DeepWork MCP server.
 
     Exposes workflow management tools to AI agents via MCP protocol.
     By default uses stdio transport for local integration with Claude Code.
 
-    Quality gate is enabled by default and uses Claude Code to evaluate
-    step outputs against quality criteria.
+    Quality gate is enabled by default. Use --external-runner to specify
+    how quality reviews are executed:
+
+    \b
+    - No flag (default): Agent self-review via instructions file
+    - --external-runner claude: Claude CLI subprocess review
 
     Examples:
 
-        # Start server for current directory
+        # Start server for current directory (agent self-review)
         deepwork serve
 
+        # Start with Claude CLI as quality gate reviewer
+        deepwork serve --external-runner claude
+
         # Start with quality gate disabled
         deepwork serve --no-quality-gate
 
         # Start for a specific project
         deepwork serve --path /path/to/project
     """
     try:
-        _serve_mcp(path, not no_quality_gate, transport, port)
+        _serve_mcp(path, not no_quality_gate, transport, port, external_runner)
     except ServeError as e:
         console.print(f"[red]Error:[/red] {e}")
         raise click.Abort() from e
@@ -104,6 +118,7 @@ def _serve_mcp(
     enable_quality_gate: bool,
     transport: str,
     port: int,
+    external_runner: str | None = None,
 ) -> None:
     """Start the MCP server.
 
@@ -112,6 +127,8 @@ def _serve_mcp(
         enable_quality_gate: Whether to enable quality gate evaluation
         transport: Transport protocol (stdio or sse)
         port: Port for SSE transport
+        external_runner: External runner for quality gate reviews.
+            "claude" uses Claude CLI subprocess. None means agent self-review.
 
     Raises:
         ServeError: If server fails to start
@@ -125,6 +142,7 @@ def _serve_mcp(
     server = create_server(
         project_root=project_path,
         enable_quality_gate=enable_quality_gate,
+        external_runner=external_runner,
     )
 
     if transport == "stdio":

diff --git a/src/deepwork/core/adapters.py b/src/deepwork/core/adapters.py
@@ -562,9 +562,10 @@ def register_mcp_server(self, project_path: Path) -> bool:
 
         # Build the new MCP server config
         # Assume deepwork is available in PATH
+        # Include --external-runner claude so quality gate reviews use Claude CLI subprocess
         new_server_config = {
             "command": "deepwork",
-            "args": ["serve", "--path", "."],
+            "args": ["serve", "--path", ".", "--external-runner", "claude"],
         }
 
         # Check if already registered with same config

diff --git a/src/deepwork/mcp/quality_gate.py b/src/deepwork/mcp/quality_gate.py
@@ -58,16 +58,32 @@ class QualityGate:
     """Evaluates step outputs against quality criteria.
 
     Uses ClaudeCLI to invoke a review agent that evaluates outputs
-    and returns structured feedback.
+    and returns structured feedback. Can also build review instructions
+    files for agent self-review when no external runner is configured.
     """
 
-    def __init__(self, cli: ClaudeCLI | None = None):
+    # Default maximum number of files to include inline in the review payload.
+    # Beyond this threshold, only file paths are listed.
+    DEFAULT_MAX_INLINE_FILES = 5
+
+    def __init__(
+        self,
+        cli: ClaudeCLI | None = None,
+        max_inline_files: int | None = None,
+    ):
         """Initialize quality gate.
 
         Args:
-            cli: ClaudeCLI instance. If not provided, a default one is created.
+            cli: ClaudeCLI instance. If None, evaluate() raises QualityGateError
+                when it needs the runner; instruction-building methods still work.
+            max_inline_files: Maximum number of files to embed inline in
+                review payloads. Beyond this, only file paths are listed.
+                Defaults to DEFAULT_MAX_INLINE_FILES (5).
         """
-        self._cli = cli or ClaudeCLI()
+        self._cli = cli
+        self.max_inline_files = (
+            max_inline_files if max_inline_files is not None else self.DEFAULT_MAX_INLINE_FILES
+        )
 
     def _build_instructions(
         self,
@@ -202,7 +218,8 @@ async def _read_file_sections(
     # WARNING: REVIEW PERFORMANCE IS SENSITIVE TO PAYLOAD SIZE
     #
     # The payload builder below sends file contents to the review agent (Claude
-    # CLI subprocess). Reviews can get REALLY SLOW if the content gets too big:
+    # CLI subprocess or self-review file). Reviews can get REALLY SLOW if the
+    # content gets too big:
     #
     # - Each file's full content is read and embedded in the prompt
     # - The review agent must process ALL of this content to evaluate criteria
@@ -211,18 +228,18 @@ async def _read_file_sections(
     # - Per-file reviews (run_each: <output_name> with type: files) multiply
     #   the problem — each file gets its own review subprocess
     #
-    # To mitigate this, when more than MAX_INLINE_FILES files are present,
-    # the payload switches to a path-listing mode that only shows file paths
-    # instead of dumping all contents inline. The reviewer can then use its
-    # own tools to read specific files as needed.
+    # To mitigate this, when more than self.max_inline_files files are
+    # present, the payload switches to a path-listing mode that only shows
+    # file paths instead of dumping all contents inline. The reviewer can
+    # then use its own tools to read specific files as needed.
+    #
+    # max_inline_files is configurable per instance:
+    #   - external_runner="claude": 5 (embed small sets, list large ones)
+    #   - external_runner=None (self-review): 0 (always list paths)
     #
     # If you're changing the payload builder, keep payload size in mind.
     # =========================================================================
 
-    # Maximum number of files to include inline in the review payload.
-    # Beyond this threshold, only file paths are listed.
-    MAX_INLINE_FILES = 5
-
     @staticmethod
     def _build_path_listing(file_paths: dict[str, str | list[str]]) -> list[str]:
         """Build a path-only listing for large file sets.
@@ -262,7 +279,7 @@ async def _build_payload(
         parts: list[str] = []
         total_files = len(self._flatten_output_paths(outputs))
 
-        if total_files > self.MAX_INLINE_FILES:
+        if total_files > self.max_inline_files:
             # Too many files — list paths only so the reviewer reads selectively
             path_lines = self._build_path_listing(outputs)
             parts.append(f"{SECTION_SEPARATOR} BEGIN OUTPUTS {SECTION_SEPARATOR}")
@@ -318,6 +335,112 @@ def _parse_result(self, data: dict[str, Any]) -> QualityGateResult:
                 f"Failed to interpret quality gate result: {e}\nData was: {data}"
             ) from e
 
+    async def build_review_instructions_file(
+        self,
+        reviews: list[dict[str, Any]],
+        outputs: dict[str, str | list[str]],
+        output_specs: dict[str, str],
+        project_root: Path,
+        notes: str | None = None,
+    ) -> str:
+        """Build complete review instructions content for writing to a file.
+
+        Used in self-review mode (no external runner) to produce the text of a
+        file that a subagent can read and follow to evaluate quality criteria.
+
+        Args:
+            reviews: List of review dicts with run_each, quality_criteria,
+                and optional additional_review_guidance
+            outputs: Map of output names to file path(s)
+            output_specs: Map of output names to their type ("file" or "files")
+            project_root: Project root path
+            notes: Optional notes from the agent about work done
+
+        Returns:
+            Complete review instructions as a string
+        """
+        parts: list[str] = []
+
+        parts.append("# Quality Review Instructions")
+        parts.append("")
+        parts.append(
+            "You are an editor responsible for reviewing the outputs of a workflow step. "
+            "Your job is to evaluate whether the outputs meet the specified quality criteria."
+        )
+        parts.append("")
+
+        # Build outputs listing (uses self.max_inline_files to decide inline vs path-only)
+        payload = await self._build_payload(outputs, project_root)
+        parts.append(payload)
+        parts.append("")
+
+        # Build review sections
+        for i, review in enumerate(reviews, 1):
+            run_each = review["run_each"]
+            quality_criteria = review["quality_criteria"]
+            guidance = review.get("additional_review_guidance")
+
+            if len(reviews) > 1:
+                scope = "all outputs together" if run_each == "step" else f"output '{run_each}'"
+                parts.append(f"## Review {i} (scope: {scope})")
+            else:
+                parts.append("## Criteria to Evaluate")
+            parts.append("")
+
+            criteria_list = "\n".join(
+                f"- **{name}**: {question}" for name, question in quality_criteria.items()
+            )
+            parts.append(criteria_list)
+            parts.append("")
+
+            if run_each != "step" and run_each in outputs:
+                output_type = output_specs.get(run_each, "file")
+                output_value = outputs[run_each]
+                if output_type == "files" and isinstance(output_value, list):
+                    parts.append(
+                        f"Evaluate the above criteria for **each file** in output '{run_each}':"
+                    )
+                    for fp in output_value:
+                        parts.append(f"- {fp}")
+                    parts.append("")
+
+            if guidance:
+                parts.append("### Additional Context")
+                parts.append("")
+                parts.append(guidance)
+                parts.append("")
+
+        if notes:
+            parts.append("## Author Notes")
+            parts.append("")
+            parts.append(notes)
+            parts.append("")
+
+        parts.append("## Guidelines")
+        parts.append("")
+        parts.append("- Be strict but fair")
+        parts.append(
+            "- Apply criteria pragmatically. If a criterion is not applicable "
+            "to this step's purpose, pass it."
+        )
+        parts.append("- Only mark a criterion as passed if it is clearly met or not applicable.")
+        parts.append("- Provide specific, actionable feedback for failed criteria.")
+        parts.append(
+            "- The overall review should PASS only if ALL criteria across all reviews pass."
+        )
+        parts.append("")
+        parts.append("## Your Task")
+        parts.append("")
+        parts.append("1. Read each output file listed above")
+        parts.append("2. Evaluate every criterion in every review section")
+        parts.append("3. For each criterion, report **PASS** or **FAIL** with specific feedback")
+        parts.append("4. At the end, clearly state the overall result: **PASSED** or **FAILED**")
+        parts.append(
+            "5. If any criteria failed, provide clear actionable feedback on what needs to change"
+        )
+
+        return "\n".join(parts)
+
     @staticmethod
     def compute_timeout(file_count: int) -> int:
         """Compute dynamic timeout based on number of files.
@@ -368,6 +491,12 @@ async def evaluate(
                 criteria_results=[],
             )
 
+        if self._cli is None:
+            raise QualityGateError(
+                "Cannot evaluate quality gate without a CLI runner. "
+                "Use build_review_instructions_file() for self-review mode."
+            )
+
         instructions = self._build_instructions(
             quality_criteria,
             notes=notes,

diff --git a/src/deepwork/mcp/server.py b/src/deepwork/mcp/server.py
@@ -38,6 +38,7 @@ def create_server(
     enable_quality_gate: bool = True,
     quality_gate_timeout: int = 120,
     quality_gate_max_attempts: int = 3,
+    external_runner: str | None = None,
 ) -> FastMCP:
     """Create and configure the MCP server.
 
@@ -46,6 +47,9 @@ def create_server(
         enable_quality_gate: Whether to enable quality gate evaluation (default: True)
         quality_gate_timeout: Timeout in seconds for quality gate (default: 120)
         quality_gate_max_attempts: Max attempts before failing quality gate (default: 3)
+        external_runner: External runner for quality gate reviews.
+            "claude" uses Claude CLI subprocess. None means agent self-review
+            via instructions file. (default: None)
 
     Returns:
         Configured FastMCP server instance
@@ -57,14 +61,20 @@ def create_server(
 
     quality_gate: QualityGate | None = None
     if enable_quality_gate:
-        cli = ClaudeCLI(timeout=quality_gate_timeout)
-        quality_gate = QualityGate(cli=cli)
+        if external_runner == "claude":
+            # Claude CLI subprocess mode: embed up to 5 files inline
+            cli = ClaudeCLI(timeout=quality_gate_timeout)
+            quality_gate = QualityGate(cli=cli, max_inline_files=5)
+        else:
+            # Self-review mode: no CLI, always reference files by path (0 inline)
+            quality_gate = QualityGate(cli=None, max_inline_files=0)
 
     tools = WorkflowTools(
         project_root=project_path,
         state_manager=state_manager,
         quality_gate=quality_gate,
         max_quality_attempts=quality_gate_max_attempts,
+        external_runner=external_runner,
     )
 
     # Create MCP server