diff --git a/.mcp.json b/.mcp.json index 79f9ddc8..c72d476e 100644 --- a/.mcp.json +++ b/.mcp.json @@ -5,7 +5,9 @@ "args": [ "serve", "--path", - "." + ".", + "--external-runner", + "claude" ] } } diff --git a/src/deepwork/cli/serve.py b/src/deepwork/cli/serve.py index 5625056a..91db64fa 100644 --- a/src/deepwork/cli/serve.py +++ b/src/deepwork/cli/serve.py @@ -64,25 +64,39 @@ def _load_config(project_path: Path) -> dict: default=8000, help="Port for SSE transport (default: 8000)", ) +@click.option( + "--external-runner", + type=click.Choice(["claude"]), + default=None, + help="External runner for quality gate reviews. 'claude' uses Claude CLI subprocess. Default: None (agent self-review).", +) def serve( path: Path, no_quality_gate: bool, transport: str, port: int, + external_runner: str | None, ) -> None: """Start the DeepWork MCP server. Exposes workflow management tools to AI agents via MCP protocol. By default uses stdio transport for local integration with Claude Code. - Quality gate is enabled by default and uses Claude Code to evaluate - step outputs against quality criteria. + Quality gate is enabled by default. Use --external-runner to specify + how quality reviews are executed: + + \b + - No flag (default): Agent self-review via instructions file + - --external-runner claude: Claude CLI subprocess review Examples: - # Start server for current directory + # Start server for current directory (agent self-review) deepwork serve + # Start with Claude CLI as quality gate reviewer + deepwork serve --external-runner claude + # Start with quality gate disabled deepwork serve --no-quality-gate @@ -90,7 +104,7 @@ def serve( deepwork serve --path /path/to/project """ try: - _serve_mcp(path, not no_quality_gate, transport, port) + _serve_mcp(path, not no_quality_gate, transport, port, external_runner) except ServeError as e: console.print(f"[red]Error:[/red] {e}") raise click.Abort() from e @@ -104,6 +118,7 @@ def _serve_mcp( enable_quality_gate: bool, transport: str, port: int, + external_runner: str | None = None, ) -> None: """Start the MCP server. @@ -112,6 +127,8 @@ def _serve_mcp( enable_quality_gate: Whether to enable quality gate evaluation transport: Transport protocol (stdio or sse) port: Port for SSE transport + external_runner: External runner for quality gate reviews. + "claude" uses Claude CLI subprocess. None means agent self-review. Raises: ServeError: If server fails to start @@ -125,6 +142,7 @@ def _serve_mcp( server = create_server( project_root=project_path, enable_quality_gate=enable_quality_gate, + external_runner=external_runner, ) if transport == "stdio": diff --git a/src/deepwork/core/adapters.py b/src/deepwork/core/adapters.py index 9225455f..8bdd8321 100644 --- a/src/deepwork/core/adapters.py +++ b/src/deepwork/core/adapters.py @@ -562,9 +562,10 @@ def register_mcp_server(self, project_path: Path) -> bool: # Build the new MCP server config # Assume deepwork is available in PATH + # Include --external-runner claude so quality gate reviews use Claude CLI subprocess new_server_config = { "command": "deepwork", - "args": ["serve", "--path", "."], + "args": ["serve", "--path", ".", "--external-runner", "claude"], } # Check if already registered with same config diff --git a/src/deepwork/mcp/quality_gate.py b/src/deepwork/mcp/quality_gate.py index 7b749a38..8b055de3 100644 --- a/src/deepwork/mcp/quality_gate.py +++ b/src/deepwork/mcp/quality_gate.py @@ -58,16 +58,32 @@ class QualityGate: """Evaluates step outputs against quality criteria. 
Uses ClaudeCLI to invoke a review agent that evaluates outputs - and returns structured feedback. + and returns structured feedback. Can also build review instructions + files for agent self-review when no external runner is configured. """ - def __init__(self, cli: ClaudeCLI | None = None): + # Default maximum number of files to include inline in the review payload. + # Beyond this threshold, only file paths are listed. + DEFAULT_MAX_INLINE_FILES = 5 + + def __init__( + self, + cli: ClaudeCLI | None = None, + max_inline_files: int | None = None, + ): """Initialize quality gate. Args: - cli: ClaudeCLI instance. If not provided, a default one is created. + cli: ClaudeCLI instance. If None, evaluate() cannot be called + but instruction-building methods still work. + max_inline_files: Maximum number of files to embed inline in + review payloads. Beyond this, only file paths are listed. + Defaults to DEFAULT_MAX_INLINE_FILES (5). """ - self._cli = cli or ClaudeCLI() + self._cli = cli + self.max_inline_files = ( + max_inline_files if max_inline_files is not None else self.DEFAULT_MAX_INLINE_FILES + ) def _build_instructions( self, @@ -202,7 +218,8 @@ async def _read_file_sections( # WARNING: REVIEW PERFORMANCE IS SENSITIVE TO PAYLOAD SIZE # # The payload builder below sends file contents to the review agent (Claude - # CLI subprocess). Reviews can get REALLY SLOW if the content gets too big: + # CLI subprocess or self-review file). Reviews can get REALLY SLOW if the + # content gets too big: # # - Each file's full content is read and embedded in the prompt # - The review agent must process ALL of this content to evaluate criteria @@ -211,18 +228,18 @@ async def _read_file_sections( # - Per-file reviews (run_each: with type: files) multiply # the problem — each file gets its own review subprocess # - # To mitigate this, when more than MAX_INLINE_FILES files are present, - # the payload switches to a path-listing mode that only shows file paths - # instead of dumping all contents inline. The reviewer can then use its - # own tools to read specific files as needed. + # To mitigate this, when more than self.max_inline_files files are + # present, the payload switches to a path-listing mode that only shows + # file paths instead of dumping all contents inline. The reviewer can + # then use its own tools to read specific files as needed. + # + # max_inline_files is configurable per instance: + # - external_runner="claude": 5 (embed small sets, list large ones) + # - external_runner=None (self-review): 0 (always list paths) # # If you're changing the payload builder, keep payload size in mind. # ========================================================================= - # Maximum number of files to include inline in the review payload. - # Beyond this threshold, only file paths are listed. - MAX_INLINE_FILES = 5 - @staticmethod def _build_path_listing(file_paths: dict[str, str | list[str]]) -> list[str]: """Build a path-only listing for large file sets. 
@@ -262,7 +279,7 @@ async def _build_payload( parts: list[str] = [] total_files = len(self._flatten_output_paths(outputs)) - if total_files > self.MAX_INLINE_FILES: + if total_files > self.max_inline_files: # Too many files — list paths only so the reviewer reads selectively path_lines = self._build_path_listing(outputs) parts.append(f"{SECTION_SEPARATOR} BEGIN OUTPUTS {SECTION_SEPARATOR}") @@ -318,6 +335,112 @@ def _parse_result(self, data: dict[str, Any]) -> QualityGateResult: f"Failed to interpret quality gate result: {e}\nData was: {data}" ) from e + async def build_review_instructions_file( + self, + reviews: list[dict[str, Any]], + outputs: dict[str, str | list[str]], + output_specs: dict[str, str], + project_root: Path, + notes: str | None = None, + ) -> str: + """Build complete review instructions content for writing to a file. + + Used in self-review mode (no external runner) to generate a file that + a subagent can read and follow to evaluate quality criteria. + + Args: + reviews: List of review dicts with run_each, quality_criteria, + and optional additional_review_guidance + outputs: Map of output names to file path(s) + output_specs: Map of output names to their type ("file" or "files") + project_root: Project root path + notes: Optional notes from the agent about work done + + Returns: + Complete review instructions as a string + """ + parts: list[str] = [] + + parts.append("# Quality Review Instructions") + parts.append("") + parts.append( + "You are an editor responsible for reviewing the outputs of a workflow step. " + "Your job is to evaluate whether the outputs meet the specified quality criteria." + ) + parts.append("") + + # Build outputs listing (uses self.max_inline_files to decide inline vs path-only) + payload = await self._build_payload(outputs, project_root) + parts.append(payload) + parts.append("") + + # Build review sections + for i, review in enumerate(reviews, 1): + run_each = review["run_each"] + quality_criteria = review["quality_criteria"] + guidance = review.get("additional_review_guidance") + + if len(reviews) > 1: + scope = "all outputs together" if run_each == "step" else f"output '{run_each}'" + parts.append(f"## Review {i} (scope: {scope})") + else: + parts.append("## Criteria to Evaluate") + parts.append("") + + criteria_list = "\n".join( + f"- **{name}**: {question}" for name, question in quality_criteria.items() + ) + parts.append(criteria_list) + parts.append("") + + if run_each != "step" and run_each in outputs: + output_type = output_specs.get(run_each, "file") + output_value = outputs[run_each] + if output_type == "files" and isinstance(output_value, list): + parts.append( + f"Evaluate the above criteria for **each file** in output '{run_each}':" + ) + for fp in output_value: + parts.append(f"- {fp}") + parts.append("") + + if guidance: + parts.append("### Additional Context") + parts.append("") + parts.append(guidance) + parts.append("") + + if notes: + parts.append("## Author Notes") + parts.append("") + parts.append(notes) + parts.append("") + + parts.append("## Guidelines") + parts.append("") + parts.append("- Be strict but fair") + parts.append( + "- Apply criteria pragmatically. If a criterion is not applicable " + "to this step's purpose, pass it." + ) + parts.append("- Only mark a criterion as passed if it is clearly met or not applicable.") + parts.append("- Provide specific, actionable feedback for failed criteria.") + parts.append( + "- The overall review should PASS only if ALL criteria across all reviews pass." 
+ ) + parts.append("") + parts.append("## Your Task") + parts.append("") + parts.append("1. Read each output file listed above") + parts.append("2. Evaluate every criterion in every review section") + parts.append("3. For each criterion, report **PASS** or **FAIL** with specific feedback") + parts.append("4. At the end, clearly state the overall result: **PASSED** or **FAILED**") + parts.append( + "5. If any criteria failed, provide clear actionable feedback on what needs to change" + ) + + return "\n".join(parts) + @staticmethod def compute_timeout(file_count: int) -> int: """Compute dynamic timeout based on number of files. @@ -368,6 +491,12 @@ async def evaluate( criteria_results=[], ) + if self._cli is None: + raise QualityGateError( + "Cannot evaluate quality gate without a CLI runner. " + "Use build_review_instructions_file() for self-review mode." + ) + instructions = self._build_instructions( quality_criteria, notes=notes, diff --git a/src/deepwork/mcp/server.py b/src/deepwork/mcp/server.py index 4a227ed1..ccf1de6e 100644 --- a/src/deepwork/mcp/server.py +++ b/src/deepwork/mcp/server.py @@ -38,6 +38,7 @@ def create_server( enable_quality_gate: bool = True, quality_gate_timeout: int = 120, quality_gate_max_attempts: int = 3, + external_runner: str | None = None, ) -> FastMCP: """Create and configure the MCP server. @@ -46,6 +47,9 @@ def create_server( enable_quality_gate: Whether to enable quality gate evaluation (default: True) quality_gate_timeout: Timeout in seconds for quality gate (default: 120) quality_gate_max_attempts: Max attempts before failing quality gate (default: 3) + external_runner: External runner for quality gate reviews. + "claude" uses Claude CLI subprocess. None means agent self-review + via instructions file. (default: None) Returns: Configured FastMCP server instance @@ -57,14 +61,20 @@ def create_server( quality_gate: QualityGate | None = None if enable_quality_gate: - cli = ClaudeCLI(timeout=quality_gate_timeout) - quality_gate = QualityGate(cli=cli) + if external_runner == "claude": + # Claude CLI subprocess mode: embed up to 5 files inline + cli = ClaudeCLI(timeout=quality_gate_timeout) + quality_gate = QualityGate(cli=cli, max_inline_files=5) + else: + # Self-review mode: no CLI, always reference files by path (0 inline) + quality_gate = QualityGate(cli=None, max_inline_files=0) tools = WorkflowTools( project_root=project_path, state_manager=state_manager, quality_gate=quality_gate, max_quality_attempts=quality_gate_max_attempts, + external_runner=external_runner, ) # Create MCP server diff --git a/src/deepwork/mcp/tools.py b/src/deepwork/mcp/tools.py index 6bc5c046..cddcf80d 100644 --- a/src/deepwork/mcp/tools.py +++ b/src/deepwork/mcp/tools.py @@ -12,6 +12,8 @@ from pathlib import Path from typing import TYPE_CHECKING +import aiofiles + from deepwork.core.parser import ( JobDefinition, OutputSpec, @@ -57,6 +59,7 @@ def __init__( state_manager: StateManager, quality_gate: QualityGate | None = None, max_quality_attempts: int = 3, + external_runner: str | None = None, ): """Initialize workflow tools. @@ -65,12 +68,15 @@ def __init__( state_manager: State manager instance quality_gate: Optional quality gate for step validation max_quality_attempts: Maximum attempts before failing quality gate + external_runner: External runner for quality gate reviews. + "claude" uses Claude CLI subprocess. None means agent self-review. 
""" self.project_root = project_root self.jobs_dir = project_root / ".deepwork" / "jobs" self.state_manager = state_manager self.quality_gate = quality_gate self.max_quality_attempts = max_quality_attempts + self.external_runner = external_runner def _load_all_jobs(self) -> list[JobDefinition]: """Load all job definitions from the jobs directory. @@ -391,45 +397,89 @@ async def finished_step(self, input_data: FinishedStepInput) -> FinishedStepResp and current_step.reviews and not input_data.quality_review_override_reason ): - attempts = await self.state_manager.record_quality_attempt( - current_step_id, session_id=sid - ) - - # Build output specs map for evaluate_reviews + # Build review dicts and output specs used by both paths + review_dicts = [ + { + "run_each": r.run_each, + "quality_criteria": r.quality_criteria, + "additional_review_guidance": r.additional_review_guidance, + } + for r in current_step.reviews + ] output_specs = {out.name: out.type for out in current_step.outputs} - failed_reviews = await self.quality_gate.evaluate_reviews( - reviews=[ - { - "run_each": r.run_each, - "quality_criteria": r.quality_criteria, - "additional_review_guidance": r.additional_review_guidance, - } - for r in current_step.reviews - ], - outputs=input_data.outputs, - output_specs=output_specs, - project_root=self.project_root, - notes=input_data.notes, - ) + if self.external_runner is None: + # Self-review mode: build instructions file and return guidance + # to the agent to verify its own work via a subagent. + review_content = await self.quality_gate.build_review_instructions_file( + reviews=review_dicts, + outputs=input_data.outputs, + output_specs=output_specs, + project_root=self.project_root, + notes=input_data.notes, + ) - if failed_reviews: - # Check max attempts - if attempts >= self.max_quality_attempts: - feedback_parts = [r.feedback for r in failed_reviews] - raise ToolError( - f"Quality gate failed after {self.max_quality_attempts} attempts. " - f"Feedback: {'; '.join(feedback_parts)}" - ) + # Write instructions to .deepwork/tmp/ + tmp_dir = self.project_root / ".deepwork" / "tmp" + tmp_dir.mkdir(parents=True, exist_ok=True) + review_filename = f"quality_review_{sid}_{current_step_id}.md" + review_file_path = tmp_dir / review_filename + async with aiofiles.open(review_file_path, "w", encoding="utf-8") as f: + await f.write(review_content) + + relative_path = f".deepwork/tmp/{review_filename}" + feedback = ( + f"Quality review required. Review instructions have been written to: " + f"{relative_path}\n\n" + f"Verify the quality of your work:\n" + f'1. Spawn a subagent with the prompt: "Read the file at ' + f"{relative_path} and follow the instructions in it. " + f"Review the referenced output files and evaluate them against " + f'the criteria specified. Report your detailed findings."\n' + f"2. Review the subagent's findings\n" + f"3. Fix any issues identified by the subagent\n" + f"4. Repeat steps 1-3 until all criteria pass\n" + f"5. Once all criteria pass, call finished_step again with " + f"quality_review_override_reason set to describe the " + f"review outcome (e.g. 
'Self-review passed: all criteria met')" + ) - # Return needs_work status - combined_feedback = "; ".join(r.feedback for r in failed_reviews) return FinishedStepResponse( status=StepStatus.NEEDS_WORK, - feedback=combined_feedback, - failed_reviews=failed_reviews, + feedback=feedback, stack=self.state_manager.get_stack(), ) + else: + # External runner mode: use quality gate subprocess evaluation + attempts = await self.state_manager.record_quality_attempt( + current_step_id, session_id=sid + ) + + failed_reviews = await self.quality_gate.evaluate_reviews( + reviews=review_dicts, + outputs=input_data.outputs, + output_specs=output_specs, + project_root=self.project_root, + notes=input_data.notes, + ) + + if failed_reviews: + # Check max attempts + if attempts >= self.max_quality_attempts: + feedback_parts = [r.feedback for r in failed_reviews] + raise ToolError( + f"Quality gate failed after {self.max_quality_attempts} attempts. " + f"Feedback: {'; '.join(feedback_parts)}" + ) + + # Return needs_work status + combined_feedback = "; ".join(r.feedback for r in failed_reviews) + return FinishedStepResponse( + status=StepStatus.NEEDS_WORK, + feedback=combined_feedback, + failed_reviews=failed_reviews, + stack=self.state_manager.get_stack(), + ) # Mark step as completed await self.state_manager.complete_step( diff --git a/tests/unit/mcp/test_quality_gate.py b/tests/unit/mcp/test_quality_gate.py index c6ab3c43..b2c56c6a 100644 --- a/tests/unit/mcp/test_quality_gate.py +++ b/tests/unit/mcp/test_quality_gate.py @@ -46,10 +46,11 @@ def output_file(project_root: Path) -> Path: class TestQualityGate: """Tests for QualityGate class.""" - def test_init_default_cli(self) -> None: - """Test QualityGate creates a default ClaudeCLI if none provided.""" + def test_init_no_cli(self) -> None: + """Test QualityGate with no CLI provided has _cli=None and default max_inline_files.""" gate = QualityGate() - assert isinstance(gate._cli, ClaudeCLI) + assert gate._cli is None + assert gate.max_inline_files == QualityGate.DEFAULT_MAX_INLINE_FILES def test_init_custom_cli(self, mock_cli: ClaudeCLI) -> None: """Test QualityGate uses provided ClaudeCLI.""" @@ -509,7 +510,7 @@ async def test_per_file_review_passes_guidance_to_each( class TestBuildPayloadLargeFileSet: - """Tests for _build_payload behavior when file count exceeds MAX_INLINE_FILES.""" + """Tests for _build_payload behavior when file count exceeds max_inline_files.""" async def test_payload_lists_paths_when_over_threshold( self, quality_gate: QualityGate, project_root: Path @@ -756,3 +757,299 @@ async def test_mock_records_none_guidance_when_omitted(self, project_root: Path) ) assert gate.evaluations[0]["additional_review_guidance"] is None + + +class TestConfigurableMaxInlineFiles: + """Tests for configurable max_inline_files on QualityGate.""" + + def test_default_max_inline_files(self) -> None: + """Test QualityGate defaults to DEFAULT_MAX_INLINE_FILES.""" + gate = QualityGate() + assert gate.max_inline_files == QualityGate.DEFAULT_MAX_INLINE_FILES + assert gate.max_inline_files == 5 + + def test_custom_max_inline_files(self) -> None: + """Test QualityGate respects explicit max_inline_files.""" + gate = QualityGate(max_inline_files=10) + assert gate.max_inline_files == 10 + + def test_zero_max_inline_files(self) -> None: + """Test QualityGate with max_inline_files=0 always lists paths.""" + gate = QualityGate(max_inline_files=0) + assert gate.max_inline_files == 0 + + def test_max_inline_files_none_uses_default(self) -> None: + """Test that passing None 
explicitly uses the default.""" + gate = QualityGate(max_inline_files=None) + assert gate.max_inline_files == QualityGate.DEFAULT_MAX_INLINE_FILES + + def test_cli_and_max_inline_files_independent(self, mock_cli: ClaudeCLI) -> None: + """Test that cli and max_inline_files are independent parameters.""" + gate = QualityGate(cli=mock_cli, max_inline_files=3) + assert gate._cli is mock_cli + assert gate.max_inline_files == 3 + + async def test_zero_max_inline_always_lists_paths(self, project_root: Path) -> None: + """Test that max_inline_files=0 uses path listing even for 1 file.""" + gate = QualityGate(max_inline_files=0) + (project_root / "single.md").write_text("Single file content") + + payload = await gate._build_payload( + outputs={"report": "single.md"}, + project_root=project_root, + ) + + assert "too many to include inline" in payload + assert "single.md" in payload + assert "Single file content" not in payload + + async def test_high_max_inline_embeds_many_files(self, project_root: Path) -> None: + """Test that a high max_inline_files embeds content for many files.""" + gate = QualityGate(max_inline_files=100) + for i in range(10): + (project_root / f"f{i}.md").write_text(f"Embedded content {i}") + + payload = await gate._build_payload( + outputs={"files": [f"f{i}.md" for i in range(10)]}, + project_root=project_root, + ) + + assert "too many to include inline" not in payload + for i in range(10): + assert f"Embedded content {i}" in payload + + +class TestEvaluateWithoutCli: + """Tests that evaluate() raises when no CLI is configured.""" + + async def test_evaluate_raises_without_cli(self, project_root: Path) -> None: + """Test that evaluate raises QualityGateError when _cli is None.""" + gate = QualityGate(cli=None) + (project_root / "output.md").write_text("content") + + with pytest.raises(QualityGateError, match="Cannot evaluate.*without a CLI runner"): + await gate.evaluate( + quality_criteria={"Valid": "Is it valid?"}, + outputs={"report": "output.md"}, + project_root=project_root, + ) + + async def test_evaluate_no_criteria_still_passes_without_cli(self, project_root: Path) -> None: + """Test that empty criteria auto-passes even without CLI.""" + gate = QualityGate(cli=None) + + result = await gate.evaluate( + quality_criteria={}, + outputs={"report": "output.md"}, + project_root=project_root, + ) + + assert result.passed is True + assert "auto-passing" in result.feedback.lower() + + +class TestBuildReviewInstructionsFile: + """Tests for QualityGate.build_review_instructions_file method.""" + + async def test_basic_structure(self, project_root: Path) -> None: + """Test that the instructions file has the expected structure.""" + gate = QualityGate(max_inline_files=0) + (project_root / "output.md").write_text("content") + + content = await gate.build_review_instructions_file( + reviews=[ + { + "run_each": "step", + "quality_criteria": {"Complete": "Is it complete?"}, + } + ], + outputs={"report": "output.md"}, + output_specs={"report": "file"}, + project_root=project_root, + ) + + assert "# Quality Review Instructions" in content + assert "editor" in content.lower() + assert "BEGIN OUTPUTS" in content + assert "END OUTPUTS" in content + assert "Complete" in content + assert "Is it complete?" 
in content + assert "## Guidelines" in content + assert "## Your Task" in content + assert "PASS" in content + assert "FAIL" in content + + async def test_contains_all_criteria(self, project_root: Path) -> None: + """Test that all criteria from all reviews appear in the file.""" + gate = QualityGate(max_inline_files=0) + (project_root / "out.md").write_text("x") + + content = await gate.build_review_instructions_file( + reviews=[ + { + "run_each": "step", + "quality_criteria": { + "Accuracy": "Are the facts correct?", + "Completeness": "Is all data present?", + }, + } + ], + outputs={"report": "out.md"}, + output_specs={"report": "file"}, + project_root=project_root, + ) + + assert "**Accuracy**" in content + assert "Are the facts correct?" in content + assert "**Completeness**" in content + assert "Is all data present?" in content + + async def test_multiple_reviews_numbered(self, project_root: Path) -> None: + """Test that multiple reviews get numbered sections.""" + gate = QualityGate(max_inline_files=0) + (project_root / "out.md").write_text("x") + + content = await gate.build_review_instructions_file( + reviews=[ + { + "run_each": "step", + "quality_criteria": {"First": "First check?"}, + }, + { + "run_each": "report", + "quality_criteria": {"Second": "Second check?"}, + }, + ], + outputs={"report": "out.md"}, + output_specs={"report": "file"}, + project_root=project_root, + ) + + assert "## Review 1" in content + assert "## Review 2" in content + assert "scope: all outputs together" in content + assert "scope: output 'report'" in content + + async def test_single_review_not_numbered(self, project_root: Path) -> None: + """Test that a single review uses 'Criteria to Evaluate' heading.""" + gate = QualityGate(max_inline_files=0) + (project_root / "out.md").write_text("x") + + content = await gate.build_review_instructions_file( + reviews=[ + { + "run_each": "step", + "quality_criteria": {"Only": "Only check?"}, + } + ], + outputs={"report": "out.md"}, + output_specs={"report": "file"}, + project_root=project_root, + ) + + assert "## Criteria to Evaluate" in content + assert "Review 1" not in content + + async def test_includes_author_notes(self, project_root: Path) -> None: + """Test that notes are included when provided.""" + gate = QualityGate(max_inline_files=0) + (project_root / "out.md").write_text("x") + + content = await gate.build_review_instructions_file( + reviews=[ + { + "run_each": "step", + "quality_criteria": {"Check": "Is it ok?"}, + } + ], + outputs={"report": "out.md"}, + output_specs={"report": "file"}, + project_root=project_root, + notes="I focused on section 3 the most.", + ) + + assert "## Author Notes" in content + assert "I focused on section 3 the most." 
in content + + async def test_excludes_notes_when_none(self, project_root: Path) -> None: + """Test that notes section is absent when not provided.""" + gate = QualityGate(max_inline_files=0) + (project_root / "out.md").write_text("x") + + content = await gate.build_review_instructions_file( + reviews=[ + { + "run_each": "step", + "quality_criteria": {"Check": "Is it ok?"}, + } + ], + outputs={"report": "out.md"}, + output_specs={"report": "file"}, + project_root=project_root, + ) + + assert "## Author Notes" not in content + + async def test_includes_guidance(self, project_root: Path) -> None: + """Test that additional_review_guidance is included.""" + gate = QualityGate(max_inline_files=0) + (project_root / "out.md").write_text("x") + + content = await gate.build_review_instructions_file( + reviews=[ + { + "run_each": "step", + "quality_criteria": {"Check": "Is it ok?"}, + "additional_review_guidance": "Also read config.yml for context.", + } + ], + outputs={"report": "out.md"}, + output_specs={"report": "file"}, + project_root=project_root, + ) + + assert "### Additional Context" in content + assert "Also read config.yml for context." in content + + async def test_per_file_review_lists_files(self, project_root: Path) -> None: + """Test that per-file reviews list each file to evaluate.""" + gate = QualityGate(max_inline_files=0) + (project_root / "a.md").write_text("x") + (project_root / "b.md").write_text("x") + + content = await gate.build_review_instructions_file( + reviews=[ + { + "run_each": "pages", + "quality_criteria": {"Valid": "Is it valid?"}, + } + ], + outputs={"pages": ["a.md", "b.md"]}, + output_specs={"pages": "files"}, + project_root=project_root, + ) + + assert "each file" in content.lower() + assert "a.md" in content + assert "b.md" in content + + async def test_output_paths_listed_not_inlined_at_zero(self, project_root: Path) -> None: + """Test that with max_inline_files=0, file contents are NOT embedded.""" + gate = QualityGate(max_inline_files=0) + (project_root / "report.md").write_text("SECRET_CONTENT_MARKER") + + content = await gate.build_review_instructions_file( + reviews=[ + { + "run_each": "step", + "quality_criteria": {"Check": "Is it ok?"}, + } + ], + outputs={"report": "report.md"}, + output_specs={"report": "file"}, + project_root=project_root, + ) + + assert "report.md" in content + assert "SECRET_CONTENT_MARKER" not in content + assert "too many to include inline" in content diff --git a/tests/unit/mcp/test_tools.py b/tests/unit/mcp/test_tools.py index f5b7adce..2cdd26c5 100644 --- a/tests/unit/mcp/test_tools.py +++ b/tests/unit/mcp/test_tools.py @@ -104,6 +104,7 @@ def tools_with_quality(project_root: Path, state_manager: StateManager) -> Workf project_root=project_root, state_manager=state_manager, quality_gate=MockQualityGate(should_pass=True), + external_runner="claude", ) @@ -347,6 +348,7 @@ async def test_finished_step_with_quality_gate_fail( project_root=project_root, state_manager=state_manager, quality_gate=MockQualityGate(should_pass=False, feedback="Needs improvement"), + external_runner="claude", ) # Start workflow @@ -375,6 +377,7 @@ async def test_finished_step_quality_gate_max_attempts( project_root=project_root, state_manager=state_manager, quality_gate=MockQualityGate(should_pass=False, feedback="Always fails"), + external_runner="claude", ) # Start workflow @@ -409,6 +412,7 @@ async def test_finished_step_quality_gate_override( project_root=project_root, state_manager=state_manager, quality_gate=failing_gate, + 
external_runner="claude", ) # Start workflow @@ -1046,6 +1050,7 @@ async def test_quality_reviewer_receives_only_current_step_outputs( project_root=project_root, state_manager=state_manager, quality_gate=mock_gate, + external_runner="claude", ) # Start workflow @@ -1129,6 +1134,7 @@ async def test_additional_review_guidance_reaches_reviewer( project_root=project_root, state_manager=state_manager, quality_gate=mock_gate, + external_runner="claude", ) await tools.start_workflow( @@ -1378,3 +1384,337 @@ async def test_abort_workflow_with_session_id( assert tools.state_manager.get_stack_depth() == 1 assert tools.state_manager.get_active_session() is not None assert tools.state_manager.get_active_session().session_id == session_b_id + + +class TestExternalRunnerSelfReview: + """Tests for self-review mode (external_runner=None) in finished_step.""" + + @pytest.fixture + def tools_self_review(self, project_root: Path, state_manager: StateManager) -> WorkflowTools: + """Create WorkflowTools with quality gate but no external runner (self-review mode).""" + from deepwork.mcp.quality_gate import QualityGate + + return WorkflowTools( + project_root=project_root, + state_manager=state_manager, + quality_gate=QualityGate(cli=None, max_inline_files=0), + external_runner=None, + ) + + async def test_self_review_returns_needs_work( + self, tools_self_review: WorkflowTools, project_root: Path + ) -> None: + """Test that self-review mode returns NEEDS_WORK with instructions.""" + await tools_self_review.start_workflow( + StartWorkflowInput(goal="Test", job_name="test_job", workflow_name="main") + ) + (project_root / "output1.md").write_text("Some output") + + response = await tools_self_review.finished_step( + FinishedStepInput(outputs={"output1.md": "output1.md"}) + ) + + assert response.status == StepStatus.NEEDS_WORK + assert response.failed_reviews is None # No actual review results yet + + async def test_self_review_feedback_contains_instructions( + self, tools_self_review: WorkflowTools, project_root: Path + ) -> None: + """Test that feedback contains subagent and override instructions.""" + await tools_self_review.start_workflow( + StartWorkflowInput(goal="Test", job_name="test_job", workflow_name="main") + ) + (project_root / "output1.md").write_text("Some output") + + response = await tools_self_review.finished_step( + FinishedStepInput(outputs={"output1.md": "output1.md"}) + ) + + assert "Quality review required" in response.feedback + assert "subagent" in response.feedback.lower() + assert "quality_review_override_reason" in response.feedback + assert ".deepwork/tmp/quality_review_" in response.feedback + + async def test_self_review_writes_instructions_file( + self, tools_self_review: WorkflowTools, project_root: Path + ) -> None: + """Test that an instructions file is written to .deepwork/tmp/.""" + await tools_self_review.start_workflow( + StartWorkflowInput(goal="Test", job_name="test_job", workflow_name="main") + ) + (project_root / "output1.md").write_text("Some output") + + await tools_self_review.finished_step( + FinishedStepInput(outputs={"output1.md": "output1.md"}) + ) + + review_files = list((project_root / ".deepwork" / "tmp").glob("quality_review_*.md")) + assert len(review_files) == 1 + + async def test_self_review_file_contains_criteria( + self, tools_self_review: WorkflowTools, project_root: Path + ) -> None: + """Test that the instructions file contains the quality criteria from the job.""" + await tools_self_review.start_workflow( + StartWorkflowInput(goal="Test", 
job_name="test_job", workflow_name="main") + ) + (project_root / "output1.md").write_text("Some output") + + await tools_self_review.finished_step( + FinishedStepInput(outputs={"output1.md": "output1.md"}) + ) + + review_files = list((project_root / ".deepwork" / "tmp").glob("quality_review_*.md")) + content = review_files[0].read_text() + + # step1 has review criteria "Output Valid": "Is the output valid?" + assert "Output Valid" in content + assert "Is the output valid?" in content + + async def test_self_review_file_references_outputs_not_inline( + self, tools_self_review: WorkflowTools, project_root: Path + ) -> None: + """Test that the instructions file lists output paths, not inline content.""" + await tools_self_review.start_workflow( + StartWorkflowInput(goal="Test", job_name="test_job", workflow_name="main") + ) + (project_root / "output1.md").write_text("UNIQUE_CONTENT_MARKER_12345") + + await tools_self_review.finished_step( + FinishedStepInput(outputs={"output1.md": "output1.md"}) + ) + + review_files = list((project_root / ".deepwork" / "tmp").glob("quality_review_*.md")) + content = review_files[0].read_text() + + assert "output1.md" in content + assert "UNIQUE_CONTENT_MARKER_12345" not in content + + async def test_self_review_file_named_with_session_and_step( + self, tools_self_review: WorkflowTools, project_root: Path + ) -> None: + """Test that review file name includes session and step IDs.""" + resp = await tools_self_review.start_workflow( + StartWorkflowInput(goal="Test", job_name="test_job", workflow_name="main") + ) + session_id = resp.begin_step.session_id + (project_root / "output1.md").write_text("output") + + await tools_self_review.finished_step( + FinishedStepInput(outputs={"output1.md": "output1.md"}) + ) + + expected_file = project_root / ".deepwork" / "tmp" / f"quality_review_{session_id}_step1.md" + assert expected_file.exists() + + async def test_self_review_then_override_completes_workflow( + self, tools_self_review: WorkflowTools, project_root: Path + ) -> None: + """Test that calling finished_step with override after self-review advances the workflow.""" + await tools_self_review.start_workflow( + StartWorkflowInput(goal="Test", job_name="test_job", workflow_name="main") + ) + (project_root / "output1.md").write_text("output") + + # First call: self-review + resp1 = await tools_self_review.finished_step( + FinishedStepInput(outputs={"output1.md": "output1.md"}) + ) + assert resp1.status == StepStatus.NEEDS_WORK + + # Second call: override, should advance to step2 + resp2 = await tools_self_review.finished_step( + FinishedStepInput( + outputs={"output1.md": "output1.md"}, + quality_review_override_reason="Self-review passed: all criteria met", + ) + ) + assert resp2.status == StepStatus.NEXT_STEP + assert resp2.begin_step.step_id == "step2" + + async def test_self_review_skipped_for_steps_without_reviews( + self, tools_self_review: WorkflowTools, project_root: Path + ) -> None: + """Test that steps without reviews skip self-review entirely.""" + await tools_self_review.start_workflow( + StartWorkflowInput(goal="Test", job_name="test_job", workflow_name="main") + ) + (project_root / "output1.md").write_text("output") + + # Override step1 to advance to step2 (which has no reviews) + await tools_self_review.finished_step( + FinishedStepInput( + outputs={"output1.md": "output1.md"}, + quality_review_override_reason="Skip", + ) + ) + + # step2 has no reviews, so it should complete without self-review + (project_root / "output2.md").write_text("step2 output") 
+ resp = await tools_self_review.finished_step( + FinishedStepInput(outputs={"output2.md": "output2.md"}) + ) + assert resp.status == StepStatus.WORKFLOW_COMPLETE + + async def test_self_review_includes_notes_in_file( + self, tools_self_review: WorkflowTools, project_root: Path + ) -> None: + """Test that agent notes are included in the review instructions file.""" + await tools_self_review.start_workflow( + StartWorkflowInput(goal="Test", job_name="test_job", workflow_name="main") + ) + (project_root / "output1.md").write_text("output") + + await tools_self_review.finished_step( + FinishedStepInput( + outputs={"output1.md": "output1.md"}, + notes="I used the XYZ library for this step.", + ) + ) + + review_files = list((project_root / ".deepwork" / "tmp").glob("quality_review_*.md")) + content = review_files[0].read_text() + assert "I used the XYZ library for this step." in content + + +class TestExternalRunnerClaude: + """Tests that external_runner='claude' uses subprocess evaluation (existing behavior).""" + + async def test_claude_runner_calls_quality_gate_evaluate( + self, project_root: Path, state_manager: StateManager + ) -> None: + """Test that claude runner mode invokes evaluate_reviews on the quality gate.""" + mock_gate = MockQualityGate(should_pass=True) + tools = WorkflowTools( + project_root=project_root, + state_manager=state_manager, + quality_gate=mock_gate, + external_runner="claude", + ) + + await tools.start_workflow( + StartWorkflowInput(goal="Test", job_name="test_job", workflow_name="main") + ) + (project_root / "output1.md").write_text("output") + + response = await tools.finished_step( + FinishedStepInput(outputs={"output1.md": "output1.md"}) + ) + + # Should have called evaluate_reviews and advanced + assert response.status == StepStatus.NEXT_STEP + assert len(mock_gate.evaluations) > 0 + + async def test_claude_runner_does_not_write_instructions_file( + self, project_root: Path, state_manager: StateManager + ) -> None: + """Test that claude runner mode does NOT write an instructions file.""" + mock_gate = MockQualityGate(should_pass=True) + tools = WorkflowTools( + project_root=project_root, + state_manager=state_manager, + quality_gate=mock_gate, + external_runner="claude", + ) + + await tools.start_workflow( + StartWorkflowInput(goal="Test", job_name="test_job", workflow_name="main") + ) + (project_root / "output1.md").write_text("output") + + await tools.finished_step(FinishedStepInput(outputs={"output1.md": "output1.md"})) + + review_files = list((project_root / ".deepwork" / "tmp").glob("quality_review_*.md")) + assert len(review_files) == 0 + + async def test_claude_runner_failing_gate_returns_feedback( + self, project_root: Path, state_manager: StateManager + ) -> None: + """Test that claude runner with failing gate returns NEEDS_WORK with review feedback.""" + mock_gate = MockQualityGate(should_pass=False, feedback="Missing detail in section 2") + tools = WorkflowTools( + project_root=project_root, + state_manager=state_manager, + quality_gate=mock_gate, + external_runner="claude", + ) + + await tools.start_workflow( + StartWorkflowInput(goal="Test", job_name="test_job", workflow_name="main") + ) + (project_root / "output1.md").write_text("output") + + response = await tools.finished_step( + FinishedStepInput(outputs={"output1.md": "output1.md"}) + ) + + assert response.status == StepStatus.NEEDS_WORK + assert response.feedback == "Missing detail in section 2" + assert response.failed_reviews is not None + assert len(response.failed_reviews) == 1 + + async 
def test_claude_runner_records_quality_attempts( + self, project_root: Path, state_manager: StateManager + ) -> None: + """Test that claude runner mode tracks quality attempt count.""" + mock_gate = MockQualityGate(should_pass=False, feedback="Fail") + tools = WorkflowTools( + project_root=project_root, + state_manager=state_manager, + quality_gate=mock_gate, + external_runner="claude", + ) + + await tools.start_workflow( + StartWorkflowInput(goal="Test", job_name="test_job", workflow_name="main") + ) + (project_root / "output1.md").write_text("output") + + # First two attempts: NEEDS_WORK + for _ in range(2): + resp = await tools.finished_step( + FinishedStepInput(outputs={"output1.md": "output1.md"}) + ) + assert resp.status == StepStatus.NEEDS_WORK + + # Third attempt: raises ToolError + with pytest.raises(ToolError, match="Quality gate failed after.*attempts"): + await tools.finished_step(FinishedStepInput(outputs={"output1.md": "output1.md"})) + + +class TestExternalRunnerInit: + """Tests for external_runner parameter on WorkflowTools initialization.""" + + def test_default_external_runner_is_none( + self, project_root: Path, state_manager: StateManager + ) -> None: + """Test that external_runner defaults to None.""" + tools = WorkflowTools( + project_root=project_root, + state_manager=state_manager, + ) + assert tools.external_runner is None + + def test_explicit_external_runner( + self, project_root: Path, state_manager: StateManager + ) -> None: + """Test that external_runner is stored correctly.""" + tools = WorkflowTools( + project_root=project_root, + state_manager=state_manager, + external_runner="claude", + ) + assert tools.external_runner == "claude" + + def test_no_quality_gate_no_external_runner_skips_review( + self, project_root: Path, state_manager: StateManager + ) -> None: + """Test that without quality gate, external_runner is irrelevant.""" + tools = WorkflowTools( + project_root=project_root, + state_manager=state_manager, + quality_gate=None, + external_runner=None, + ) + assert tools.quality_gate is None + assert tools.external_runner is None diff --git a/tests/unit/test_adapters.py b/tests/unit/test_adapters.py new file mode 100644 index 00000000..49c1e5bd --- /dev/null +++ b/tests/unit/test_adapters.py @@ -0,0 +1,112 @@ +"""Tests for adapter MCP server registration.""" + +import json +from pathlib import Path + +import pytest + +from deepwork.core.adapters import ClaudeAdapter + + +@pytest.fixture +def project_root(tmp_path: Path) -> Path: + """Create a temporary project root with .claude dir.""" + claude_dir = tmp_path / ".claude" + claude_dir.mkdir() + return tmp_path + + +class TestClaudeAdapterMCPRegistration: + """Tests for ClaudeAdapter.register_mcp_server.""" + + def test_register_creates_mcp_json(self, project_root: Path) -> None: + """Test that register_mcp_server creates .mcp.json.""" + adapter = ClaudeAdapter(project_root=project_root) + result = adapter.register_mcp_server(project_root) + + assert result is True + assert (project_root / ".mcp.json").exists() + + def test_register_includes_external_runner_claude(self, project_root: Path) -> None: + """Test that .mcp.json args include --external-runner claude.""" + adapter = ClaudeAdapter(project_root=project_root) + adapter.register_mcp_server(project_root) + + config = json.loads((project_root / ".mcp.json").read_text()) + args = config["mcpServers"]["deepwork"]["args"] + + assert "--external-runner" in args + idx = args.index("--external-runner") + assert args[idx + 1] == "claude" + + def 
test_register_full_args(self, project_root: Path) -> None: + """Test the complete args list in .mcp.json.""" + adapter = ClaudeAdapter(project_root=project_root) + adapter.register_mcp_server(project_root) + + config = json.loads((project_root / ".mcp.json").read_text()) + args = config["mcpServers"]["deepwork"]["args"] + + assert args == ["serve", "--path", ".", "--external-runner", "claude"] + + def test_register_command_is_deepwork(self, project_root: Path) -> None: + """Test that the command in .mcp.json is 'deepwork'.""" + adapter = ClaudeAdapter(project_root=project_root) + adapter.register_mcp_server(project_root) + + config = json.loads((project_root / ".mcp.json").read_text()) + assert config["mcpServers"]["deepwork"]["command"] == "deepwork" + + def test_register_is_idempotent(self, project_root: Path) -> None: + """Test that registering twice with same config returns False.""" + adapter = ClaudeAdapter(project_root=project_root) + + assert adapter.register_mcp_server(project_root) is True + assert adapter.register_mcp_server(project_root) is False + + def test_register_updates_old_config(self, project_root: Path) -> None: + """Test that registering updates an existing .mcp.json with old args.""" + # Write an old-style config without --external-runner + old_config = { + "mcpServers": { + "deepwork": { + "command": "deepwork", + "args": ["serve", "--path", "."], + } + } + } + (project_root / ".mcp.json").write_text(json.dumps(old_config)) + + adapter = ClaudeAdapter(project_root=project_root) + result = adapter.register_mcp_server(project_root) + + # Should detect the difference and update + assert result is True + config = json.loads((project_root / ".mcp.json").read_text()) + assert config["mcpServers"]["deepwork"]["args"] == [ + "serve", + "--path", + ".", + "--external-runner", + "claude", + ] + + def test_register_preserves_other_servers(self, project_root: Path) -> None: + """Test that registering deepwork preserves other MCP servers.""" + existing_config = { + "mcpServers": { + "other_server": { + "command": "other", + "args": ["--flag"], + } + } + } + (project_root / ".mcp.json").write_text(json.dumps(existing_config)) + + adapter = ClaudeAdapter(project_root=project_root) + adapter.register_mcp_server(project_root) + + config = json.loads((project_root / ".mcp.json").read_text()) + assert "other_server" in config["mcpServers"] + assert config["mcpServers"]["other_server"]["command"] == "other" + assert "deepwork" in config["mcpServers"] diff --git a/tests/unit/test_serve_cli.py b/tests/unit/test_serve_cli.py new file mode 100644 index 00000000..651ea89d --- /dev/null +++ b/tests/unit/test_serve_cli.py @@ -0,0 +1,62 @@ +"""Tests for serve CLI command --external-runner option.""" + +from unittest.mock import MagicMock, patch + +from click.testing import CliRunner + +from deepwork.cli.serve import serve + + +class TestServeExternalRunnerOption: + """Tests for --external-runner CLI option on the serve command.""" + + @patch("deepwork.cli.serve._serve_mcp") + @patch("deepwork.cli.serve._load_config") + def test_default_external_runner_is_none( + self, mock_load: MagicMock, mock_serve: MagicMock, tmp_path: str + ) -> None: + """Test that --external-runner defaults to None when not specified.""" + runner = CliRunner() + with runner.isolated_filesystem(temp_dir=tmp_path) as td: + result = runner.invoke(serve, ["--path", td]) + if result.exit_code != 0 and result.exception: + raise result.exception + + # _serve_mcp should be called with external_runner=None + 
mock_serve.assert_called_once() + call_args = mock_serve.call_args + assert call_args[0][4] is None or call_args.kwargs.get("external_runner") is None + + @patch("deepwork.cli.serve._serve_mcp") + @patch("deepwork.cli.serve._load_config") + def test_external_runner_claude( + self, mock_load: MagicMock, mock_serve: MagicMock, tmp_path: str + ) -> None: + """Test that --external-runner claude passes 'claude' through.""" + runner = CliRunner() + with runner.isolated_filesystem(temp_dir=tmp_path) as td: + result = runner.invoke(serve, ["--path", td, "--external-runner", "claude"]) + if result.exit_code != 0 and result.exception: + raise result.exception + + mock_serve.assert_called_once() + # external_runner is the 5th positional arg (index 4) + call_args = mock_serve.call_args[0] + assert call_args[4] == "claude" + + def test_external_runner_invalid_choice(self, tmp_path: str) -> None: + """Test that invalid --external-runner values are rejected.""" + runner = CliRunner() + result = runner.invoke(serve, ["--path", ".", "--external-runner", "invalid"]) + + assert result.exit_code != 0 + assert "Invalid value" in result.output or "invalid" in result.output.lower() + + def test_help_shows_external_runner(self) -> None: + """Test that --help shows the --external-runner option.""" + runner = CliRunner() + result = runner.invoke(serve, ["--help"]) + + assert result.exit_code == 0 + assert "--external-runner" in result.output + assert "claude" in result.output
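
Usage sketch (reviewer reference, not part of the patch): the snippet below exercises only QualityGate and build_review_instructions_file as introduced above, mirroring the two configurations that create_server selects between (external_runner=None versus "claude"). The temporary project directory, the report.md file name, and the single criterion are illustrative assumptions, not values taken from the repository.

import asyncio
import tempfile
from pathlib import Path

from deepwork.mcp.quality_gate import QualityGate


async def main() -> None:
    with tempfile.TemporaryDirectory() as tmp:
        project = Path(tmp)
        # Illustrative output file; in a real run this is a step output path.
        (project / "report.md").write_text("Draft report body")

        # Self-review configuration (what server.py builds when external_runner is None):
        # no CLI and max_inline_files=0, so outputs are referenced by path only.
        gate = QualityGate(cli=None, max_inline_files=0)
        instructions = await gate.build_review_instructions_file(
            reviews=[
                {
                    "run_each": "step",
                    "quality_criteria": {"Complete": "Is the report complete?"},
                }
            ],
            outputs={"report": "report.md"},
            output_specs={"report": "file"},
            project_root=project,
            notes="Focused on the summary section.",
        )
        # Same kind of content that finished_step writes to
        # .deepwork/tmp/quality_review_<session>_<step>.md in self-review mode.
        print(instructions)

        # External-runner configuration (external_runner="claude"): server.py instead
        # constructs QualityGate(cli=ClaudeCLI(timeout=...), max_inline_files=5), and
        # finished_step calls evaluate_reviews(), which runs a Claude CLI subprocess.


asyncio.run(main())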