diff --git a/pyproject.toml b/pyproject.toml
index b33bfdf..636ed25 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "uipath-dev"
-version = "0.0.60"
+version = "0.0.61"
description = "UiPath Developer Console"
readme = { file = "README.md", content-type = "text/markdown" }
requires-python = ">=3.11"
@@ -10,6 +10,7 @@ dependencies = [
"pyperclip>=1.11.0, <2.0.0",
"fastapi>=0.128.8",
"uvicorn[standard]>=0.40.0",
+ "uipath"
]
classifiers = [
"Intended Audience :: Developers",
diff --git a/src/uipath/dev/models/eval_data.py b/src/uipath/dev/models/eval_data.py
new file mode 100644
index 0000000..e592d83
--- /dev/null
+++ b/src/uipath/dev/models/eval_data.py
@@ -0,0 +1,125 @@
+"""Data models for evaluation runs."""
+
+from __future__ import annotations
+
+import uuid
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from typing import Any
+
+
+@dataclass
+class EvalSetInfo:
+ """Summary of a discovered evaluation set."""
+
+ id: str
+ name: str
+ eval_count: int
+ evaluator_ids: list[str]
+
+
+@dataclass
+class EvalItemResult:
+ """Result of evaluating a single item."""
+
+ name: str
+ inputs: dict[str, Any] = field(default_factory=dict)
+ expected_output: Any = None
+ scores: dict[str, float] = field(default_factory=dict)
+ overall_score: float = 0.0
+ output: Any = None
+ justifications: dict[str, str] = field(default_factory=dict)
+ duration_ms: float | None = None
+ status: str = "pending" # pending | running | completed | failed
+ traces: list[dict[str, Any]] = field(default_factory=list)
+
+
+@dataclass
+class EvalRunState:
+ """Full state of an eval run."""
+
+ id: str = field(default_factory=lambda: str(uuid.uuid4()))
+ eval_set_id: str = ""
+ eval_set_name: str = ""
+ status: str = "pending" # pending | running | completed | failed
+ progress_completed: int = 0
+ progress_total: int = 0
+ overall_score: float | None = None
+ evaluator_scores: dict[str, float] = field(default_factory=dict)
+ results: list[EvalItemResult] = field(default_factory=list)
+ start_time: datetime | None = None
+ end_time: datetime | None = None
+
+ def to_summary(self) -> dict[str, Any]:
+ """Serialize to summary dict (no per-item results)."""
+ return {
+ "id": self.id,
+ "eval_set_id": self.eval_set_id,
+ "eval_set_name": self.eval_set_name,
+ "status": self.status,
+ "progress_completed": self.progress_completed,
+ "progress_total": self.progress_total,
+ "overall_score": self.overall_score,
+ "evaluator_scores": self.evaluator_scores,
+ "start_time": self.start_time.isoformat() if self.start_time else None,
+ "end_time": self.end_time.isoformat() if self.end_time else None,
+ }
+
+ def to_detail(self) -> dict[str, Any]:
+ """Serialize to detail dict (includes per-item results)."""
+ base = self.to_summary()
+ base["results"] = [
+ {
+ "name": r.name,
+ "inputs": r.inputs,
+ "expected_output": r.expected_output,
+ "scores": r.scores,
+ "overall_score": r.overall_score,
+ "output": str(r.output)
+ if isinstance(r.output, Exception)
+ else r.output,
+ "justifications": r.justifications,
+ "duration_ms": r.duration_ms,
+ "status": r.status,
+ "traces": r.traces,
+ }
+ for r in self.results
+ ]
+ return base
+
+ def start(self) -> None:
+ """Mark run as started."""
+ self.status = "running"
+ self.start_time = datetime.now(timezone.utc)
+
+ def complete(self) -> None:
+ """Mark run as completed, computing final scores."""
+ self.status = "completed"
+ self.end_time = datetime.now(timezone.utc)
+ self._compute_scores()
+
+ def fail(self) -> None:
+ """Mark run as failed."""
+ self.status = "failed"
+ self.end_time = datetime.now(timezone.utc)
+
+ def _compute_scores(self) -> None:
+ """Compute overall and per-evaluator scores from item results."""
+ completed = [r for r in self.results if r.status == "completed"]
+ if not completed:
+ self.overall_score = 0.0
+ return
+
+ # Per-evaluator averages
+ evaluator_totals: dict[str, list[float]] = {}
+ for r in completed:
+ for ev_id, score in r.scores.items():
+ evaluator_totals.setdefault(ev_id, []).append(score)
+
+ self.evaluator_scores = {
+ ev_id: sum(scores) / len(scores)
+ for ev_id, scores in evaluator_totals.items()
+ }
+
+ # Overall = average of item overall_scores
+ self.overall_score = sum(r.overall_score for r in completed) / len(completed)
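
A minimal usage sketch of the models above: how a run moves through start/complete and how _compute_scores averages item results. Illustrative only; the two-item set and score values are made up, and EvalService, which drives this in practice, is not part of this hunk.

    from uipath.dev.models.eval_data import EvalItemResult, EvalRunState

    # Two completed items, each scored by two evaluators.
    run = EvalRunState(eval_set_id="set-1", eval_set_name="smoke", progress_total=2)
    run.start()  # status -> "running", start_time set (UTC)
    run.results = [
        EvalItemResult(
            name="greeting",
            scores={"exact-match": 1.0, "llm-judge": 1.0},
            overall_score=1.0,
            status="completed",
        ),
        EvalItemResult(
            name="refusal",
            scores={"exact-match": 0.0, "llm-judge": 0.5},
            overall_score=0.25,
            status="completed",
        ),
    ]
    run.progress_completed = 2
    run.complete()  # computes per-evaluator and overall averages

    assert run.evaluator_scores == {"exact-match": 0.5, "llm-judge": 0.75}
    assert run.overall_score == 0.625
    print(run.to_summary())  # the payload shape broadcast to clients
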
diff --git a/src/uipath/dev/server/__init__.py b/src/uipath/dev/server/__init__.py
index 2e3ba31..f34e90a 100644
--- a/src/uipath/dev/server/__init__.py
+++ b/src/uipath/dev/server/__init__.py
@@ -24,8 +24,10 @@
StateData,
TraceData,
)
+from uipath.dev.models.eval_data import EvalItemResult, EvalRunState
from uipath.dev.models.execution import ExecutionRun
from uipath.dev.server.debug_bridge import WebDebugBridge
+from uipath.dev.services.eval_service import EvalService
from uipath.dev.services.run_service import RunService
logger = logging.getLogger(__name__)
@@ -86,6 +88,14 @@ def __init__(
on_run_removed=self.connection_manager.remove_run_subscriptions,
)
+ self.eval_service = EvalService(
+ runtime_factory=self.runtime_factory,
+ trace_manager=self.trace_manager,
+ on_eval_run_created=self._on_eval_run_created,
+ on_eval_run_progress=self._on_eval_run_progress,
+ on_eval_run_completed=self._on_eval_run_completed,
+ )
+
def create_app(self) -> Any:
"""Create and return a FastAPI application."""
from uipath.dev.server.app import create_app
@@ -231,6 +241,26 @@ def _on_state(self, state_data: StateData) -> None:
"""Broadcast state transition to subscribed WebSocket clients."""
self.connection_manager.broadcast_state(state_data)
+ def _on_eval_run_created(self, run: EvalRunState) -> None:
+ """Broadcast eval run created to all connected clients."""
+ self.connection_manager.broadcast_eval_run_created(run)
+
+ def _on_eval_run_progress(
+ self,
+ run_id: str,
+ completed: int,
+ total: int,
+ item_result: EvalItemResult | None,
+ ) -> None:
+ """Broadcast eval run progress to all connected clients."""
+ self.connection_manager.broadcast_eval_run_progress(
+ run_id, completed, total, item_result
+ )
+
+ def _on_eval_run_completed(self, run: EvalRunState) -> None:
+ """Broadcast eval run completed to all connected clients."""
+ self.connection_manager.broadcast_eval_run_completed(run)
+
@staticmethod
def _find_free_port(host: str, start_port: int, max_attempts: int = 100) -> int:
"""Find a free port starting from *start_port*.
diff --git a/src/uipath/dev/server/app.py b/src/uipath/dev/server/app.py
index d608338..209a801 100644
--- a/src/uipath/dev/server/app.py
+++ b/src/uipath/dev/server/app.py
@@ -150,6 +150,8 @@ async def _config():
# Register routes
from uipath.dev.server.routes.entrypoints import router as entrypoints_router
+ from uipath.dev.server.routes.evals import router as evals_router
+ from uipath.dev.server.routes.evaluators import router as evaluators_router
from uipath.dev.server.routes.graph import router as graph_router
from uipath.dev.server.routes.reload import router as reload_router
from uipath.dev.server.routes.runs import router as runs_router
@@ -166,6 +168,8 @@ async def _config():
app.include_router(runs_router, prefix="/api")
app.include_router(graph_router, prefix="/api")
app.include_router(reload_router, prefix="/api")
+ app.include_router(evaluators_router, prefix="/api")
+ app.include_router(evals_router, prefix="/api")
app.include_router(ws_router)
# Auto-build frontend if source is available and build is stale
diff --git a/src/uipath/dev/server/frontend/src/App.tsx b/src/uipath/dev/server/frontend/src/App.tsx
index 8249df3..9e37a5b 100644
--- a/src/uipath/dev/server/frontend/src/App.tsx
+++ b/src/uipath/dev/server/frontend/src/App.tsx
@@ -6,13 +6,25 @@ import { useWebSocket } from "./store/useWebSocket";
import { listRuns, listEntrypoints, getRun } from "./api/client";
import type { RunDetail } from "./types/run";
import { useHashRoute } from "./hooks/useHashRoute";
+import type { Section } from "./hooks/useHashRoute";
import { useIsMobile } from "./hooks/useIsMobile";
-import Sidebar from "./components/layout/Sidebar";
+import ActivityBar from "./components/layout/ActivityBar";
+import DebugSidebar from "./components/layout/DebugSidebar";
import StatusBar from "./components/layout/StatusBar";
import NewRunPanel from "./components/runs/NewRunPanel";
import SetupView from "./components/runs/SetupView";
import RunDetailsPanel from "./components/runs/RunDetailsPanel";
import ReloadToast from "./components/shared/ReloadToast";
+import ToastContainer from "./components/shared/ToastContainer";
+import { useEvalStore } from "./store/useEvalStore";
+import { listEvalSets, listEvaluators, listEvalRuns, listLocalEvaluators } from "./api/eval-client";
+import EvalsSidebar from "./components/evals/EvalsSidebar";
+import EvalSetDetail from "./components/evals/EvalSetDetail";
+import EvalRunResults from "./components/evals/EvalRunResults";
+import CreateEvalSetView from "./components/evals/CreateEvalSetView";
+import EvaluatorsSidebar from "./components/evaluators/EvaluatorsSidebar";
+import EvaluatorsView from "./components/evaluators/EvaluatorDetail";
+import CreateEvaluatorView from "./components/evaluators/CreateEvaluatorView";
export default function App() {
const ws = useWebSocket();
@@ -33,14 +45,30 @@ export default function App() {
setActiveNode,
removeActiveNode,
} = useRunStore();
- const { view, runId: routeRunId, setupEntrypoint, setupMode, navigate } = useHashRoute();
+ const {
+ section,
+ view,
+ runId: routeRunId,
+ setupEntrypoint,
+ setupMode,
+ evalCreating,
+ evalSetId,
+ evalRunId,
+ evalRunItemName,
+ evaluatorCreateType,
+ evaluatorId,
+ evaluatorFilter,
+ navigate,
+ } = useHashRoute();
+
+ const { setEvalSets, setEvaluators, setLocalEvaluators, setEvalRuns } = useEvalStore();
// Sync route runId → store selection
useEffect(() => {
- if (view === "details" && routeRunId && routeRunId !== selectedRunId) {
+ if (section === "debug" && view === "details" && routeRunId && routeRunId !== selectedRunId) {
selectRun(routeRunId);
}
- }, [view, routeRunId, selectedRunId, selectRun]);
+ }, [section, view, routeRunId, selectedRunId, selectRun]);
// Load existing runs, entrypoints, auth status, and config on mount
const initAuth = useAuthStore((s) => s.init);
@@ -54,6 +82,49 @@ export default function App() {
initConfig();
}, [setRuns, setEntrypoints, initAuth, initConfig]);
+ // Load eval data when switching to evals/evaluators section
+ useEffect(() => {
+ if (section === "evals") {
+ listEvalSets().then((sets) => setEvalSets(sets)).catch(console.error);
+ listEvalRuns().then((runs) => setEvalRuns(runs)).catch(console.error);
+ }
+ if (section === "evals" || section === "evaluators") {
+ listEvaluators().then((evs) => setEvaluators(evs)).catch(console.error);
+ listLocalEvaluators().then((evs) => setLocalEvaluators(evs)).catch(console.error);
+ }
+ }, [section, setEvalSets, setEvaluators, setLocalEvaluators, setEvalRuns]);
+
+ // Auto-select latest run or first eval set when navigating to evals with no selection
+ const evalSets = useEvalStore((s) => s.evalSets);
+ const evalRuns = useEvalStore((s) => s.evalRuns);
+ useEffect(() => {
+ if (section !== "evals" || evalCreating || evalSetId || evalRunId) return;
+ // Pick latest run by start_time
+ const runs = Object.values(evalRuns).sort(
+ (a, b) => new Date(b.start_time ?? 0).getTime() - new Date(a.start_time ?? 0).getTime(),
+ );
+ if (runs.length > 0) {
+ navigate(`#/evals/runs/${runs[0].id}`);
+ return;
+ }
+ // Fallback: first eval set
+ const sets = Object.values(evalSets);
+ if (sets.length > 0) {
+ navigate(`#/evals/sets/${sets[0].id}`);
+ }
+ }, [section, evalCreating, evalSetId, evalRunId, evalRuns, evalSets, navigate]);
+
+ // Keyboard shortcuts
+ useEffect(() => {
+ const onKeyDown = (e: KeyboardEvent) => {
+ if (e.key === "Escape" && sidebarOpen) {
+ setSidebarOpen(false);
+ }
+ };
+ window.addEventListener("keydown", onKeyDown);
+ return () => window.removeEventListener("keydown", onKeyDown);
+ }, [sidebarOpen]);
+
const selectedRun = selectedRunId ? runs[selectedRunId] : null;
// Shared helper: apply a full run detail response to the store
@@ -169,70 +240,188 @@ export default function App() {
}, [selectedRunId, selectedRun?.status, applyRunDetail]);
const handleRunCreated = (runId: string) => {
- navigate(`#/runs/${runId}/traces`);
+ navigate(`#/debug/runs/${runId}/traces`);
selectRun(runId);
setSidebarOpen(false);
};
const handleSelectRun = (runId: string) => {
- navigate(`#/runs/${runId}/traces`);
+ navigate(`#/debug/runs/${runId}/traces`);
selectRun(runId);
setSidebarOpen(false);
};
const handleNewRun = () => {
- navigate("#/new");
+ navigate("#/debug/new");
setSidebarOpen(false);
};
-  return (
-    <div>
-      {/* Mobile hamburger button */}
-      {isMobile && !sidebarOpen && (
-        <button onClick={() => setSidebarOpen(true)}>☰</button>
-      )}
-      <Sidebar
+  const handleSectionChange = (s: Section) => {
+    if (s === "debug") navigate("#/debug/new");
+    else if (s === "evals") navigate("#/evals");
+    else if (s === "evaluators") navigate("#/evaluators");
+  };
+
+  // --- Render main content based on section ---
+  const renderMainContent = () => {
+    if (section === "evals") {
+      if (evalCreating) return <CreateEvalSetView />;
+      if (evalRunId) return <EvalRunResults runId={evalRunId} itemName={evalRunItemName} />;
+      if (evalSetId) return <EvalSetDetail evalSetId={evalSetId} />;
+      return null;
+    }
+
+    if (section === "evaluators") {
+      if (evaluatorCreateType) {
+        return <CreateEvaluatorView type={evaluatorCreateType} />;
+      }
+      return <EvaluatorsView evaluatorId={evaluatorId} filter={evaluatorFilter} />;
+    }
+
+    // Debug section
+    if (view === "new") {
+      return <NewRunPanel onRunCreated={handleRunCreated} />;
+    }
+    if (view === "setup" && setupEntrypoint && setupMode) {
+      return (
+        <SetupView
+          entrypoint={setupEntrypoint}
+          mode={setupMode}
+          onClose={() => setSidebarOpen(false)}
/>
-      {view === "new" ? (
-        <NewRunPanel onRunCreated={handleRunCreated} />
-      ) : view === "setup" && setupEntrypoint && setupMode ? (
-        <SetupView entrypoint={setupEntrypoint} mode={setupMode} />
-      ) : selectedRun ? (
-        <RunDetailsPanel run={selectedRun} />
-      ) : (
-        <div>
-          Select a run or create a new one
-        </div>
+      );
+    }
+    if (selectedRun) {
+      return <RunDetailsPanel run={selectedRun} />;
+    }
+    return (
+      <div>
+        Select a run or create a new one
+      </div>
+    );
+  };
+
+  // --- Mobile layout ---
+  if (isMobile) {
+    return (
+      <div>
+        {!sidebarOpen && (
+          <button onClick={() => setSidebarOpen(true)}>☰</button>
+        )}
+        {sidebarOpen && (
+          <>
+            <DebugSidebar
+              onClose={() => setSidebarOpen(false)}
+            />
+          </>
)}
+        {renderMainContent()}
+        <StatusBar />
+        <ReloadToast />
+        <ToastContainer />
+      </div>
+    );
+  }
+
+  // --- Desktop layout ---
+  return (
+    <div>
+      {/* Left aside: shared header + ActivityBar + section sidebar */}
+      <ActivityBar section={section} onSectionChange={handleSectionChange} />
+      {section === "debug" && <DebugSidebar onSelectRun={handleSelectRun} onNewRun={handleNewRun} />}
+      {section === "evals" && <EvalsSidebar />}
+      {section === "evaluators" && <EvaluatorsSidebar />}
+      {renderMainContent()}
+      <StatusBar />
+      <ReloadToast />
+      <ToastContainer />
+    </div>
+  );
}
diff --git a/src/uipath/dev/server/frontend/src/api/eval-client.ts b/src/uipath/dev/server/frontend/src/api/eval-client.ts
new file mode 100644
index 0000000..63089b7
--- /dev/null
+++ b/src/uipath/dev/server/frontend/src/api/eval-client.ts
@@ -0,0 +1,112 @@
+import type { EvaluatorInfo, LocalEvaluator, EvalSetSummary, EvalSetDetail, EvalItem, EvalRunSummary, EvalRunDetail } from "../types/eval";
+
+const BASE = "/api";
+
+async function fetchJson<T>(url: string, options?: RequestInit): Promise<T> {
+ const res = await fetch(url, options);
+ if (!res.ok) {
+ let errorDetail;
+ try {
+ const body = await res.json();
+ errorDetail = body.detail || res.statusText;
+ } catch {
+ errorDetail = res.statusText;
+ }
+ const error = new Error(`HTTP ${res.status}`);
+ (error as any).detail = errorDetail;
+ (error as any).status = res.status;
+ throw error;
+ }
+ return res.json();
+}
+
+export async function listEvaluators(): Promise<EvaluatorInfo[]> {
+  return fetchJson<EvaluatorInfo[]>(`${BASE}/evaluators`);
+}
+
+export async function listEvalSets(): Promise<EvalSetSummary[]> {
+  return fetchJson<EvalSetSummary[]>(`${BASE}/eval-sets`);
+}
+
+export async function createEvalSet(body: {
+ name: string;
+ evaluator_refs: string[];
+}): Promise<EvalSetDetail> {
+  return fetchJson<EvalSetDetail>(`${BASE}/eval-sets`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify(body),
+ });
+}
+
+export async function addEvalItem(
+ evalSetId: string,
+  item: { name: string; inputs: Record<string, unknown>; expected_output: unknown },
+): Promise<EvalItem> {
+  return fetchJson<EvalItem>(`${BASE}/eval-sets/${encodeURIComponent(evalSetId)}/items`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify(item),
+ });
+}
+
+export async function getEvalSet(id: string): Promise<EvalSetDetail> {
+  return fetchJson<EvalSetDetail>(`${BASE}/eval-sets/${encodeURIComponent(id)}`);
+}
+
+export async function startEvalRun(evalSetId: string): Promise<EvalRunSummary> {
+  return fetchJson<EvalRunSummary>(`${BASE}/eval-sets/${encodeURIComponent(evalSetId)}/runs`, {
+ method: "POST",
+ });
+}
+
+export async function listEvalRuns(): Promise<EvalRunSummary[]> {
+  return fetchJson<EvalRunSummary[]>(`${BASE}/eval-runs`);
+}
+
+export async function getEvalRun(id: string): Promise<EvalRunDetail> {
+  return fetchJson<EvalRunDetail>(`${BASE}/eval-runs/${encodeURIComponent(id)}`);
+}
+
+export async function listLocalEvaluators(): Promise<LocalEvaluator[]> {
+  return fetchJson<LocalEvaluator[]>(`${BASE}/local-evaluators`);
+}
+
+export async function createLocalEvaluator(body: {
+ name: string;
+ description: string;
+ evaluator_type_id: string;
+  config: Record<string, unknown>;
+}): Promise<LocalEvaluator> {
+  return fetchJson<LocalEvaluator>(`${BASE}/local-evaluators`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify(body),
+ });
+}
+
+export async function updateEvalSetEvaluators(
+ evalSetId: string,
+ evaluatorRefs: string[],
+): Promise<EvalSetDetail> {
+  return fetchJson<EvalSetDetail>(`${BASE}/eval-sets/${encodeURIComponent(evalSetId)}/evaluators`, {
+ method: "PATCH",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify({ evaluator_refs: evaluatorRefs }),
+ });
+}
+
+export async function updateLocalEvaluator(
+ id: string,
+ body: {
+ description?: string;
+ evaluator_type_id?: string;
+    config?: Record<string, unknown>;
+  },
+): Promise<LocalEvaluator> {
+  return fetchJson<LocalEvaluator>(`${BASE}/local-evaluators/${encodeURIComponent(id)}`, {
+ method: "PUT",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify(body),
+ });
+}
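
The client above implies the REST surface added by routes/evals.py and routes/evaluators.py. For quick manual testing the same flow can be driven from Python; a sketch assuming a local dev server (the address, the "exact-match" evaluator ref, and the response fields are assumptions, not confirmed by this diff):

    import requests

    BASE = "http://127.0.0.1:8000/api"  # assumed dev-server address

    # Create an eval set, attach one item, then kick off a run.
    eval_set = requests.post(
        f"{BASE}/eval-sets",
        json={"name": "smoke", "evaluator_refs": ["exact-match"]},
    ).json()
    requests.post(
        f"{BASE}/eval-sets/{eval_set['id']}/items",
        json={"name": "greeting", "inputs": {"q": "hi"}, "expected_output": "hello"},
    ).raise_for_status()
    run = requests.post(f"{BASE}/eval-sets/{eval_set['id']}/runs").json()

    # Poll for results; the detail endpoint adds per-item results to the summary.
    detail = requests.get(f"{BASE}/eval-runs/{run['id']}").json()
    print(detail["status"], detail.get("overall_score"))
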
diff --git a/src/uipath/dev/server/frontend/src/components/chat/ChatInput.tsx b/src/uipath/dev/server/frontend/src/components/chat/ChatInput.tsx
index 55a3d72..95ec9e5 100644
--- a/src/uipath/dev/server/frontend/src/components/chat/ChatInput.tsx
+++ b/src/uipath/dev/server/frontend/src/components/chat/ChatInput.tsx
@@ -36,13 +36,14 @@ export default function ChatInput({ onSend, disabled, placeholder }: Props) {
onKeyDown={handleKeyDown}
disabled={disabled}
placeholder={placeholder ?? "Message..."}
- className="flex-1 bg-transparent text-sm py-1 focus:outline-none disabled:opacity-40 placeholder:text-[var(--text-muted)]"
+ className="flex-1 bg-transparent text-sm py-1 disabled:opacity-40 placeholder:text-[var(--text-muted)]"
style={{ color: "var(--text-primary)" }}
/>