3 changes: 2 additions & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "uipath-dev"
version = "0.0.60"
version = "0.0.61"
description = "UiPath Developer Console"
readme = { file = "README.md", content-type = "text/markdown" }
requires-python = ">=3.11"
@@ -10,6 +10,7 @@ dependencies = [
"pyperclip>=1.11.0, <2.0.0",
"fastapi>=0.128.8",
"uvicorn[standard]>=0.40.0",
"uipath"
]
classifiers = [
"Intended Audience :: Developers",
125 changes: 125 additions & 0 deletions src/uipath/dev/models/eval_data.py
@@ -0,0 +1,125 @@
"""Data models for evaluation runs."""

from __future__ import annotations

import uuid
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Any


@dataclass
class EvalSetInfo:
"""Summary of a discovered evaluation set."""

id: str
name: str
eval_count: int
evaluator_ids: list[str]


@dataclass
class EvalItemResult:
"""Result of evaluating a single item."""

name: str
inputs: dict[str, Any] = field(default_factory=dict)
expected_output: Any = None
scores: dict[str, float] = field(default_factory=dict)
overall_score: float = 0.0
output: Any = None
justifications: dict[str, str] = field(default_factory=dict)
duration_ms: float | None = None
status: str = "pending" # pending | running | completed | failed
traces: list[dict[str, Any]] = field(default_factory=list)


@dataclass
class EvalRunState:
"""Full state of an eval run."""

id: str = field(default_factory=lambda: str(uuid.uuid4()))
eval_set_id: str = ""
eval_set_name: str = ""
status: str = "pending" # pending | running | completed | failed
progress_completed: int = 0
progress_total: int = 0
overall_score: float | None = None
evaluator_scores: dict[str, float] = field(default_factory=dict)
results: list[EvalItemResult] = field(default_factory=list)
start_time: datetime | None = None
end_time: datetime | None = None

def to_summary(self) -> dict[str, Any]:
"""Serialize to summary dict (no per-item results)."""
return {
"id": self.id,
"eval_set_id": self.eval_set_id,
"eval_set_name": self.eval_set_name,
"status": self.status,
"progress_completed": self.progress_completed,
"progress_total": self.progress_total,
"overall_score": self.overall_score,
"evaluator_scores": self.evaluator_scores,
"start_time": self.start_time.isoformat() if self.start_time else None,
"end_time": self.end_time.isoformat() if self.end_time else None,
}

def to_detail(self) -> dict[str, Any]:
"""Serialize to detail dict (includes per-item results)."""
base = self.to_summary()
base["results"] = [
{
"name": r.name,
"inputs": r.inputs,
"expected_output": r.expected_output,
"scores": r.scores,
"overall_score": r.overall_score,
"output": str(r.output)
if isinstance(r.output, Exception)
else r.output,
"justifications": r.justifications,
"duration_ms": r.duration_ms,
"status": r.status,
"traces": r.traces,
}
for r in self.results
]
return base

def start(self) -> None:
"""Mark run as started."""
self.status = "running"
self.start_time = datetime.now(timezone.utc)

def complete(self) -> None:
"""Mark run as completed, computing final scores."""
self.status = "completed"
self.end_time = datetime.now(timezone.utc)
self._compute_scores()

def fail(self) -> None:
"""Mark run as failed."""
self.status = "failed"
self.end_time = datetime.now(timezone.utc)

def _compute_scores(self) -> None:
"""Compute overall and per-evaluator scores from item results."""
completed = [r for r in self.results if r.status == "completed"]
if not completed:
self.overall_score = 0.0
return

# Per-evaluator averages
evaluator_totals: dict[str, list[float]] = {}
for r in completed:
for ev_id, score in r.scores.items():
evaluator_totals.setdefault(ev_id, []).append(score)

self.evaluator_scores = {
ev_id: sum(scores) / len(scores)
for ev_id, scores in evaluator_totals.items()
}

# Overall = average of item overall_scores
self.overall_score = sum(r.overall_score for r in completed) / len(completed)
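
As a quick sanity check on the new models, here is a usage sketch. It is not part of the diff; the set id, item names, and the "exact-match" evaluator id are invented for illustration.

from uipath.dev.models.eval_data import EvalItemResult, EvalRunState

run = EvalRunState(eval_set_id="demo-set", eval_set_name="Demo", progress_total=2)
run.start()

# Two completed items scored by a single (hypothetical) evaluator.
run.results.append(
    EvalItemResult(name="item-1", scores={"exact-match": 1.0}, overall_score=1.0, status="completed")
)
run.results.append(
    EvalItemResult(name="item-2", scores={"exact-match": 0.5}, overall_score=0.5, status="completed")
)
run.progress_completed = 2

run.complete()  # sets end_time and averages the item scores
assert run.overall_score == 0.75
assert run.evaluator_scores == {"exact-match": 0.75}
assert run.to_summary()["status"] == "completed"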
30 changes: 30 additions & 0 deletions src/uipath/dev/server/__init__.py
@@ -24,8 +24,10 @@
    StateData,
    TraceData,
)
from uipath.dev.models.eval_data import EvalItemResult, EvalRunState
from uipath.dev.models.execution import ExecutionRun
from uipath.dev.server.debug_bridge import WebDebugBridge
from uipath.dev.services.eval_service import EvalService
from uipath.dev.services.run_service import RunService

logger = logging.getLogger(__name__)
@@ -86,6 +88,14 @@ def __init__(
            on_run_removed=self.connection_manager.remove_run_subscriptions,
        )

        self.eval_service = EvalService(
            runtime_factory=self.runtime_factory,
            trace_manager=self.trace_manager,
            on_eval_run_created=self._on_eval_run_created,
            on_eval_run_progress=self._on_eval_run_progress,
            on_eval_run_completed=self._on_eval_run_completed,
        )

    def create_app(self) -> Any:
        """Create and return a FastAPI application."""
        from uipath.dev.server.app import create_app
@@ -231,6 +241,26 @@ def _on_state(self, state_data: StateData) -> None:
"""Broadcast state transition to subscribed WebSocket clients."""
self.connection_manager.broadcast_state(state_data)

def _on_eval_run_created(self, run: EvalRunState) -> None:
"""Broadcast eval run created to all connected clients."""
self.connection_manager.broadcast_eval_run_created(run)

def _on_eval_run_progress(
self,
run_id: str,
completed: int,
total: int,
item_result: EvalItemResult | None,
) -> None:
"""Broadcast eval run progress to all connected clients."""
self.connection_manager.broadcast_eval_run_progress(
run_id, completed, total, item_result
)

def _on_eval_run_completed(self, run: EvalRunState) -> None:
"""Broadcast eval run completed to all connected clients."""
self.connection_manager.broadcast_eval_run_completed(run)

@staticmethod
def _find_free_port(host: str, start_port: int, max_attempts: int = 100) -> int:
"""Find a free port starting from *start_port*.
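
The EvalService wiring above mirrors the RunService pattern: the server hands broadcast hooks to the service, which calls them back as a run advances. The sketch below illustrates that callback contract only; the signatures come from the handlers in this diff, but the driver loop is hypothetical and not EvalService's actual implementation.

from collections.abc import Callable

from uipath.dev.models.eval_data import EvalItemResult, EvalRunState


class EvalServiceSketch:
    """Illustrative stand-in showing the callback contract, not the real EvalService."""

    def __init__(
        self,
        on_eval_run_created: Callable[[EvalRunState], None],
        on_eval_run_progress: Callable[[str, int, int, EvalItemResult | None], None],
        on_eval_run_completed: Callable[[EvalRunState], None],
    ) -> None:
        self._created = on_eval_run_created
        self._progress = on_eval_run_progress
        self._completed = on_eval_run_completed

    def run(self, run: EvalRunState) -> None:
        # Hypothetical driver loop: announce creation, report per-item
        # progress, then broadcast completion with final scores.
        self._created(run)
        run.start()
        for i, item in enumerate(run.results, start=1):
            run.progress_completed = i
            self._progress(run.id, i, run.progress_total, item)
        run.complete()
        self._completed(run)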
4 changes: 4 additions & 0 deletions src/uipath/dev/server/app.py
@@ -150,6 +150,8 @@ async def _config():

    # Register routes
    from uipath.dev.server.routes.entrypoints import router as entrypoints_router
    from uipath.dev.server.routes.evals import router as evals_router
    from uipath.dev.server.routes.evaluators import router as evaluators_router
    from uipath.dev.server.routes.graph import router as graph_router
    from uipath.dev.server.routes.reload import router as reload_router
    from uipath.dev.server.routes.runs import router as runs_router
@@ -166,6 +168,8 @@ async def _config():
    app.include_router(runs_router, prefix="/api")
    app.include_router(graph_router, prefix="/api")
    app.include_router(reload_router, prefix="/api")
    app.include_router(evaluators_router, prefix="/api")
    app.include_router(evals_router, prefix="/api")
    app.include_router(ws_router)

    # Auto-build frontend if source is available and build is stale
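
Finally, the app.py change simply mounts the two new routers under the existing /api prefix. A self-contained illustration of what that buys follows; the /evals/runs path is invented for the example, since the real paths live in routes/evals.py and routes/evaluators.py, which this diff does not show.

from fastapi import APIRouter, FastAPI
from fastapi.testclient import TestClient

router = APIRouter()


@router.get("/evals/runs")  # hypothetical path for illustration
def list_eval_runs() -> list[dict]:
    return []


app = FastAPI()
app.include_router(router, prefix="/api")  # same pattern as the diff

client = TestClient(app)
assert client.get("/api/evals/runs").status_code == 200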