def agent_judge(
    config: ModelConfig,
    *,
    criterion: str,
    key: str,
    scale: tuple[float, float] = (0.0, 1.0),
    include_trace: bool = False,
    retries: int = 1,
    middleware: Iterable[MiddlewareFactory] = (),
) -> Scorer:
    """Build a single-purpose Agent-as-judge :class:`Scorer`.

    Args:
        config: Model config for the judge agent (e.g. an ``AnthropicConfig``;
            pin temperature to 0 for stable grading).
        criterion: The single standard this judge grades against, in plain
            English. One judge grades one criterion — compose several judges
            for a multi-dimensional scorecard.
        key: The ``Feedback`` key this judge emits; becomes its column in
            ``RunResult`` aggregates. Use a distinct key per criterion.
        scale: ``(low, high)`` numeric range; must satisfy ``low <= high``.
            **Enforced** — a score outside the range is clamped to the nearest
            bound (and the clamp is noted in the feedback comment). A NaN
            score cannot be clamped and is reported as ``score=None`` instead.
            Default ``(0.0, 1.0)``.
        include_trace: When ``True``, the agent's tool-call trajectory (calls,
            results, errors) is rendered into the judge prompt (process grading).
            Default grades the final answer only.
        retries: How many times ``content()`` re-asks the judge if its output
            fails :class:`Verdict` validation. Default ``1``.
        middleware: Middleware factories attached to the judge agent. Pass
            ``TelemetryMiddleware`` here to capture the judge's own LLM spans /
            token usage (judge cost), tracked separately from the agent graded.

    Returns:
        A :class:`Scorer` registered under ``key`` that wraps an async scoring
        function. Each call prompts the judge agent and converts its
        :class:`Verdict` into a ``Feedback``; the feedback's ``score`` is
        ``None`` when the judge yields no verdict or a NaN score.

    Raises:
        ValueError: If ``scale`` is not ordered ``low <= high`` (this also
            rejects NaN bounds).
    """
    # Function-scope import so the block does not depend on a module-level
    # ``import math`` being present in the file.
    import math

    low, high = scale
    # Fail fast: with low > high the clamp below would silently turn every
    # score into ``high``. Written as ``not low <= high`` so that NaN bounds
    # (for which every comparison is False) are rejected as well.
    if not low <= high:
        raise ValueError(f"scale must satisfy low <= high, got {scale!r}")

    judge = Agent(
        f"judge_{key}",
        _system_prompt(criterion, low, high),
        config=config,
        response_schema=Verdict,
        middleware=middleware,
    )

    async def _judge(
        inputs: dict[str, Any],
        outputs: dict[str, Any],
        reference_outputs: dict[str, Any] | None,
        trace: Trace,
    ) -> Feedback:
        """Ask the judge agent for a verdict and turn it into ``Feedback``."""
        prompt = _render_prompt(
            inputs, outputs, reference_outputs, trace, include_trace=include_trace
        )
        reply = await judge.ask(prompt)
        verdict = await reply.content(retries=retries)
        if verdict is None:
            return Feedback(key=key, score=None, comment="judge returned no verdict")

        raw_score = verdict.score
        comment = verdict.reasoning
        # NaN slips through min/max untouched (all comparisons with NaN are
        # False), so it would escape the scale and be mislabelled as
        # "clamped". Treat it as an unusable score instead.
        if math.isnan(raw_score):
            return Feedback(
                key=key,
                score=None,
                comment=f"{comment} [score discarded: judge returned NaN]",
            )

        score = min(max(raw_score, low), high)
        if score != raw_score:
            comment = f"{comment} [score clamped from {raw_score} to scale {low}-{high}]"
        return Feedback(key=key, score=score, comment=comment)

    return Scorer(_judge, key=key)