agent_judge

autogen.beta.eval.scorers.judge.agent_judge

agent_judge(config, *, criterion, key, scale=(0.0, 1.0), include_trace=False, retries=1, middleware=())

Build a single-purpose Agent-as-judge `Scorer`.

PARAMETER	DESCRIPTION
`config`	Model config for the judge agent (e.g. an `AnthropicConfig`; pin temperature to 0 for stable grading). TYPE: `ModelConfig`
`criterion`	The single standard this judge grades against, in plain English. One judge grades one criterion — compose several judges for a multi-dimensional scorecard. TYPE: `str`
`key`	The `Feedback` key this judge emits; becomes its column in `RunResult` aggregates. Use a distinct key per criterion. TYPE: `str`
`scale`	`(low, high)` numeric range. Enforced — a score outside the range is clamped to the nearest bound (and the clamp is noted in the feedback comment). Default `(0.0, 1.0)`. TYPE: `tuple[float, float]` DEFAULT: `(0.0, 1.0)`
`include_trace`	When `True`, the agent's tool-call trajectory (calls, results, errors) is rendered into the judge prompt (process grading). Default grades the final answer only. TYPE: `bool` DEFAULT: `False`
`retries`	How many times `content()` re-asks the judge if its output fails `Verdict` validation. Default `1`. TYPE: `int` DEFAULT: `1`
`middleware`	Middleware factories attached to the judge agent. Pass `TelemetryMiddleware` here to capture the judge's own LLM spans / token usage (judge cost), tracked separately from the agent graded. TYPE: `Iterable[MiddlewareFactory]` DEFAULT: `()`

Source code in autogen/beta/eval/scorers/judge.py

def agent_judge(
    config: ModelConfig,
    *,
    criterion: str,
    key: str,
    scale: tuple[float, float] = (0.0, 1.0),
    include_trace: bool = False,
    retries: int = 1,
    middleware: Iterable[MiddlewareFactory] = (),
) -> Scorer:
    """Build a single-purpose Agent-as-judge :class:`Scorer`.

    Args:
        config: Model config for the judge agent (e.g. an ``AnthropicConfig``;
            pin temperature to 0 for stable grading).
        criterion: The single standard this judge grades against, in plain
            English. One judge grades one criterion — compose several judges
            for a multi-dimensional scorecard.
        key: The ``Feedback`` key this judge emits; becomes its column in
            ``RunResult`` aggregates. Use a distinct key per criterion.
        scale: ``(low, high)`` numeric range. **Enforced** — a score outside the
            range is clamped to the nearest bound (and the clamp is noted in the
            feedback comment). A NaN score cannot be clamped and is reported as
            ``score=None`` instead. Default ``(0.0, 1.0)``.
        include_trace: When ``True``, the agent's tool-call trajectory (calls,
            results, errors) is rendered into the judge prompt (process grading).
            Default grades the final answer only.
        retries: How many times ``content()`` re-asks the judge if its output
            fails :class:`Verdict` validation. Default ``1``.
        middleware: Middleware factories attached to the judge agent. Pass
            ``TelemetryMiddleware`` here to capture the judge's own LLM spans /
            token usage (judge cost), tracked separately from the agent graded.

    Returns:
        A :class:`Scorer` keyed by ``key`` that wraps the async judge callback.

    Raises:
        ValueError: If ``scale`` is not an ordered ``(low, high)`` pair, i.e.
            ``low`` is greater than ``high`` or either bound is NaN.
    """
    low, high = scale
    # Fail fast on an unusable range: with ``low > high`` the clamp below would
    # silently rewrite every score to ``high``. Writing the check as
    # ``not low <= high`` also rejects NaN bounds, which compare false to everything.
    if not low <= high:
        raise ValueError(f"scale must be an ordered (low, high) pair, got {scale!r}")

    judge = Agent(
        f"judge_{key}",
        _system_prompt(criterion, low, high),
        config=config,
        response_schema=Verdict,
        middleware=middleware,
    )

    async def _judge(
        inputs: dict[str, Any],
        outputs: dict[str, Any],
        reference_outputs: dict[str, Any] | None,
        trace: Trace,
    ) -> Feedback:
        """Ask the judge agent for a verdict and convert it into ``Feedback``."""
        prompt = _render_prompt(inputs, outputs, reference_outputs, trace, include_trace=include_trace)
        reply = await judge.ask(prompt)
        verdict = await reply.content(retries=retries)
        if verdict is None:
            return Feedback(key=key, score=None, comment="judge returned no verdict")

        raw_score = verdict.score
        # NaN is the only value that is unequal to itself. It slips through
        # min/max untouched (every comparison is false), so it must be handled
        # explicitly rather than reported as a "clamped" score.
        if raw_score != raw_score:
            return Feedback(
                key=key,
                score=None,
                comment=f"{verdict.reasoning} [judge returned a NaN score; no usable score]",
            )

        score = min(max(raw_score, low), high)
        comment = verdict.reasoning
        if score != raw_score:
            # Record the enforcement so a reader can see the judge's raw score.
            comment = f"{comment} [score clamped from {raw_score} to scale {low}-{high}]"
        return Feedback(key=key, score=score, comment=comment)

    return Scorer(_judge, key=key)