Skip to content

agent_judge

autogen.beta.eval.scorers.judge.agent_judge #

agent_judge(config, *, criterion, key, scale=(0.0, 1.0), include_trace=False, retries=1, middleware=())

Build a single-purpose Agent-as-judge `Scorer`.

PARAMETER DESCRIPTION
config

Model config for the judge agent (e.g. an AnthropicConfig; pin temperature to 0 for stable grading).

TYPE: ModelConfig

criterion

The single standard this judge grades against, in plain English. One judge grades one criterion — compose several judges for a multi-dimensional scorecard.

TYPE: str

key

The Feedback key this judge emits; becomes its column in RunResult aggregates. Use a distinct key per criterion.

TYPE: str

scale

(low, high) numeric range. Enforced — a score outside the range is clamped to the nearest bound (and the clamp is noted in the feedback comment). Default (0.0, 1.0).

TYPE: tuple[float, float] DEFAULT: (0.0, 1.0)

include_trace

When True, the agent's tool-call trajectory (calls, results, errors) is rendered into the judge prompt (process grading). Default grades the final answer only.

TYPE: bool DEFAULT: False

retries

How many times `content()` re-asks the judge if its output fails `Verdict` validation. Default 1.

TYPE: int DEFAULT: 1

middleware

Middleware factories attached to the judge agent. Pass TelemetryMiddleware here to capture the judge's own LLM spans / token usage (judge cost), tracked separately from the agent graded.

TYPE: Iterable[MiddlewareFactory] DEFAULT: ()

Source code in autogen/beta/eval/scorers/judge.py
def agent_judge(
    config: ModelConfig,
    *,
    criterion: str,
    key: str,
    scale: tuple[float, float] = (0.0, 1.0),
    include_trace: bool = False,
    retries: int = 1,
    middleware: Iterable[MiddlewareFactory] = (),
) -> Scorer:
    """Build a single-purpose Agent-as-judge :class:`Scorer`.

    Args:
        config: Model config for the judge agent (e.g. an ``AnthropicConfig``;
            pin temperature to 0 for stable grading).
        criterion: The single standard this judge grades against, in plain
            English. One judge grades one criterion — compose several judges
            for a multi-dimensional scorecard.
        key: The ``Feedback`` key this judge emits; becomes its column in
            ``RunResult`` aggregates. Use a distinct key per criterion. It is
            also used to name the judge agent (``judge_<key>``).
        scale: ``(low, high)`` numeric range with ``low <= high``.
            **Enforced** — a score outside the range is clamped to the nearest
            bound (and the clamp is noted in the feedback comment). Default
            ``(0.0, 1.0)``.
        include_trace: When ``True``, the agent's tool-call trajectory (calls,
            results, errors) is rendered into the judge prompt (process grading).
            Default grades the final answer only.
        retries: How many times ``content()`` re-asks the judge if its output
            fails :class:`Verdict` validation. Default ``1``.
        middleware: Middleware factories attached to the judge agent. Pass
            ``TelemetryMiddleware`` here to capture the judge's own LLM spans /
            token usage (judge cost), tracked separately from the agent graded.

    Returns:
        A :class:`Scorer` registered under ``key`` that wraps the async judge
        function defined below.

    Raises:
        ValueError: If ``scale`` does not satisfy ``low <= high`` (inverted
            bounds, or a NaN bound).
    """
    low, high = scale
    # Validate before building the agent: with low > high the clamp below would
    # return `high` for every verdict, silently flattening all scores. Written
    # as `not (low <= high)` so a NaN bound (all comparisons False) is rejected.
    if not (low <= high):
        raise ValueError(f"scale must satisfy low <= high, got {scale!r}")

    judge = Agent(
        f"judge_{key}",
        _system_prompt(criterion, low, high),
        config=config,
        response_schema=Verdict,
        middleware=middleware,
    )

    async def _judge(
        inputs: dict[str, Any],
        outputs: dict[str, Any],
        reference_outputs: dict[str, Any] | None,
        trace: Trace,
    ) -> Feedback:
        """Ask the judge agent for a verdict on one example and wrap it as ``Feedback``."""
        prompt = _render_prompt(inputs, outputs, reference_outputs, trace, include_trace=include_trace)
        reply = await judge.ask(prompt)
        verdict = await reply.content(retries=retries)
        if verdict is None:
            # No usable verdict: report an unscored feedback entry rather than
            # inventing a number.
            return Feedback(key=key, score=None, comment="judge returned no verdict")
        # Enforce the declared scale by clamping to the nearest bound.
        score = min(max(verdict.score, low), high)
        comment = verdict.reasoning
        if score != verdict.score:
            # Keep the raw score visible so a clamp is never silent.
            comment = f"{comment} [score clamped from {verdict.score} to scale {low}-{high}]"
        return Feedback(key=key, score=score, comment=comment)

    return Scorer(_judge, key=key)