Skip to content

evaluate_traces

autogen.beta.eval.runtime.evaluate.evaluate_traces `async` #

evaluate_traces(source, *, scorers, store_dir, suite=None, budgets=None, concurrency=4, run_id=None, label=None, stream=None)

Grade every trace from `source` and persist a `RunResult`.

PARAMETER	DESCRIPTION
`source`	Where the traces come from (in-memory, disk, or cloud). TYPE: `TraceSource`
`scorers`	Scorer instances; each runs once per trace. TYPE: `Iterable[Scorer]`
`store_dir`	Directory the run JSON is written to as `<run_id>.json`. TYPE: `str \| PathLike[str]`
`suite`	Optional dataset to join traces to by `TraceRef.task_id` for reference-based scorers. When omitted, a suite is synthesized from the traces and scoring is reference-free. TYPE: `Suite \| None` DEFAULT: `None`
`budgets`	Optional observational thresholds; violations are recorded but never abort the run. TYPE: `BudgetThresholds \| None` DEFAULT: `None`
`concurrency`	Max traces graded in parallel. TYPE: `int` DEFAULT: `4`
`run_id`	Override for the auto-generated run id. TYPE: `str \| None` DEFAULT: `None`
`label`	Optional user-defined identifier recorded on the run — meant to be shared across runs of the same eval so they can be grouped and trended. `None` if unset; the framework never fills it. TYPE: `str \| None` DEFAULT: `None`
`stream`	Optional `autogen.beta.stream.Stream` to publish eval lifecycle events to (`EvalStarted` / `TaskEvaluated` / `EvalCompleted`) — observe a grading pass like you observe an agent. TYPE: `Stream \| None` DEFAULT: `None`

Source code in autogen/beta/eval/runtime/evaluate.py

async def evaluate_traces(
    source: TraceSource,
    *,
    scorers: Iterable[Scorer],
    store_dir: str | os.PathLike[str],
    suite: Suite | None = None,
    budgets: BudgetThresholds | None = None,
    concurrency: int = 4,
    run_id: str | None = None,
    label: str | None = None,
    stream: Stream | None = None,
) -> RunResult:
    """Score all traces supplied by ``source`` and persist the :class:`RunResult`.

    This coroutine is a thin entry point: it records a start time, freezes
    ``scorers`` into a tuple, derives a ``trace-source:<ClassName>`` target
    label from the type of ``source``, and delegates the grading to ``_grade``.

    Args:
        source: Origin of the traces to grade (in-memory, disk, or cloud).
        scorers: Scorer instances; every scorer is applied once to each trace.
        store_dir: Directory that receives the run JSON, named
            ``<run_id>.json``.
        suite: Optional dataset joined to the traces via ``TraceRef.task_id``
            so reference-based scorers can run. If left out, a suite is
            synthesized from the traces and scoring is reference-free.
        budgets: Optional observational thresholds; a violation is recorded
            and never aborts the run.
        concurrency: Upper bound on how many traces are graded in parallel.
        run_id: Replacement for the auto-generated run id.
        label: Optional user-chosen identifier stored on the run. It is meant
            to be *shared* by runs of the same eval so they can be grouped
            and trended. Stays ``None`` when unset; the framework never
            fills it in.
        stream: Optional :class:`~autogen.beta.stream.Stream` that receives
            the eval lifecycle events (``EvalStarted`` / ``TaskEvaluated`` /
            ``EvalCompleted``), so a grading pass can be observed the same
            way an agent is.

    Returns:
        The :class:`RunResult` handed back by the grading pass.
    """
    # Take the timestamp first so it precedes all other work in this call.
    clock_start = time.perf_counter()

    # The parameter accepts any iterable; pass a concrete tuple downstream.
    frozen_scorers = tuple(scorers)

    # Label the run target with the concrete class name of the trace source.
    source_label = f"trace-source:{type(source).__name__}"

    run_result = await _grade(
        source,
        scorers=frozen_scorers,
        suite=suite,
        store_dir=store_dir,
        budgets=budgets,
        concurrency=concurrency,
        run_id=run_id,
        label=label,
        stream=stream,
        target_path=source_label,
        started_at=clock_start,
    )
    return run_result