Skip to content

evaluate_traces

autogen.beta.eval.runtime.evaluate.evaluate_traces async #

evaluate_traces(source, *, scorers, store_dir, suite=None, budgets=None, concurrency=4, run_id=None, label=None, stream=None)

Grade every trace from `source` and persist a `RunResult`.

PARAMETER DESCRIPTION
source

Where the traces come from (in-memory, disk, or cloud).

TYPE: TraceSource

scorers

Scorer instances; each runs once per trace.

TYPE: Iterable[Scorer]

store_dir

Directory the run JSON is written to as <run_id>.json.

TYPE: str | PathLike[str]

suite

Optional dataset to join traces to by TraceRef.task_id for reference-based scorers. When omitted, a suite is synthesized from the traces and scoring is reference-free.

TYPE: Suite | None DEFAULT: None

budgets

Optional observational thresholds; violations are recorded but never abort the run.

TYPE: BudgetThresholds | None DEFAULT: None

concurrency

Max traces graded in parallel.

TYPE: int DEFAULT: 4

run_id

Override for the auto-generated run id.

TYPE: str | None DEFAULT: None

label

Optional user-defined identifier recorded on the run — meant to be shared across runs of the same eval so they can be grouped and trended. None if unset; the framework never fills it.

TYPE: str | None DEFAULT: None

stream

Optional `autogen.beta.stream.Stream` to publish eval lifecycle events to (`EvalStarted` / `TaskEvaluated` / `EvalCompleted`), so that a grading pass can be observed the same way an agent is observed.

TYPE: Stream | None DEFAULT: None

Source code in autogen/beta/eval/runtime/evaluate.py
async def evaluate_traces(
    source: TraceSource,
    *,
    scorers: Iterable[Scorer],
    store_dir: str | os.PathLike[str],
    suite: Suite | None = None,
    budgets: BudgetThresholds | None = None,
    concurrency: int = 4,
    run_id: str | None = None,
    label: str | None = None,
    stream: Stream | None = None,
) -> RunResult:
    """Score all traces supplied by ``source`` and save the outcome as a :class:`RunResult`.

    This is a thin entry point: it takes a clock reading, freezes ``scorers``
    into a tuple, tags the run with the class name of ``source`` and hands
    everything to ``_grade``.

    Args:
        source: Provider of the traces to grade (in-memory, disk, or cloud).
        scorers: Scorer instances, each applied once to every trace. The
            iterable is consumed once and materialized into a tuple.
        store_dir: Directory that receives the run JSON, written as
            ``<run_id>.json``.
        suite: Optional dataset that traces are joined to via
            ``TraceRef.task_id`` so reference-based scorers can run. If left
            out, a suite is synthesized from the traces and scoring is
            reference-free.
        budgets: Optional observational thresholds. Violations are recorded
            and never abort the run.
        concurrency: Upper bound on the number of traces graded in parallel.
        run_id: Replaces the auto-generated run id when given.
        label: Optional user-defined identifier stored on the run. It is
            intended to be *shared* by runs of the same eval so they can be
            grouped and trended. Stays ``None`` when unset; the framework
            never fills it in.
        stream: Optional :class:`~autogen.beta.stream.Stream` that receives the
            eval lifecycle events (``EvalStarted`` / ``TaskEvaluated`` /
            ``EvalCompleted``), so a grading pass can be observed the same way
            an agent is.

    Returns:
        The :class:`RunResult` returned by ``_grade``.
    """
    # Read the clock first so the reading precedes consumption of ``scorers``.
    clock_start = time.perf_counter()

    # Materialize once; ``scorers`` may be a one-shot iterable.
    scorer_tuple = tuple(scorers)

    # Identifies the run's target by the concrete class of the trace source.
    source_tag = f"trace-source:{type(source).__name__}"

    run_result = await _grade(
        source,
        scorers=scorer_tuple,
        suite=suite,
        store_dir=store_dir,
        budgets=budgets,
        concurrency=concurrency,
        run_id=run_id,
        label=label,
        stream=stream,
        target_path=source_tag,
        started_at=clock_start,
    )
    return run_result