RunResult

autogen.beta.eval.results.result.RunResult #

RunResult(*, run_id, tasks, suite, target_path, concurrency, duration_ms, created_at, label=None, store_dir=None)

The result of a full `run_agent` run.

Holds per-task records, run-level metadata, and computed aggregates. Lookup helpers (`pass_rate`, `score_stats`, `value_counts`) surface single keys; `summary()` renders a printable table; `save()` writes the schema-0.1 JSON.

Aggregates are computed once at construction time so repeated lookups are cheap.

Source code in autogen/beta/eval/results/result.py

def __init__(
    self,
    *,
    run_id: str,
    tasks: tuple[TaskResult, ...],
    suite: Suite,
    target_path: str,
    concurrency: int,
    duration_ms: int,
    created_at: str,
    label: str | None = None,
    store_dir: str | os.PathLike[str] | None = None,
) -> None:
    """Capture a finished run's records and metadata and precompute its aggregates.

    All arguments are keyword-only and are stored on private attributes.

    Args:
        run_id: Identifier for this run.
        tasks: Per-task records; also the input to the aggregate computation.
        suite: The ``Suite`` that was executed.
        target_path: Provenance string of the evaluated target.
        concurrency: Concurrency cap the runner used.
        duration_ms: Duration of the full run, in milliseconds.
        created_at: Timestamp string for the run.
        label: Optional user-defined label; ``None`` if unset.
        store_dir: Optional directory associated with this run. Converted
            to a :class:`Path` when given, otherwise kept as ``None``.
    """
    # What was run, and against what.
    self._suite = suite
    self._tasks = tasks
    self._target_path = target_path

    # Identity and bookkeeping for the run itself.
    self._run_id = run_id
    self._label = label
    self._created_at = created_at
    self._duration_ms = duration_ms
    self._concurrency = concurrency

    # Normalise the optional storage location to a Path up front.
    self._store_dir = None if store_dir is None else Path(store_dir)

    # Whole-run aggregates are computed eagerly, once; ``_sliced`` starts
    # out empty.
    self._aggregates = _compute_aggregates(tasks)
    self._sliced: dict[str, Aggregates] = {}

run_id `property` #

run_id

Stable identifier for this run — UUID4 hex unless the caller passed one.

schema_version `property` #

schema_version

Run JSON schema version. Always "0.1" in v0.

tasks `property` #

tasks

Per-task records, in suite order.

suite `property` #

suite

The Suite that was executed.

target_path `property` #

target_path

"<module>:<name>" provenance of the evaluated target (factory or instance type).

concurrency `property` #

concurrency

Concurrency cap the runner used (asyncio.Semaphore bound).

duration_ms `property` #

duration_ms

Wall-clock duration of the full run, in milliseconds.

created_at `property` #

created_at

ISO-8601 UTC timestamp of when this run started.

label `property` #

label

User-defined identifier grouping runs of the same eval over time (None if unset).

aggregates `property` #

aggregates

Run-level rollups computed from the per-task feedback and traces.

tags `property` #

tags

Every tag present across the run's tasks — the values usable in tag= lookups.

pass_rate #

pass_rate(key, *, tag=None)

Pass rate for a boolean scorer (0.0 if no boolean feedback under key).

Pass tag to compute it over only the tasks carrying that tag — e.g. pass_rate("tool_called", tag="adversarial"). When tag is unset, the whole run is used.

Source code in autogen/beta/eval/results/result.py

def pass_rate(self, key: str, *, tag: str | None = None) -> float:
    """Pass rate for a boolean scorer (``0.0`` if no boolean feedback under ``key``).

    Pass ``tag`` to compute it over only the tasks carrying that tag — e.g.
    ``pass_rate("tool_called", tag="adversarial")``. Unset slices the whole run.
    """
    return self._agg(tag).pass_rate.get(key, 0.0)

score_stats #

score_stats(key, *, tag=None)

Numeric stats for a scorer (zeros when nothing numeric under key).

Pass tag to restrict to tasks carrying that tag.

Source code in autogen/beta/eval/results/result.py

def score_stats(self, key: str, *, tag: str | None = None) -> ScoreStats:
    """Return the numeric statistics recorded for the scorer ``key``.

    Args:
        key: Scorer key to look up.
        tag: When given, use the aggregates for tasks carrying this tag;
            when ``None``, use the whole run.

    Returns:
        The stored :class:`ScoreStats` for ``key``, or an all-zero
        ``ScoreStats`` (``n=0``) when there is none.
    """
    stats_by_key = self._agg(tag).score_stats
    # All-zero placeholder handed back when ``key`` has no stats.
    empty = ScoreStats(mean=0.0, p50=0.0, p95=0.0, n=0)
    return stats_by_key.get(key, empty)

value_counts #

value_counts(key, *, tag=None)

Categorical label counts for a scorer (empty dict when nothing categorical under key).

Pass tag to restrict to tasks carrying that tag.

Source code in autogen/beta/eval/results/result.py

def value_counts(self, key: str, *, tag: str | None = None) -> dict[str, int]:
    """Categorical label counts for a scorer (empty dict when nothing categorical under ``key``).

    Pass ``tag`` to restrict to tasks carrying that tag.
    """
    return dict(self._agg(tag).value_counts.get(key, {}))

diff #

diff(baseline, *, strict=True)

Compare this run against baseline — "did my change help or hurt?".

Reports per-scorer pass-rate / mean deltas and the tasks that flipped pass<->fail, over the tasks and checks the two runs share. By default (strict=True) raises `autogen.beta.eval.RunsNotComparableError` if the runs didn't grade the same tasks + checks; pass strict=False to diff the overlap and have the mismatches reported on the returned `autogen.beta.eval.RunDiff`.

Source code in autogen/beta/eval/results/result.py

def diff(self, baseline: "RunResult", *, strict: bool = True) -> "RunDiff":
    """Diff this run against ``baseline`` to see whether a change helped or hurt.

    The comparison covers per-scorer pass-rate / mean deltas and the tasks
    that flipped pass<->fail, restricted to the tasks and checks that both
    runs **share**.

    Args:
        baseline: The run to compare against.
        strict: When ``True`` (the default), runs that did not grade the
            same tasks + checks are rejected. When ``False``, only the
            overlap is diffed and the mismatches are reported on the
            returned :class:`~autogen.beta.eval.RunDiff`.

    Raises:
        ~autogen.beta.eval.RunsNotComparableError: if ``strict`` is
            ``True`` and the runs didn't grade the same tasks + checks.

    Returns:
        The :class:`~autogen.beta.eval.RunDiff` for the two runs.
    """
    # Function-level import on purpose: result.py and diff.py import each
    # other, so this is a circular-import shim (exempt per AGENTS.md).
    from .diff import compute_diff

    comparison = compute_diff(self, baseline, strict=strict)
    return comparison

summary #

summary()

Human-readable multi-line table of run metadata and aggregates.

Format is plain ASCII (no charting libraries) so the output can be copied straight into a CI log or a CHANGELOG.

Source code in autogen/beta/eval/results/result.py

def summary(self) -> str:
    """Render the run's metadata and aggregates as a multi-line text table.

    The output is plain ASCII (no charting libraries), so it can be pasted
    directly into a CI log or a CHANGELOG.

    Returns:
        The rendered table as a single string.
    """
    rendered = _render_summary(self)
    return rendered

save #

save(path=None)

Write the run as schema-0.1 JSON.

If path is None, saves under the run's configured store_dir (set by the runner via run_agent(..., store_dir=...)) as <run_id>.json. If path ends in .json it's used verbatim; otherwise path is treated as a directory and <run_id>.json is appended.

RAISES	DESCRIPTION
`ValueError`	if `path` is `None` and no `store_dir` was configured on this run.

RETURNS	DESCRIPTION
`Path`	The path that was written.

Source code in autogen/beta/eval/results/result.py

def save(self, path: str | os.PathLike[str] | None = None) -> Path:
    """Persist this run to disk as schema-0.1 JSON.

    Destination rules:

    * ``path`` is ``None``: write ``<run_id>.json`` under the run's
      configured ``store_dir`` (set by the runner via
      ``run_agent(..., store_dir=...)``).
    * ``path`` ends in ``.json``: use it verbatim.
    * anything else: treat ``path`` as a directory and append
      ``<run_id>.json``.

    Args:
        path: Target file or directory, or ``None`` to use ``store_dir``.

    Raises:
        ValueError: if ``path`` is ``None`` and no ``store_dir`` was
            configured on this run.

    Returns:
        The :class:`Path` that was written.
    """
    # Function-level import on purpose: store.py depends on result.py for
    # type names, so a module-top import would be circular. AGENTS.md
    # exempts circular-import shims from the no-function-level-imports rule.
    from .store import dump

    return dump(self, self._resolve_save_path(path))

RunResult

autogen.beta.eval.results.result.RunResult #

run_id property #

schema_version property #

tasks property #

suite property #

target_path property #

concurrency property #

duration_ms property #

created_at property #

label property #

aggregates property #

tags property #

pass_rate #

score_stats #

value_counts #

diff #

summary #

save #

run_id `property` #

schema_version `property` #

tasks `property` #

suite `property` #

target_path `property` #

concurrency `property` #

duration_ms `property` #

created_at `property` #

label `property` #

aggregates `property` #

tags `property` #