RunResult(*, run_id, tasks, suite, target_path, concurrency, duration_ms, created_at, label=None, store_dir=None)
The result of a full :func:`run_agent` call.
Holds per-task records, run-level metadata, and computed aggregates. Lookup helpers (`pass_rate`, `score_stats`, `value_counts`) surface single keys; :meth:`summary` renders a printable table; :meth:`save` writes the schema-0.1 JSON.
Aggregates are computed once at construction time so repeated lookups are cheap.
Source code in autogen/beta/eval/results/result.py
def __init__(
    self,
    *,
    run_id: str,
    tasks: tuple[TaskResult, ...],
    suite: Suite,
    target_path: str,
    concurrency: int,
    duration_ms: int,
    created_at: str,
    label: str | None = None,
    store_dir: str | os.PathLike[str] | None = None,
) -> None:
    """Store the run's records and metadata and precompute its aggregates.

    All arguments are keyword-only and are kept on private attributes.

    Args:
        run_id: Identifier for this run.
        tasks: Per-task records.
        suite: The suite that was executed.
        target_path: Provenance string of the evaluated target.
        concurrency: Concurrency cap the runner used.
        duration_ms: Duration of the full run, in milliseconds.
        created_at: Timestamp of when the run started.
        label: Optional user-defined label for the run.
        store_dir: Optional directory associated with the run; converted
            to a ``Path`` when given, kept as ``None`` otherwise.
    """
    self._run_id = run_id
    self._tasks = tasks
    self._suite = suite
    self._target_path = target_path
    self._concurrency = concurrency
    self._duration_ms = duration_ms
    self._created_at = created_at
    self._label = label

    # Normalise the optional store directory to a Path; "unset" stays None.
    if store_dir is None:
        self._store_dir = None
    else:
        self._store_dir = Path(store_dir)

    # Whole-run aggregates are computed once, here, so later lookups reuse them.
    self._aggregates = _compute_aggregates(tasks)
    # Starts empty; presumably holds per-tag aggregates — confirm against
    # the tag lookup helper.
    self._sliced: dict[str, Aggregates] = {}
|
run_id property
Stable identifier for this run — UUID4 hex unless the caller passed one.
schema_version property
Run JSON schema version. Always "0.1" in v0.
tasks property
Per-task records, in suite order.
suite property
The Suite that was executed.
target_path property
"<module>:<name>" provenance of the evaluated target (factory or instance type).
concurrency property
Concurrency cap the runner used (asyncio.Semaphore bound).
duration_ms property
Wall-clock duration of the full run, in milliseconds.
created_at property
ISO-8601 UTC timestamp of when this run started.
label property
User-defined identifier grouping runs of the same eval over time (None if unset).
aggregates property
Run-level rollups computed from the per-task feedback and traces.
Every tag present across the run's tasks — the values usable in tag= lookups.
pass_rate
pass_rate(key, *, tag=None)
Pass rate for a boolean scorer (0.0 if no boolean feedback under key).
Pass tag to compute it over only the tasks carrying that tag — e.g. pass_rate("tool_called", tag="adversarial"). When tag is unset, the rate is computed over the whole run.
Source code in autogen/beta/eval/results/result.py
def pass_rate(self, key: str, *, tag: str | None = None) -> float:
    """Return the pass rate recorded under ``key`` for a boolean scorer.

    Args:
        key: Scorer key to look up.
        tag: When given, only the tasks carrying this tag are considered,
            e.g. ``pass_rate("tool_called", tag="adversarial")``. When
            ``None``, the whole run is used.

    Returns:
        The pass rate, or ``0.0`` if there is no boolean feedback under
        ``key``.
    """
    rates = self._agg(tag).pass_rate
    return rates.get(key, 0.0)
|
score_stats
score_stats(key, *, tag=None)
Numeric stats for a scorer (zeros when nothing numeric under key).
Pass tag to restrict to tasks carrying that tag.
Source code in autogen/beta/eval/results/result.py
def score_stats(self, key: str, *, tag: str | None = None) -> ScoreStats:
    """Return the numeric stats recorded under ``key`` for a scorer.

    Args:
        key: Scorer key to look up.
        tag: When given, restricts the lookup to tasks carrying this tag.

    Returns:
        The stats for ``key``, or an all-zero ``ScoreStats`` (``n=0``)
        when nothing numeric is recorded under ``key``.
    """
    per_key = self._agg(tag).score_stats
    fallback = ScoreStats(mean=0.0, p50=0.0, p95=0.0, n=0)
    return per_key.get(key, fallback)
|
value_counts
value_counts(key, *, tag=None)
Categorical label counts for a scorer (empty dict when nothing categorical under key).
Pass tag to restrict to tasks carrying that tag.
Source code in autogen/beta/eval/results/result.py
def value_counts(self, key: str, *, tag: str | None = None) -> dict[str, int]:
    """Return the categorical label counts recorded under ``key``.

    Args:
        key: Scorer key to look up.
        tag: When given, restricts the lookup to tasks carrying this tag.

    Returns:
        A new dict of label counts, or an empty dict when nothing
        categorical is recorded under ``key``.
    """
    counts = self._agg(tag).value_counts.get(key, {})
    # Return a shallow copy rather than the stored mapping itself.
    return dict(counts)
|
diff
diff(baseline, *, strict=True)
Compare this run against baseline — "did my change help or hurt?".
Reports per-scorer pass-rate / mean deltas and the tasks that flipped pass<->fail, over the tasks and checks the two runs share. By default (strict=True) raises :class:`~autogen.beta.eval.RunsNotComparableError` if the runs didn't grade the same tasks + checks; pass strict=False to diff the overlap and have the mismatches reported on the returned :class:`~autogen.beta.eval.RunDiff`.
Source code in autogen/beta/eval/results/result.py
def diff(self, baseline: "RunResult", *, strict: bool = True) -> "RunDiff":
    """Diff this run against ``baseline`` to see whether a change helped or hurt.

    The diff covers the tasks and checks that both runs **share**: per-scorer
    pass-rate / mean deltas, plus the tasks that flipped pass<->fail.

    Args:
        baseline: The run to compare against.
        strict: When ``True`` (the default), runs that did not grade the
            same tasks + checks are rejected. When ``False``, only the
            overlap is diffed and the mismatches are reported on the
            returned :class:`~autogen.beta.eval.RunDiff`.

    Returns:
        The :class:`~autogen.beta.eval.RunDiff` for the two runs.

    Raises:
        RunsNotComparableError: if ``strict`` is ``True`` and the runs did
            not grade the same tasks + checks.
    """
    # Function-level import on purpose: result.py and diff.py import each
    # other, and AGENTS.md exempts circular-import shims like this one.
    from .diff import compute_diff

    return compute_diff(self, baseline, strict=strict)
|
summary
Human-readable multi-line table of run metadata and aggregates.
Format is plain ASCII (no charting libraries) so the output can be copied straight into a CI log or a CHANGELOG.
Source code in autogen/beta/eval/results/result.py
def summary(self) -> str:
    """Render run metadata and aggregates as a multi-line text table.

    The output is plain ASCII (no charting libraries), so it can be pasted
    directly into a CI log or a CHANGELOG.

    Returns:
        The rendered table as a single string.
    """
    rendered = _render_summary(self)
    return rendered
|
save
Write the run as schema-0.1 JSON.
If path is None, saves under the run's configured store_dir (set by the runner via run_agent(..., store_dir=...)) as <run_id>.json. If path ends in .json it's used verbatim; otherwise path is treated as a directory and <run_id>.json is appended.
| RAISES | DESCRIPTION |
| ValueError | If path is None and no store_dir was configured on this run. |
| RETURNS | DESCRIPTION |
| Path | The Path that was written. |
Source code in autogen/beta/eval/results/result.py
def save(self, path: str | os.PathLike[str] | None = None) -> Path:
    """Serialise this run to disk as schema-0.1 JSON.

    Args:
        path: Where to write. ``None`` saves under the run's configured
            ``store_dir`` (set by the runner via
            ``run_agent(..., store_dir=...)``) as ``<run_id>.json``. A
            path ending in ``.json`` is used verbatim; any other path is
            treated as a directory and ``<run_id>.json`` is appended.

    Returns:
        The :class:`Path` that was written.

    Raises:
        ValueError: if ``path`` is ``None`` and no ``store_dir`` was
            configured on this run.
    """
    # Function-level import on purpose: store.py depends on result.py for
    # type names, so a module-level import here would be circular.
    # AGENTS.md exempts circular-import shims from the
    # no-function-level-imports rule.
    from .store import dump

    # Path resolution is delegated to _resolve_save_path.
    return dump(self, self._resolve_save_path(path))
|