Skip to content

PairwiseRunResult

autogen.beta.eval.pairwise.PairwiseRunResult #

PairwiseRunResult(*, run_id, cases, variant_a, variant_b, keys, created_at, duration_ms, n_pairs, label=None, store_dir=None)

Per-key win/loss/tie, win-rate(B) + Wilson CI, flips, and agreement.

Source code in autogen/beta/eval/pairwise.py
def __init__(
    self,
    *,
    run_id: str,
    cases: tuple[PairwiseCase, ...],
    variant_a: str,
    variant_b: str,
    keys: tuple[str, ...],
    created_at: str,
    duration_ms: int,
    n_pairs: int,
    label: str | None = None,
    store_dir: str | os.PathLike[str] | None = None,
) -> None:
    """Record the fields of one pairwise run on the instance.

    Every argument is keyword-only and is stored unchanged on a private
    attribute of the same name. The single exception is ``store_dir``,
    which is wrapped in ``Path`` when given and left as ``None`` otherwise.
    """
    # Run identity and timing metadata.
    self._run_id, self._label = run_id, label
    self._created_at, self._duration_ms = created_at, duration_ms

    # What was compared, and the per-case outcomes.
    self._variant_a, self._variant_b = variant_a, variant_b
    self._keys, self._cases = keys, cases
    self._n_pairs = n_pairs

    # Normalise the optional storage location last, after plain fields.
    if store_dir is None:
        self._store_dir = None
    else:
        self._store_dir = Path(store_dir)

run_id property #

run_id

label property #

label

cases property #

cases

tally #

tally(key)

(a_wins, b_wins, ties) for one key.

Source code in autogen/beta/eval/pairwise.py
def tally(self, key: str) -> tuple[int, int, int]:
    """Count outcomes for ``key`` as ``(a_wins, b_wins, ties)``.

    Only cases whose ``key`` equals the argument are considered. A winner
    of ``"a"`` or ``"b"`` is credited to that side; any other winner value
    is counted as a tie.
    """
    outcomes = [case.winner for case in self._cases if case.key == key]
    a_wins = outcomes.count("a")
    b_wins = outcomes.count("b")
    # Whatever is neither an "a" nor a "b" verdict falls into the tie bucket.
    return a_wins, b_wins, len(outcomes) - a_wins - b_wins

win_rate #

win_rate(key)

Variant B's win-rate on key (ties = 0.5) with a Wilson 95% CI.

Source code in autogen/beta/eval/pairwise.py
def win_rate(self, key: str) -> WinRate:
    """Build variant B's ``WinRate`` for ``key``.

    The rate is ``(b_wins + 0.5 * ties) / n`` over the cases tallied for
    ``key``, or ``0.0`` when there are none. The confidence interval comes
    from ``_wilson_ci(rate, n)``.
    """
    losses, wins, ties = self.tally(key)  # tally order is (a, b, ties)
    total = losses + wins + ties
    if total:
        rate = (wins + 0.5 * ties) / total
    else:
        # No cases for this key: avoid dividing by zero.
        rate = 0.0
    return WinRate(
        variant=self._variant_b,
        rate=rate,
        wins=wins,
        losses=losses,
        ties=ties,
        n=total,
        ci=_wilson_ci(rate, total),
    )

flips #

flips(key)

Number of cases for the given key where the two swapped presentation orders disagreed (position sensitivity).

Source code in autogen/beta/eval/pairwise.py
def flips(self, key: str) -> int:
    """Count the cases for ``key`` whose two recorded orders disagree.

    A case contributes only when its ``detail`` holds both ``"order1"`` and
    ``"order2"`` and the two values differ after being passed through
    ``_pos_to_ab`` with mirrored ``"a"``/``"b"`` arguments.
    """

    def _orders_disagree(detail) -> bool:
        # Cases without both orderings recorded can never count as a flip.
        if "order1" not in detail or "order2" not in detail:
            return False
        # NOTE(review): _pos_to_ab presumably maps a positional verdict onto
        # an a/b label; the second call swaps the labels — confirm in helper.
        first = _pos_to_ab(detail["order1"], "a", "b")
        second = _pos_to_ab(detail["order2"], "b", "a")
        return first != second

    return sum(
        1
        for case in self._cases
        if case.key == key and _orders_disagree(case.detail)
    )

agreement #

agreement(key_x, key_y)

Agreement between two keys over the tasks both scored: the raw agreement rate, Cohen's kappa, and the list of disagreeing tasks.

Source code in autogen/beta/eval/pairwise.py
def agreement(self, key_x: str, key_y: str) -> Agreement:
    """Compare the winners recorded under two keys on their shared tasks.

    Winners are indexed by ``task_id`` for each key, and only task ids
    present under both keys are compared, in sorted order. The result
    carries the fraction of matching winners (``0.0`` when nothing is
    shared), ``_cohen_kappa`` of the winner pairs, the number of shared
    tasks, and a ``(task_id, winner_x, winner_y)`` tuple per disagreement.
    """
    winners_x = {}
    winners_y = {}
    for case in self._cases:
        # Two independent checks: key_x and key_y may be the same key.
        if case.key == key_x:
            winners_x[case.task_id] = case.winner
        if case.key == key_y:
            winners_y[case.task_id] = case.winner

    common = sorted(winners_x.keys() & winners_y.keys())
    pairs = [(winners_x[task], winners_y[task]) for task in common]
    total = len(pairs)

    if total:
        matching = sum(1 for left, right in pairs if left == right)
        rate = matching / total
    else:
        rate = 0.0

    disagreements = tuple(
        (task, winners_x[task], winners_y[task])
        for task in common
        if winners_x[task] != winners_y[task]
    )
    return Agreement(
        rate=rate,
        cohen_kappa=_cohen_kappa(pairs),
        n=total,
        disagreements=disagreements,
    )

summary #

summary()
Source code in autogen/beta/eval/pairwise.py
def summary(self) -> str:
    """Render a plain-text, human-readable table of the run.

    The first line names the run, both variants and the pair count; the
    second is a column header. Each key in ``self._keys`` then gets one row
    built from ``self.win_rate(key)``: wins, losses, ties, the rate as a
    percentage and the CI bounds as whole percentages. A key with a
    non-zero ``self.flips(key)`` gets an extra line reporting that count.

    Returns:
        The lines joined with newlines (no trailing newline).
    """
    lines = [
        # The " · " after the run id keeps it from fusing with the quoted
        # variant name (previously rendered as e.g. "Pairwise r1'cand'").
        f"Pairwise {self._run_id} · {self._variant_b!r} (B) vs {self._variant_a!r} (A) · {self._n_pairs} cases",
        f"  {'key':<18} {'B':>3} {'A':>3} {'tie':>4}   win-rate({self._variant_b})   95% CI",
    ]
    for key in self._keys:
        wr = self.win_rate(key)
        lines.append(
            f"  {key:<18} {wr.wins:>3} {wr.losses:>3} {wr.ties:>4}      {wr.rate * 100:5.1f}%   "
            f"[{wr.ci[0] * 100:.0f}%, {wr.ci[1] * 100:.0f}%]"
        )
        flips = self.flips(key)
        if flips:
            # Indented under the key column so it reads as a note on the row above.
            lines.append(f"  {'':<18} position-flips → tie: {flips}")
    return "\n".join(lines)

to_dict #

to_dict()
Source code in autogen/beta/eval/pairwise.py
def to_dict(self) -> dict[str, Any]:
    """Return the run as a dictionary of plain values.

    The dictionary holds the schema version, the run metadata, the keys as
    a list, the per-key win-rates (via ``_win_rate_to_dict``) and flip
    counts, and one entry per case with its ``detail`` copied into a new
    ``dict``.
    """
    payload: dict[str, Any] = {
        "schema_version": _SCHEMA_VERSION,
        "run_id": self._run_id,
        "label": self._label,
        "created_at": self._created_at,
        "duration_ms": self._duration_ms,
        "variant_a": self._variant_a,
        "variant_b": self._variant_b,
        "n_pairs": self._n_pairs,
        "keys": list(self._keys),
    }

    # Per-key aggregates, inserted in the same order as the keys themselves.
    win_rates = {}
    flip_counts = {}
    for key in self._keys:
        win_rates[key] = _win_rate_to_dict(self.win_rate(key))
    for key in self._keys:
        flip_counts[key] = self.flips(key)
    payload["win_rates"] = win_rates
    payload["flips"] = flip_counts

    # Raw per-case records; detail is copied so the output owns its own dict.
    records = []
    for case in self._cases:
        records.append(
            {
                "task_id": case.task_id,
                "key": case.key,
                "winner": case.winner,
                "reasoning": case.reasoning,
                "detail": dict(case.detail),
            }
        )
    payload["cases"] = records
    return payload

save #

save(path=None)
Source code in autogen/beta/eval/pairwise.py
def save(self, path: str | os.PathLike[str] | None = None) -> Path:
    """Write the run to disk as indented JSON and return the file path.

    The destination comes from ``self._resolve_save_path(path)``; missing
    parent directories are created first. The payload is ``self.to_dict()``
    serialised with a two-space indent, falling back to ``str`` for values
    JSON cannot encode natively, and written as UTF-8.
    """
    # NOTE(review): how a ``None`` path is resolved is decided by
    # _resolve_save_path, which is not visible here — confirm in helper.
    destination = self._resolve_save_path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(self.to_dict(), indent=2, default=str)
    destination.write_text(serialized, encoding="utf-8")
    return destination