PairwiseRunResult(*, run_id, cases, variant_a, variant_b, keys, created_at, duration_ms, n_pairs, label=None, store_dir=None)
Result of a pairwise comparison run: per-key win/loss/tie tallies, variant B's win-rate with a Wilson 95% CI, position flips, and agreement between keys.
Source code in autogen/beta/eval/pairwise.py
def __init__(
    self,
    *,
    run_id: str,
    cases: tuple[PairwiseCase, ...],
    variant_a: str,
    variant_b: str,
    keys: tuple[str, ...],
    created_at: str,
    duration_ms: int,
    n_pairs: int,
    label: str | None = None,
    store_dir: str | os.PathLike[str] | None = None,
) -> None:
    """Store the data of one pairwise comparison run.

    Every argument is keyword-only and is kept unchanged on a private
    attribute of the same name. The one exception is ``store_dir``, which is
    wrapped in a ``Path`` unless it is ``None``.
    """
    # Run identity and bookkeeping.
    self._run_id, self._label = run_id, label
    self._created_at, self._duration_ms = created_at, duration_ms

    # What was compared, and the recorded outcomes.
    self._variant_a, self._variant_b = variant_a, variant_b
    self._keys, self._cases = keys, cases
    self._n_pairs = n_pairs

    # Optional storage location; normalised so later code can rely on Path.
    self._store_dir = None if store_dir is None else Path(store_dir)
tally
(a_wins, b_wins, ties) for one key.
Source code in autogen/beta/eval/pairwise.py
def tally(self, key: str) -> tuple[int, int, int]:
    """Count the outcomes recorded for ``key``.

    Returns ``(a_wins, b_wins, ties)``. Only cases whose ``key`` equals the
    argument are looked at; a case counts as a tie whenever its ``winner`` is
    neither ``"a"`` nor ``"b"``.
    """
    verdicts = [case.winner for case in self._cases if case.key == key]
    wins_a = sum(1 for verdict in verdicts if verdict == "a")
    wins_b = sum(1 for verdict in verdicts if verdict == "b")
    # Whatever is left over is neither an A win nor a B win.
    undecided = len(verdicts) - wins_a - wins_b
    return wins_a, wins_b, undecided
win_rate
Variant B's win-rate on key (ties = 0.5) with a Wilson 95% CI.
Source code in autogen/beta/eval/pairwise.py
def win_rate(self, key: str) -> WinRate:
    """Compute variant B's win-rate on ``key`` together with a Wilson CI.

    A tie is worth half a win. When no case was recorded for ``key`` the rate
    is ``0.0``. The interval comes from ``_wilson_ci(rate, n)``.
    """
    losses, wins, draws = self.tally(key)
    total = losses + wins + draws
    if total:
        score = (wins + 0.5 * draws) / total
    else:
        # Nothing scored for this key: avoid dividing by zero.
        score = 0.0
    return WinRate(
        variant=self._variant_b,
        rate=score,
        wins=wins,
        losses=losses,
        ties=draws,
        n=total,
        ci=_wilson_ci(score, total),
    )
flips
Number of cases for the key where the swapped orders disagreed (position sensitivity).
Source code in autogen/beta/eval/pairwise.py
def flips(self, key: str) -> int:
    """Count the cases on ``key`` whose two presentation orders disagreed.

    A case is inspected only when its ``detail`` contains both ``"order1"``
    and ``"order2"``. It counts as a flip when ``_pos_to_ab`` gives different
    results for the two orders (``order1`` read as a/b, ``order2`` as b/a).
    """

    def _disagrees(detail) -> bool:
        # Cases that were not judged in both orders cannot flip.
        if "order1" not in detail or "order2" not in detail:
            return False
        first_pass = _pos_to_ab(detail["order1"], "a", "b")
        second_pass = _pos_to_ab(detail["order2"], "b", "a")
        return first_pass != second_pass

    return sum(1 for case in self._cases if case.key == key and _disagrees(case.detail))
agreement
Agreement between two keys over the tasks both scored (Cohen's kappa).
Source code in autogen/beta/eval/pairwise.py
def agreement(self, key_x: str, key_y: str) -> Agreement:
    """Compare the winners of two keys on the tasks both of them scored.

    The result carries the raw agreement rate (``0.0`` when no task is
    shared), the value of ``_cohen_kappa`` over the paired winners, the number
    of shared tasks, and a ``(task_id, winner_x, winner_y)`` tuple for every
    shared task where the winners differ. Shared tasks are processed in
    sorted ``task_id`` order.
    """
    winners_x = {}
    winners_y = {}
    for case in self._cases:
        # Two independent checks so that key_x == key_y fills both maps.
        if case.key == key_x:
            winners_x[case.task_id] = case.winner
        if case.key == key_y:
            winners_y[case.task_id] = case.winner

    common_tasks = sorted(winners_x.keys() & winners_y.keys())
    verdict_pairs = [(winners_x[task], winners_y[task]) for task in common_tasks]
    total = len(verdict_pairs)

    matching = sum(1 for left, right in verdict_pairs if left == right)
    if total:
        match_rate = matching / total
    else:
        match_rate = 0.0

    mismatches = tuple(
        (task, winners_x[task], winners_y[task])
        for task in common_tasks
        if winners_x[task] != winners_y[task]
    )
    return Agreement(
        rate=match_rate,
        cohen_kappa=_cohen_kappa(verdict_pairs),
        n=total,
        disagreements=mismatches,
    )
summary
Source code in autogen/beta/eval/pairwise.py
def summary(self) -> str:
    """Render the run as a plain-text table, one row per key.

    The first two lines are a title and a column header. Each key then gets a
    row with B's wins, A's wins, ties, B's win-rate and its confidence
    interval as percentages. A key with a non-zero ``flips`` count gets an
    extra line reporting it.
    """
    title = f"Pairwise {self._run_id} — {self._variant_b!r} (B) vs {self._variant_a!r} (A) · {self._n_pairs} cases"
    header = f" {'key':<18} {'B':>3} {'A':>3} {'tie':>4} win-rate({self._variant_b}) 95% CI"
    rows = [title, header]

    for key in self._keys:
        stats = self.win_rate(key)
        rate_pct = stats.rate * 100
        ci_low_pct = stats.ci[0] * 100
        ci_high_pct = stats.ci[1] * 100
        rows.append(
            f" {key:<18} {stats.wins:>3} {stats.losses:>3} {stats.ties:>4} {rate_pct:5.1f}% "
            f"[{ci_low_pct:.0f}%, {ci_high_pct:.0f}%]"
        )

        flip_count = self.flips(key)
        if flip_count:
            rows.append(f" {'':<18} position-flips → tie: {flip_count}")

    return "\n".join(rows)
to_dict
Source code in autogen/beta/eval/pairwise.py
def to_dict(self) -> dict[str, Any]:
    """Return the run as a plain dictionary.

    Besides the stored run fields, the payload holds per-key ``win_rates``
    (via ``_win_rate_to_dict``) and ``flips`` computed from the cases, and one
    record per case whose ``detail`` is a shallow ``dict`` copy.
    """

    def _case_record(case) -> dict[str, Any]:
        # One flat record per case.
        return {
            "task_id": case.task_id,
            "key": case.key,
            "winner": case.winner,
            "reasoning": case.reasoning,
            "detail": dict(case.detail),
        }

    keys = self._keys

    # Stored fields first; key order here is the order of the payload.
    payload: dict[str, Any] = {
        "schema_version": _SCHEMA_VERSION,
        "run_id": self._run_id,
        "label": self._label,
        "created_at": self._created_at,
        "duration_ms": self._duration_ms,
        "variant_a": self._variant_a,
        "variant_b": self._variant_b,
        "n_pairs": self._n_pairs,
        "keys": list(keys),
    }

    # Derived per-key statistics.
    win_rates = {}
    for key in keys:
        win_rates[key] = _win_rate_to_dict(self.win_rate(key))
    payload["win_rates"] = win_rates

    flip_counts = {}
    for key in keys:
        flip_counts[key] = self.flips(key)
    payload["flips"] = flip_counts

    payload["cases"] = [_case_record(case) for case in self._cases]
    return payload
save
Source code in autogen/beta/eval/pairwise.py
def save(self, path: str | os.PathLike[str] | None = None) -> Path:
    """Write the run to disk as indented JSON and return the file's path.

    The destination is whatever ``_resolve_save_path(path)`` returns; missing
    parent directories are created. The payload is ``to_dict()`` encoded as
    UTF-8, and values JSON cannot encode natively are written as ``str(value)``.
    """
    destination = self._resolve_save_path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    encoded = json.dumps(self.to_dict(), indent=2, default=str)
    destination.write_text(encoded, encoding="utf-8")
    return destination