def compute_diff(current: RunResult, baseline: RunResult, *, strict: bool = True) -> RunDiff:
    """Build the :class:`RunDiff` describing how ``current`` differs from ``baseline``.

    Task results are paired by ``task.task_id``. A paired task only counts as
    comparable when its ``inputs`` and ``reference_outputs`` are equal in both
    runs; otherwise its id is reported under ``content_changed``. Aggregates
    and flips are computed over the comparable tasks only, and only for the
    keys that ``_keys`` reports for both runs.

    Args:
        current: The run being compared.
        baseline: The run it is compared against.
        strict: When true, raise instead of returning a diff if ``_problems``
            reports anything for the task/scorer mismatches that were found.

    Returns:
        A :class:`RunDiff`. ``pass_rate_deltas`` and ``mean_deltas`` map a key
        to a ``(baseline, current)`` pair; ``flipped_to_fail`` and
        ``flipped_to_pass`` hold ``(key, task_id)`` pairs.

    Raises:
        RunsNotComparableError: If ``strict`` is true and ``_problems``
            returns a truthy value.

    See also :meth:`RunResult.diff`.
    """
    current_by_id = {result.task.task_id: result for result in current.tasks}
    baseline_by_id = {result.task.task_id: result for result in baseline.tasks}
    current_ids = set(current_by_id)
    baseline_ids = set(baseline_by_id)

    only_in_current = tuple(sorted(current_ids - baseline_ids))
    only_in_baseline = tuple(sorted(baseline_ids - current_ids))
    shared_ids = current_ids & baseline_ids

    def content_differs(task_id):
        # A task whose inputs or reference outputs moved is not the same task.
        ours = current_by_id[task_id].task
        theirs = baseline_by_id[task_id].task
        return ours.inputs != theirs.inputs or ours.reference_outputs != theirs.reference_outputs

    changed_ids = [task_id for task_id in shared_ids if content_differs(task_id)]
    changed_ids.sort()
    content_changed = tuple(changed_ids)
    comparable = sorted(shared_ids.difference(content_changed))

    current_scorers = _keys(current.tasks)
    baseline_scorers = _keys(baseline.tasks)
    scorers_only_in_current = tuple(sorted(current_scorers - baseline_scorers))
    scorers_only_in_baseline = tuple(sorted(baseline_scorers - current_scorers))
    shared_scorers = sorted(current_scorers & baseline_scorers)

    # Strict mode refuses to produce a diff when the runs do not line up.
    if strict and (
        problems := _problems(
            only_in_current,
            only_in_baseline,
            content_changed,
            scorers_only_in_current,
            scorers_only_in_baseline,
        )
    ):
        raise RunsNotComparableError(_message(problems))

    current_feedback = {task_id: _by_key(current_by_id[task_id]) for task_id in comparable}
    baseline_feedback = {task_id: _by_key(baseline_by_id[task_id]) for task_id in comparable}

    def bool_scores(feedback, key):
        # Keep only the entries for which ``_bool`` yields a value.
        verdicts = (_bool(feedback[task_id].get(key)) for task_id in comparable)
        return [verdict for verdict in verdicts if verdict is not None]

    def numeric_scores(feedback, key):
        # Keep only the entries that ``_is_num`` accepts, as floats.
        return [
            float(feedback[task_id][key].score)
            for task_id in comparable
            if _is_num(feedback[task_id].get(key))
        ]

    pass_rate_deltas: dict[str, tuple[float, float]] = {}
    mean_deltas: dict[str, tuple[float, float]] = {}
    flipped_to_fail: list[tuple[str, str]] = []
    flipped_to_pass: list[tuple[str, str]] = []

    for key in shared_scorers:
        baseline_verdicts = bool_scores(baseline_feedback, key)
        current_verdicts = bool_scores(current_feedback, key)
        if baseline_verdicts or current_verdicts:
            pass_rate_deltas[key] = (_rate(baseline_verdicts), _rate(current_verdicts))

        baseline_values = numeric_scores(baseline_feedback, key)
        current_values = numeric_scores(current_feedback, key)
        if baseline_values or current_values:
            mean_deltas[key] = (_mean(baseline_values), _mean(current_values))

        for task_id in comparable:
            before = baseline_feedback[task_id].get(key)
            after = current_feedback[task_id].get(key)
            if before is None or after is None:
                continue
            # Flips are only defined when both sides carry a real bool score.
            if not (isinstance(before.score, bool) and isinstance(after.score, bool)):
                continue
            if before.score == after.score:
                continue
            # Exactly one side passed; the baseline side decides the direction.
            bucket = flipped_to_fail if before.score else flipped_to_pass
            bucket.append((key, task_id))

    return RunDiff(
        current_run_id=current.run_id,
        baseline_run_id=baseline.run_id,
        comparable_tasks=tuple(comparable),
        pass_rate_deltas=pass_rate_deltas,
        mean_deltas=mean_deltas,
        flipped_to_fail=tuple(flipped_to_fail),
        flipped_to_pass=tuple(flipped_to_pass),
        only_in_current=only_in_current,
        only_in_baseline=only_in_baseline,
        content_changed=content_changed,
        scorers_only_in_current=scorers_only_in_current,
        scorers_only_in_baseline=scorers_only_in_baseline,
    )