compute_diff

autogen.beta.eval.results.diff.compute_diff

compute_diff(current, baseline, *, strict=True)

Compare `current` against `baseline`; see `RunDiff` and `RunResult.diff`.

Source code in autogen/beta/eval/results/diff.py

def compute_diff(current: RunResult, baseline: RunResult, *, strict: bool = True) -> RunDiff:
    """Build a :class:`RunDiff` describing how ``current`` differs from ``baseline``.

    Tasks are matched by ``task.task_id``. A task is *comparable* when it is
    present in both runs and its ``inputs`` and ``reference_outputs`` are equal
    on both sides; only comparable tasks contribute to the aggregates and to
    the flip lists. Scorer keys are matched the same way, and only keys
    reported by both runs are aggregated.

    Args:
        current: The run under evaluation.
        baseline: The run to compare against.
        strict: When true, raise instead of returning if ``_problems`` reports
            anything for the task/scorer mismatches between the two runs.

    Returns:
        A :class:`RunDiff` whose ``pass_rate_deltas`` and ``mean_deltas`` map
        each shared scorer key to a ``(baseline, current)`` pair, and whose
        flip lists hold ``(scorer_key, task_id)`` pairs.

    Raises:
        RunsNotComparableError: If ``strict`` is true and the runs mismatch.

    See also :meth:`RunResult.diff`.
    """
    current_by_id = {result.task.task_id: result for result in current.tasks}
    baseline_by_id = {result.task.task_id: result for result in baseline.tasks}
    current_ids = set(current_by_id)
    baseline_ids = set(baseline_by_id)
    shared_ids = current_ids & baseline_ids

    only_in_current = tuple(sorted(current_ids - baseline_ids))
    only_in_baseline = tuple(sorted(baseline_ids - current_ids))

    # A shared task whose inputs or reference outputs differ is not the "same" task any more.
    changed_ids = []
    for task_id in shared_ids:
        cur_task = current_by_id[task_id].task
        base_task = baseline_by_id[task_id].task
        if cur_task.inputs != base_task.inputs or cur_task.reference_outputs != base_task.reference_outputs:
            changed_ids.append(task_id)
    content_changed = tuple(sorted(changed_ids))
    comparable = sorted(shared_ids.difference(content_changed))

    current_keys = _keys(current.tasks)
    baseline_keys = _keys(baseline.tasks)
    scorers_only_in_current = tuple(sorted(current_keys - baseline_keys))
    scorers_only_in_baseline = tuple(sorted(baseline_keys - current_keys))
    shared_keys = sorted(current_keys & baseline_keys)

    if strict and (
        problems := _problems(
            only_in_current, only_in_baseline, content_changed, scorers_only_in_current, scorers_only_in_baseline
        )
    ):
        raise RunsNotComparableError(_message(problems))

    current_feedback = {task_id: _by_key(current_by_id[task_id]) for task_id in comparable}
    baseline_feedback = {task_id: _by_key(baseline_by_id[task_id]) for task_id in comparable}

    pass_rate_deltas: dict[str, tuple[float, float]] = {}
    mean_deltas: dict[str, tuple[float, float]] = {}
    flipped_to_fail: list[tuple[str, str]] = []
    flipped_to_pass: list[tuple[str, str]] = []

    for key in shared_keys:
        # Per-task feedback for this scorer, aligned with ``comparable``; None where absent.
        base_column = [baseline_feedback[task_id].get(key) for task_id in comparable]
        cur_column = [current_feedback[task_id].get(key) for task_id in comparable]

        base_passes = [flag for flag in map(_bool, base_column) if flag is not None]
        cur_passes = [flag for flag in map(_bool, cur_column) if flag is not None]
        if base_passes or cur_passes:
            pass_rate_deltas[key] = (_rate(base_passes), _rate(cur_passes))

        base_scores = [float(fb.score) for fb in base_column if _is_num(fb)]
        cur_scores = [float(fb.score) for fb in cur_column if _is_num(fb)]
        if base_scores or cur_scores:
            mean_deltas[key] = (_mean(base_scores), _mean(cur_scores))

        for task_id, before, after in zip(comparable, base_column, cur_column):
            if before is None or after is None:
                continue
            # Flips are only defined when both sides carry a strict bool score.
            if not (isinstance(before.score, bool) and isinstance(after.score, bool)):
                continue
            if before.score == after.score:
                continue
            bucket = flipped_to_fail if before.score else flipped_to_pass
            bucket.append((key, task_id))

    return RunDiff(
        current_run_id=current.run_id,
        baseline_run_id=baseline.run_id,
        comparable_tasks=tuple(comparable),
        pass_rate_deltas=pass_rate_deltas,
        mean_deltas=mean_deltas,
        flipped_to_fail=tuple(flipped_to_fail),
        flipped_to_pass=tuple(flipped_to_pass),
        only_in_current=only_in_current,
        only_in_baseline=only_in_baseline,
        content_changed=content_changed,
        scorers_only_in_current=scorers_only_in_current,
        scorers_only_in_baseline=scorers_only_in_baseline,
    )