Skip to content

export_pairwise_cases

autogen.beta.eval.scorers.human_pairwise.export_pairwise_cases async #

export_pairwise_cases(source_a, source_b, *, criteria, out, suite=None, seed=None)

Write a blinded JSONL labeling manifest for the paired traces.

One line per (task, criterion): `{case_id, task_id, criterion, task_input, response_1, response_2, first_variant}`. `first_variant` records which variant is shown as Response 1 (it is the de-blinding key, so a labeling UI must not show it). A labeler adds `preferred` (`"1"`/`"2"`/`"tie"`) to each line; feed the result to `human_labels`.

Source code in autogen/beta/eval/scorers/human_pairwise.py
async def export_pairwise_cases(
    source_a: TraceSource,
    source_b: TraceSource,
    *,
    criteria: Iterable[str],
    out: str,
    suite: Suite | None = None,
    seed: int | None = None,
) -> Path:
    """Export paired traces as a blinded JSONL manifest for human labeling.

    Traces from ``source_a`` and ``source_b`` are paired by ``task_id``. A
    trace in ``source_a`` with no ``task_id``, or with no counterpart in
    ``source_b``, is skipped. When ``source_b`` lists several traces for the
    same ``task_id``, the last one listed is used.

    For every paired task one record is written per criterion, with the keys
    ``case_id``, ``task_id``, ``criterion``, ``task_input``, ``response_1``,
    ``response_2`` and ``first_variant``. ``first_variant`` (``"a"`` or
    ``"b"``) names the variant presented as Response 1; it is the de-blinding
    key, so a labeling UI must not display it. The same ordering is used for
    all criteria of a task. Labelers add a ``preferred`` value
    ("1"/"2"/"tie") to each line, and the result is consumed by
    :func:`human_labels`.

    Args:
        source_a: Trace source for variant "a".
        source_b: Trace source for variant "b".
        criteria: Criterion names; each paired task yields one line per name.
        out: Destination file path. Missing parent directories are created.
        suite: Optional suite used to look up each task's ``"input"`` value.
            Tasks not found there get a ``task_input`` of ``None``.
        seed: Seed for the random response ordering, for reproducibility.

    Returns:
        The path of the written manifest.
    """
    criterion_names = list(criteria)
    shuffler = random.Random(seed)

    # Index the suite (if any) so task inputs can be attached to each case.
    known_tasks = {}
    if suite is not None:
        for suite_task in suite:
            known_tasks[suite_task.task_id] = suite_task

    candidates = []
    async for listed in source_a.list():
        candidates.append(listed)

    # Later refs overwrite earlier ones that share a task_id.
    counterpart: dict[str, TraceRef] = {
        listed.task_id: listed
        async for listed in source_b.list()
        if listed.task_id is not None
    }

    records: list[dict[str, Any]] = []
    for candidate in candidates:
        task_id = candidate.task_id
        if not (task_id is not None and task_id in counterpart):
            continue

        text_a = _final_text(await source_a.load(candidate))
        text_b = _final_text(await source_b.load(counterpart[task_id]))

        task = known_tasks.get(task_id)
        if not task:
            # No usable suite entry: fall back to a placeholder with no inputs.
            task = Task(task_id=task_id, inputs={})

        # One draw per task, so every criterion sees the same ordering.
        shown_first = shuffler.choice(["a", "b"])
        if shown_first == "a":
            first_text, second_text = text_a, text_b
        else:
            first_text, second_text = text_b, text_a

        records.extend(
            {
                "case_id": f"{task_id}::{name}",
                "task_id": task_id,
                "criterion": name,
                "task_input": task.inputs.get("input"),
                "response_1": first_text,
                "response_2": second_text,
                "first_variant": shown_first,
            }
            for name in criterion_names
        )

    destination = Path(out)
    destination.parent.mkdir(parents=True, exist_ok=True)
    payload = "".join(f"{json.dumps(record)}\n" for record in records)
    destination.write_text(payload, encoding="utf-8")
    return destination