Image Input/Output with OpenAI V2 Client and AG2 Agents#
Author: Yixuan Zhai
This notebook demonstrates how to use the new OpenAI V2 Client (api_type: "openai_v2") with AG2’s agent system for image input and vision tasks.
What is OpenAI V2 Client?#
The V2 client is a next-generation LLM client that implements both ModelClient and ModelClientV2 protocols, returning rich UnifiedResponse objects with:
- Typed content blocks: `TextContent`, `ReasoningContent`, `ToolCallContent`, `CitationContent`
- Forward compatibility: `GenericContent` handles unknown future content types
- Rich metadata: Full reasoning blocks, citations, and tool execution details
- Type safety: Pydantic validation for all response data
- Cost tracking: Automatic per-response cost calculation
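Because each block carries a `type` discriminator, downstream code can dispatch on it without parsing raw dicts. A minimal sketch, assuming a `UnifiedResponse` named `response` (see Example 8 below for how to obtain one) and that text blocks expose a `.text` attribute; exact type strings and attribute names may vary by AG2 version:

```python
# Iterate the typed content blocks of a UnifiedResponse.
# Assumption: text blocks expose `.text`; type strings may vary by version.
for message in response.messages:
    for block in message.content:
        if block.type == "text":
            print("Text:", block.text)
        elif block.type == "reasoning":
            print("Reasoning:", type(block).__name__)
        else:
            # Unknown future types arrive as GenericContent instead of failing
            print("Other block:", block.type)
```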
Comparison with Standard OpenAI Client#
| Feature | Standard Client | V2 Client |
|---|---|---|
| Response Type | ChatCompletion (dict) | UnifiedResponse (Pydantic) |
| Content Blocks | Untyped | Strongly typed |
| Reasoning Support | String | ReasoningContent with metadata |
| Agent Integration | ✅ Yes | ✅ Yes (via duck typing) |
| Type Safety | Minimal | Full Pydantic validation |
| Extensibility | Fixed | Forward-compatible |
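The difference is most visible when reading a reply. A rough sketch, where `completion` stands for a standard-client result and `response` for a V2 `UnifiedResponse` (V2 accessors as demonstrated in Example 8 below):

```python
# Standard client: untyped ChatCompletion-style access
text = completion.choices[0].message.content

# V2 client: typed UnifiedResponse with convenience accessors
text = response.text                   # flattened text of the reply
blocks = response.messages[0].content  # strongly typed content blocks
```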
Installation#
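AG2 with OpenAI support can typically be installed via pip (adjust the package name or extras to your environment):

```bash
pip install "ag2[openai]"
```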
Setup: Create Assistant with V2 Client#
import os
import textwrap
from autogen import AssistantAgent, UserProxyAgent
from autogen.io.run_response import Cost
# Helper function to extract total cost from ChatResult.cost dict
def get_total_cost(cost_dict):
"""Extract total cost from ChatResult.cost dict structure."""
total = 0.0
for usage_type in cost_dict.values():
if isinstance(usage_type, dict):
for model_usage in usage_type.values():
if isinstance(model_usage, dict) and "cost" in model_usage:
total += model_usage["cost"]
return total
# Helper function to extract cost from run response
def get_total_cost_from_run(run_response_cost):
"""Extract total cost from run response object."""
if isinstance(run_response_cost, Cost):
return run_response_cost.usage_including_cached_inference.total_cost
return 0.0
# Configure LLM to use V2 client
llm_config = {
"config_list": [
{
"api_type": "openai_v2", # <-- Key: use V2 client architecture
"model": "gpt-4o-mini", # Vision-capable model
"api_key": os.getenv("OPENAI_API_KEY"),
}
],
"temperature": 0.3,
}
# Create vision assistant
assistant = AssistantAgent(
name="VisionBot",
llm_config=llm_config,
system_message=textwrap.dedent("""
You are an AI assistant with vision capabilities.
You can analyze images and provide detailed, accurate descriptions.
""").strip(),
)
# Create user proxy
user_proxy = UserProxyAgent(
name="User",
human_input_mode="NEVER",
max_consecutive_auto_reply=0,
code_execution_config=False,
)
# Test image URL
IMAGE_URL = "https://upload.wikimedia.org/wikipedia/commons/3/3b/BlkStdSchnauzer2.jpg"
print("✓ Assistant with V2 client created")
Example 1: Simple Image Description#
This example uses the formal image input format (typed content blocks), which is recommended to reduce hallucination.
# Formal image input format (recommended)
message_with_image = {
"role": "user",
"content": [
{"type": "text", "text": "Describe this image in one sentence."},
{"type": "image_url", "image_url": {"url": IMAGE_URL}},
],
}
# Initiate chat with image
chat_result = user_proxy.initiate_chat(assistant, message=message_with_image, max_turns=1, summary_method="last_msg")
print("\n=== Response ===")
print(chat_result.summary)
print(f"\nCost: ${get_total_cost(chat_result.cost):.4f}")
Example 2: Detailed Image Analysis#
detailed_message = {
"role": "user",
"content": [
{"type": "text", "text": "Analyze this image in detail. What breed is this dog? What are its characteristics?"},
{"type": "image_url", "image_url": {"url": IMAGE_URL}},
],
}
chat_result = user_proxy.initiate_chat(
assistant,
message=detailed_message,
max_turns=1,
clear_history=True, # Start fresh conversation
)
print(chat_result.summary)
print(f"\nCost: ${get_total_cost(chat_result.cost):.4f}")
Example 3: Multi-Turn Conversation with Image Context#
The assistant maintains context across turns, so there is no need to resend the image.
# First turn: Show image and ask initial question
initial_message = {
"role": "user",
"content": [
{"type": "text", "text": "What animal is in this image?"},
{"type": "image_url", "image_url": {"url": IMAGE_URL}},
],
}
chat_result = user_proxy.initiate_chat(assistant, message=initial_message, max_turns=1, clear_history=True)
print("Turn 1:")
print(chat_result.summary)
# Second turn: Follow-up question (context maintained)
followup = user_proxy.send(
message="What are the distinctive characteristics of this breed?", recipient=assistant, request_reply=True
)
print("\nTurn 2:")
print(followup)
# Third turn: Another follow-up
followup2 = user_proxy.send(message="Is this dog well-suited for families?", recipient=assistant, request_reply=True)
print("\nTurn 3:")
print(followup2)
Example 4: Using run() Interface#
The V2 client also works with the run() interface.
# Use run() interface
response = assistant.run(
message={
"role": "user",
"content": [
{"type": "text", "text": "What is the primary color of this dog's coat?"},
{"type": "image_url", "image_url": {"url": IMAGE_URL}},
],
},
user_input=True,
max_turns=1,
clear_history=True,
)
# Process the response
response.process()
print("=== Run Interface Result ===")
print(f"Summary: {response.summary}")
print(f"Cost: ${get_total_cost_from_run(response.cost):.4f}")
Example 5: Multiple Images Comparison#
The V2 client can handle multiple images in a single request.
# Two different dog images
IMAGE_URL_1 = "https://upload.wikimedia.org/wikipedia/commons/3/3b/BlkStdSchnauzer2.jpg"
IMAGE_URL_2 = "https://upload.wikimedia.org/wikipedia/commons/6/6e/Golde33443.jpg"
comparison_message = {
"role": "user",
"content": [
{"type": "text", "text": "Compare these two dogs. What are the differences between them?"},
{"type": "image_url", "image_url": {"url": IMAGE_URL_1}},
{"type": "image_url", "image_url": {"url": IMAGE_URL_2}},
],
}
chat_result = user_proxy.initiate_chat(assistant, message=comparison_message, max_turns=1, clear_history=True)
print(chat_result.summary)
print(f"\nCost for 2 images: ${get_total_cost(chat_result.cost):.4f}")
Example 6: Group Chat with V2 Client#
The V2 client works seamlessly in multi-agent group chat scenarios.
from autogen import ConversableAgent
from autogen.agentchat.groupchat import GroupChat, GroupChatManager
# Create specialized agents with V2 client
image_analyst = ConversableAgent(
name="ImageAnalyst",
llm_config=llm_config,
human_input_mode="NEVER",
system_message="You analyze images. Keep responses brief and focused on visual details.",
)
breed_expert = ConversableAgent(
name="BreedExpert",
llm_config=llm_config,
human_input_mode="NEVER",
system_message="You are a dog breed expert. Provide breed-specific information concisely.",
)
# Create user proxy for group chat
group_user = UserProxyAgent(
name="Coordinator", human_input_mode="NEVER", max_consecutive_auto_reply=0, code_execution_config=False
)
# Create group chat
groupchat = GroupChat(
agents=[group_user, image_analyst, breed_expert], messages=[], max_round=4, speaker_selection_method="round_robin"
)
# Create manager with V2 client
manager = GroupChatManager(groupchat=groupchat, llm_config=llm_config)
# Start group chat with image
group_message = {
"role": "user",
"content": [
{"type": "text", "text": "Team, analyze this dog image and tell me about the breed."},
{"type": "image_url", "image_url": {"url": IMAGE_URL}},
],
}
chat_result = group_user.initiate_chat(manager, message=group_message, max_turns=3)
print("\n=== Group Chat Result ===")
print(f"Summary: {chat_result.summary}")
print(f"Cost: ${get_total_cost(chat_result.cost):.4f}")
print("\n=== Participants ===")
participant_names = {msg.get("name") for msg in chat_result.chat_history if msg.get("name")}
print(f"Agents participated: {participant_names}")
Example 6b: Pattern-Based Group Chat (Modern API)#
AG2’s pattern-based group chat API provides a modern, flexible way to orchestrate multi-agent conversations. The V2 client works seamlessly with this pattern system.
from autogen.agentchat.group.multi_agent_chat import initiate_group_chat
from autogen.agentchat.group.patterns import DefaultPattern
# Create specialized agents with V2 client
data_analyst = ConversableAgent(
name="DataAnalyst",
llm_config=llm_config,
human_input_mode="NEVER",
system_message="You analyze data and provide insights. Be brief and focused.",
)
quality_reviewer = ConversableAgent(
name="QualityReviewer",
llm_config=llm_config,
human_input_mode="NEVER",
system_message="You review analysis quality and provide feedback. Be concise.",
)
# Create pattern-based group chat
pattern = DefaultPattern(
initial_agent=data_analyst, # Starting agent
agents=[data_analyst, quality_reviewer], # All agents in group
)
# Initiate group chat using pattern API
chat_result, context_variables, last_agent = initiate_group_chat(
pattern=pattern,
messages="Analyze the number 42 and discuss its significance.",
max_rounds=3,
)
print("\n=== Pattern-Based Group Chat Result ===")
print(f"Summary: {chat_result.summary}")
print(f"Cost: ${get_total_cost(chat_result.cost):.4f}")
print(f"Last speaker: {last_agent.name}")
print("\n=== Participants ===")
participant_names = {msg.get("name") for msg in chat_result.chat_history if msg.get("name")}
print(f"Agents: {participant_names}")
Example 7: Multi-Question Conversation About One Image#
Send the image once, then ask follow-up questions in the same conversation while tracking cost.
total_cost = 0
conversations = [
"What animal is this?",
"What breed specifically?",
"What color is it?",
]
# First, show the image
chat_result = user_proxy.initiate_chat(
assistant,
message={
"role": "user",
"content": [{"type": "text", "text": conversations[0]}, {"type": "image_url", "image_url": {"url": IMAGE_URL}}],
},
max_turns=1,
clear_history=True,
)
total_cost += get_total_cost(chat_result.cost)
print(f"Q1: {conversations[0]}")
print(f"A1: {chat_result.summary}")
print(f"Cost: ${get_total_cost(chat_result.cost):.4f}\n")
# Follow-up questions (image context maintained)
for question in conversations[1:]:
response = user_proxy.send(message=question, recipient=assistant, request_reply=True)
print(f"Q: {question}")
print(f"A: {response}\n")
print(f"=== Total Conversation Cost: ${total_cost:.4f} ===")
Example 8: Accessing the UnifiedResponse (Advanced)#
While agents work with the V2 client transparently, you can also use the client directly for advanced features.
# Import the V2 client directly
from autogen.llm_clients import OpenAICompletionsClient
# Create client directly
v2_client = OpenAICompletionsClient(api_key=os.getenv("OPENAI_API_KEY"))
# Make a direct call
response = v2_client.create({
"model": "gpt-4o-mini",
"messages": [
{
"role": "user",
"content": [
{"type": "text", "text": "Describe this image."},
{"type": "image_url", "image_url": {"url": IMAGE_URL}},
],
}
],
})
print("=== Direct V2 Client Access ===")
print(f"Response Type: {type(response).__name__}")
print(f"Response ID: {response.id}")
print(f"Model: {response.model}")
print(f"Provider: {response.provider}")
print(f"\nText: {response.text}")
print("\n=== Typed Content Blocks ===")
print(f"Number of messages: {len(response.messages)}")
print(f"Message role: {response.messages[0].role}")
print(f"Content blocks: {len(response.messages[0].content)}")
for i, block in enumerate(response.messages[0].content):
print(f" Block {i}: {block.type} - {type(block).__name__}")
print("\n=== Rich Metadata ===")
print(f"Reasoning blocks: {len(response.reasoning)}")
if hasattr(response, "tool_calls"):
print(f"Tool calls: {len(response.tool_calls)}")
print(f"Usage: {response.usage}")
print(f"Cost: ${response.cost if response.cost is not None else 0:.4f}")
Summary#
Key Benefits of V2 Client with Agents#
- Seamless Integration: Works with existing `AssistantAgent` and `UserProxyAgent`
- Rich Response Data: Access to typed content blocks (reasoning, citations, etc.)
- Vision Support: Full multimodal capabilities with formal image input
- Cost Tracking: Automatic per-response cost calculation
- Type Safety: Pydantic validation for all response data
- Forward Compatible: GenericContent handles unknown future types
- Duck Typing: UnifiedResponse works with agent system via duck typing
Usage Pattern#
# Simple: Just change api_type
llm_config = {
"config_list": [{
"api_type": "openai_v2", # <-- That's it!
"model": "gpt-4o-mini",
"api_key": "...",
}]
}
assistant = AssistantAgent(llm_config=llm_config)
# Everything works as before, but with rich UnifiedResponse internally
When to Use V2 Client#
- ✅ Need access to reasoning blocks (o1/o3 models)
- ✅ Want typed, structured response data
- ✅ Building systems that need forward compatibility
- ✅ Require rich metadata and citations
- ✅ Working with vision/multimodal models
Migration from Standard Client#
No code changes needed! Just update api_type:
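For example, a config entry migrates by swapping only the api_type value:

```python
# Before: standard OpenAI client
{"api_type": "openai", "model": "gpt-4o-mini", "api_key": os.getenv("OPENAI_API_KEY")}

# After: V2 client -- same fields, new api_type
{"api_type": "openai_v2", "model": "gpt-4o-mini", "api_key": os.getenv("OPENAI_API_KEY")}
```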