You can send images to multimodal LLMs for vision-based tasks like screenshot analysis, image processing, and visual QA:
examples/01_standalone_sdk/17_image_input.py
"""OpenHands Agent SDK — Image Input Example.This script mirrors the basic setup from ``examples/01_hello_world.py`` but addsvision support by sending an image to the agent alongside text instructions."""import osfrom pydantic import SecretStrfrom openhands.sdk import ( LLM, Agent, Conversation, Event, ImageContent, LLMConvertibleEvent, Message, TextContent, get_logger,)from openhands.sdk.tool.registry import register_toolfrom openhands.sdk.tool.spec import Toolfrom openhands.tools.execute_bash import BashToolfrom openhands.tools.file_editor import FileEditorToolfrom openhands.tools.task_tracker import TaskTrackerToollogger = get_logger(__name__)# Configure LLM (vision-capable model)api_key = os.getenv("LLM_API_KEY")assert api_key is not None, "LLM_API_KEY environment variable is not set."model = os.getenv("LLM_MODEL", "openhands/claude-sonnet-4-5-20250929")base_url = os.getenv("LLM_BASE_URL")llm = LLM( usage_id="vision-llm", model=model, base_url=base_url, api_key=SecretStr(api_key),)assert llm.vision_is_active(), "The selected LLM model does not support vision input."cwd = os.getcwd()register_tool("BashTool", BashTool)register_tool("FileEditorTool", FileEditorTool)register_tool("TaskTrackerTool", TaskTrackerTool)agent = Agent( llm=llm, tools=[ Tool( name="BashTool", ), Tool(name="FileEditorTool"), Tool(name="TaskTrackerTool"), ],)llm_messages = [] # collect raw LLM messages for inspectiondef conversation_callback(event: Event) -> None: if isinstance(event, LLMConvertibleEvent): llm_messages.append(event.to_llm_message())conversation = Conversation( agent=agent, callbacks=[conversation_callback], workspace=cwd)IMAGE_URL = "https://github.com/OpenHands/docs/raw/main/openhands/static/img/logo.png"conversation.send_message( Message( role="user", content=[ TextContent( text=( "Study this image and describe the key elements you see. " "Summarize them in a short paragraph and suggest a catchy caption." ) ), ImageContent(image_urls=[IMAGE_URL]), ], ))conversation.run()conversation.send_message( "Great! Please save your description and caption into image_report.md.")conversation.run()print("=" * 100)print("Conversation finished. Got the following LLM messages:")for i, message in enumerate(llm_messages): print(f"Message {i}: {str(message)[:200]}")
Running the Example
export LLM_API_KEY="your-api-key"
cd agent-sdk
uv run python examples/01_standalone_sdk/17_image_input.py
The LLM you use must support image inputs; llm.vision_is_active() must return True for the selected model.
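If you are unsure whether your configured model qualifies, you can check before sending any images. The following is a minimal sketch that fails fast when vision is unavailable; it assumes LLM_API_KEY is set, and the model name is only an example:

import os

from pydantic import SecretStr

from openhands.sdk import LLM

# Construct the LLM and verify it accepts image inputs before doing any work.
llm = LLM(
    usage_id="vision-llm",
    model=os.getenv("LLM_MODEL", "openhands/claude-sonnet-4-5-20250929"),
    api_key=SecretStr(os.environ["LLM_API_KEY"]),
)
if not llm.vision_is_active():
    raise RuntimeError("The selected LLM model does not support vision input.")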
Pass images along with text in the message content:
from openhands.sdk import ImageContent, Message, TextContent

IMAGE_URL = "https://github.com/OpenHands/OpenHands/raw/main/docs/static/img/logo.png"

conversation.send_message(
    Message(
        role="user",
        content=[
            TextContent(
                text=(
                    "Study this image and describe the key elements you see. "
                    "Summarize them in a short paragraph and suggest a catchy caption."
                )
            ),
            ImageContent(image_urls=[IMAGE_URL]),
        ],
    )
)
This works with any vision-capable multimodal LLM, such as GPT-4 with vision or Claude models with vision support.
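If the image lives on disk rather than at a public URL, one option is to encode it as a base64 data URL and pass that string in image_urls. Whether data URLs are accepted depends on the model and provider, so treat this as an assumption to verify; screenshot.png below is only a placeholder path:

import base64
from pathlib import Path

from openhands.sdk import ImageContent, Message, TextContent

# Assumption: the provider accepts data: URLs in image_urls, as many
# OpenAI-compatible vision APIs do. Verify with your model/provider.
image_bytes = Path("screenshot.png").read_bytes()
data_url = "data:image/png;base64," + base64.b64encode(image_bytes).decode()

conversation.send_message(
    Message(
        role="user",
        content=[
            TextContent(text="Describe what this screenshot shows."),
            ImageContent(image_urls=[data_url]),
        ],
    )
)
conversation.run()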