Give your LangChain agents visual web perception — screenshot any URL and pass the image to a multimodal LLM.
Requires: langchain-core, httpx — no other dependencies for the tool itself. (The agent example below additionally uses the langchain and langchain-openai packages.)
from langchain_hermes import HermesScreenshotTool
from langchain_openai import ChatOpenAI
from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain_core.prompts import ChatPromptTemplate

# The tool works out of the box with no API key. A free key raises the
# rate limit (50 requests/day):
#   screenshot_tool = HermesScreenshotTool(api_key="hf_your_key_here")
screenshot_tool = HermesScreenshotTool()

model = ChatOpenAI(model="gpt-4o")
toolset = [screenshot_tool]

# Standard tool-calling prompt: system role, user input, and a
# scratchpad placeholder where the agent records tool calls.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a helpful assistant with web vision."),
        ("human", "{input}"),
        ("placeholder", "{agent_scratchpad}"),
    ]
)

executor = AgentExecutor(
    agent=create_tool_calling_agent(model, toolset, prompt),
    tools=toolset,
)

response = executor.invoke(
    {"input": "Screenshot https://example.com and describe what you see"}
)
print(response["output"])
| Parameter | Type | Default | Description |
|---|---|---|---|
| url | str | required | Full URL to screenshot |
| full_page | bool | false | Capture full scrolled page height |
| width | int | 1280 | Viewport width (320–2560px) |
| format | str | "webp" | Output: webp, png, jpeg, pdf |
| delay | int | 0 | Wait ms after load (0–10000) |
| dark_mode | bool | false | Enable dark mode rendering |
| block_ads | bool | true | Block ads and trackers |
from langchain_hermes import HermesAsyncScreenshotTool

# Drop-in async variant — use this inside async agent chains.
async_tool = HermesAsyncScreenshotTool(api_key="hf_your_key_here")

# Like the sync tool, it returns a base64-encoded data URL, e.g.:
#   data:image/webp;base64,UklGRh4AAABXRUJQ...
The tool returns a base64-encoded image data URL, e.g. `data:image/webp;base64,...`.
Pass this directly to a multimodal LLM (GPT-4o, Claude, Gemini) for visual analysis. The tool calls /api/screenshot at hermesforge.dev.