Integrating a Screenshot API as a LangChain Tool
LangChain's tool interface is one of the cleanest patterns for giving language models access to external capabilities. Here's how to wrap a screenshot API as a LangChain tool — giving your agent the ability to see any web page.
The Tool Interface
LangChain tools follow a simple pattern: a name, a description (used by the LLM to decide when to invoke the tool), and a callable that takes a string input and returns a string output.
from langchain.tools import BaseTool
from pydantic import BaseModel, Field
from typing import Optional, Type
import requests
import base64
# Argument schema for the screenshot tool: a single required ``url`` string.
# The field description below is what the LLM sees when filling in arguments.
class ScreenshotInput(BaseModel):
    url: str = Field(
        description="The URL to take a screenshot of",
    )
class ScreenshotTool(BaseTool):
    """LangChain tool that captures a web page through the screenshot API.

    The page is requested as a 1280px-wide PNG and returned as a
    ``data:image/png;base64,...`` URI string. Both the sync (``_run``) and
    async (``_arun``) paths share the same request arguments and the same
    response handling, so they behave identically:

    * HTTP 429 returns a human-readable rate-limit message (no exception),
      letting the agent report the condition instead of aborting the chain.
    * Any other HTTP error status raises via ``raise_for_status()``.

    Attributes are declared as pydantic fields; ``api_key`` is required and
    is sent as a bearer token.
    """

    name: str = "web_screenshot"
    description: str = (
        "Captures a screenshot of any web page and returns the image as a base64 string. "
        "Use this when you need to see what a web page looks like, analyze its layout, "
        "check for errors, or extract visual information."
    )
    args_schema: Type[BaseModel] = ScreenshotInput
    api_key: str

    def _request_args(self, url: str) -> dict:
        """Return the keyword arguments for the HTTP GET, shared by sync and async calls."""
        return {
            "url": "https://hermesforge.dev/api/screenshot",
            "params": {"url": url, "width": 1280, "format": "png"},
            "headers": {"Authorization": f"Bearer {self.api_key}"},
            "timeout": 30,
        }

    @staticmethod
    def _render(resp) -> str:
        """Turn an HTTP response into the tool's string result.

        Only ``status_code``, ``raise_for_status()`` and ``content`` are
        used, which both ``requests`` and ``httpx`` responses provide.
        """
        if resp.status_code == 429:
            return "Rate limit reached. Cannot capture screenshot."
        resp.raise_for_status()
        img_b64 = base64.standard_b64encode(resp.content).decode()
        return f"data:image/png;base64,{img_b64}"

    def _run(self, url: str) -> str:
        """Capture ``url`` synchronously and return a PNG data URI (or a rate-limit message)."""
        resp = requests.get(**self._request_args(url))
        return self._render(resp)

    async def _arun(self, url: str) -> str:
        """Capture ``url`` asynchronously; same result contract as ``_run``."""
        # Imported lazily so httpx is only needed when the async path is used.
        import httpx

        async with httpx.AsyncClient() as client:
            resp = await client.get(**self._request_args(url))
        return self._render(resp)
Using It With a Vision-Capable Agent
The screenshot tool returns a base64 image. For this to be useful, your agent needs to work with a vision model. Here's a complete agent setup:
import os
from langchain_anthropic import ChatAnthropic
from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain_core.prompts import ChatPromptTemplate
# Screenshot tool, authenticated with the key taken from the environment
screenshot_tool = ScreenshotTool(api_key=os.environ["SCREENSHOT_API_KEY"])

# Chat model with vision support
llm = ChatAnthropic(model="claude-opus-4-6", api_key=os.environ["ANTHROPIC_API_KEY"])

# Prompt layout: system role, the user's question, then the slot the agent
# uses for its intermediate tool calls and results
tools = [screenshot_tool]
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a web research assistant. You can capture and analyze "
            "screenshots of web pages to answer questions about their content, "
            "layout, and functionality.",
        ),
        ("human", "{input}"),
        ("placeholder", "{agent_scratchpad}"),
    ]
)

# Bind model, tools and prompt into an agent, then wrap it in an executor
agent = create_tool_calling_agent(llm, tools, prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
Running the Agent
# Ask a question that requires the agent to look at the page
result = agent_executor.invoke(
    {"input": "Take a screenshot of example.com and tell me what the main heading says."}
)
print(result["output"])
Output:
> Entering new AgentExecutor chain...
Invoking: `web_screenshot` with `{'url': 'https://example.com'}`
[image captured]
The main heading on example.com says "Example Domain". Below it, there's a paragraph explaining that this domain may be used as illustrative examples in documents, with a link to more information.
> Finished chain.
A More Practical Example: Monitoring Agent
Here is an agent that checks a list of URLs and reports on their status:
from langchain.tools import tool
@tool
def check_page_status(url: str) -> str:
    """Check if a web page is accessible and rendering correctly by taking a screenshot."""
    # Every outcome, including failures, is reported as text so the agent can
    # relay it instead of the chain aborting on an exception.
    try:
        response = requests.get(
            "https://hermesforge.dev/api/screenshot",
            params={"url": url},
            headers={"Authorization": f"Bearer {os.environ['SCREENSHOT_API_KEY']}"},
            timeout=30,
        )
        # Guard clause: anything other than 200 is reported with its status code.
        if response.status_code != 200:
            return f"Screenshot API returned {response.status_code} for {url}"
        byte_count = len(response.content)
        return f"Page is accessible. Screenshot captured ({byte_count:,} bytes). URL: {url}"
    except Exception as exc:
        return f"Error capturing {url}: {str(exc)}"
# Executor whose only capability is the page-status check
monitoring_agent = AgentExecutor(
    agent=create_tool_calling_agent(llm, [check_page_status], prompt),
    tools=[check_page_status],
    verbose=True,
)

# Pages to include in the monitoring sweep
urls_to_check = [
    "https://yoursite.com",
    "https://yoursite.com/pricing",
    "https://yoursite.com/docs",
]

# Hand the whole list to the agent in a single request
result = monitoring_agent.invoke(
    {
        "input": "Check the following URLs and report which ones are up and which are down: "
        + ", ".join(urls_to_check)
    }
)
Handling Rate Limits in Chains
When building multi-step chains that might capture many screenshots, handle rate limits explicitly:
from langchain_core.runnables import RunnableLambda
import time
def screenshot_with_retry(url: str, api_key: str, max_retries: int = 2) -> bytes:
    """Fetch a screenshot of ``url``, retrying when the API answers HTTP 429.

    Up to ``max_retries`` additional attempts are made after the first one,
    sleeping 1s, 2s, 4s, ... between them. Returns the raw response body.

    Raises:
        RuntimeError: if the API is still rate limiting after the last retry.
        requests.HTTPError: for any other error status (via ``raise_for_status``).
    """
    auth_headers = {"Authorization": f"Bearer {api_key}"}
    attempt = 0
    while attempt <= max_retries:
        response = requests.get(
            "https://hermesforge.dev/api/screenshot",
            params={"url": url},
            headers=auth_headers,
            timeout=30,
        )
        # Anything other than a rate-limit reply is final: return or raise.
        if response.status_code != 429:
            response.raise_for_status()
            return response.content
        # Rate limited with no attempts left.
        if attempt >= max_retries:
            raise RuntimeError(f"Rate limited after {max_retries} retries")
        time.sleep(2 ** attempt)  # exponential backoff
        attempt += 1
    # Only reached when the loop body never runs (negative max_retries).
    raise RuntimeError("Should not reach here")
Caching Screenshots in LCEL Chains
In LangChain Expression Language (LCEL) pipelines, you may want to cache screenshots to avoid duplicate API calls:
from functools import lru_cache
class CachedScreenshotTool(ScreenshotTool):
    """Screenshot tool with in-memory caching.

    Results are memoized per URL so repeated requests for the same page do
    not hit the API again. Only successful captures (strings starting with
    ``data:``) are stored: the parent ``_run`` reports rate limiting as a
    plain message instead of raising, and caching that message would make a
    transient 429 permanent for that URL.

    The cache is unbounded and never expires entries.
    """

    # NOTE(review): whether this default dict is shared between instances or
    # copied per instance depends on the pydantic version in use; confirm.
    _cache: dict = {}

    def _remember(self, url: str, result: str) -> str:
        """Store ``result`` for ``url`` if it is a real capture, then return it."""
        if result.startswith("data:"):
            self._cache[url] = result
        return result

    def _run(self, url: str) -> str:
        """Return the cached capture for ``url``, fetching it on a cache miss."""
        cached = self._cache.get(url)
        if cached is not None:
            return cached
        return self._remember(url, super()._run(url))

    async def _arun(self, url: str) -> str:
        """Async counterpart of ``_run``; shares the same cache."""
        cached = self._cache.get(url)
        if cached is not None:
            return cached
        return self._remember(url, await super()._arun(url))
Full Example: Competitor Analysis Agent
# Executor with a prompt specialised for competitive analysis; it reuses the
# model and screenshot tool configured earlier.
competitor_agent = AgentExecutor(
    agent=create_tool_calling_agent(
        llm,
        [screenshot_tool],
        ChatPromptTemplate.from_messages(
            [
                ("system", """You are a competitive intelligence analyst.
When asked to analyze a competitor, capture screenshots of their key pages
and provide structured analysis covering:
1. Value proposition (what problem they claim to solve)
2. Pricing structure (if visible)
3. Target customer (based on messaging and design)
4. Key differentiators
5. Call-to-action strategy"""),
                ("human", "{input}"),
                ("placeholder", "{agent_scratchpad}"),
            ]
        ),
    ),
    tools=[screenshot_tool],
    verbose=True,
)

# Kick off an analysis of a single homepage
result = competitor_agent.invoke({"input": "Analyze the homepage of stripe.com"})
Installation
pip install langchain langchain-anthropic anthropic requests
Set environment variables:
export ANTHROPIC_API_KEY=your_key
export SCREENSHOT_API_KEY=your_hermesforge_key
Get your API keys at hermesforge.dev/api/keys. The API also exposes an OpenAPI spec at /openapi.json if you want to use LangChain's create_openapi_agent for auto-generated tools.