Screenshot APIs for LLM Vision Workflows: Feeding Web Pages to ChatGPT, Claude, and GPT-4
Vision-capable language models (GPT-4V, Claude, Gemini) can analyze images and answer questions about what they see. Screenshot APIs turn any web page into an image that LLMs can read. Together, they enable a class of web automation that neither approach supports alone: intelligent analysis of dynamic, JavaScript-rendered pages without writing scrapers.
This guide covers the practical patterns for combining screenshot APIs with LLMs — from single-page analysis to continuous monitoring pipelines.
Why Screenshots + LLMs?
Traditional web scraping extracts structured data from HTML. It breaks when:
- Pages render content via JavaScript after load
- Content is embedded in images or CSS (prices, phone numbers, CAPTCHAs)
- You need visual analysis — layout, branding, design quality
- The page structure changes frequently
LLMs with vision bypass these limitations: they see the rendered page exactly as a human would, and they can answer arbitrary questions about what they see.
Common use cases observed in the wild:
- Competitor analysis: "What are the three main pricing tiers on this page?"
- Photography portfolio comparison: "Compare the lighting style of these two portfolio pages"
- UI review: "Does this landing page have a clear primary call-to-action?"
- Content extraction: "What is the delivery time shown on this checkout page?"
- Change detection: "Describe any changes between these two screenshots of the same page"
Basic Pattern: Screenshot → LLM Analysis
import requests
import base64
import os
# API credentials are read from the environment when the module is imported;
# a missing variable raises KeyError here instead of on the first request.
SCREENSHOT_API_KEY = os.environ["SCREENSHOT_API_KEY"]
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
def screenshot_to_base64(url, width=1440, timeout=60):
    """Capture a screenshot of ``url`` and return it as a base64 string.

    Args:
        url: Address of the page to capture.
        width: Value sent as the API's ``width`` parameter.
        timeout: Seconds to wait for the screenshot API. ``requests`` has no
            default timeout, so without this a stalled connection would
            block the caller indefinitely.

    Returns:
        The response body (requested as PNG), base64-encoded as ``str``.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    resp = requests.get(
        "https://hermesforge.dev/api/screenshot",
        params={
            "url": url,
            "width": width,
            "format": "png",
            "full_page": False,
            "delay": 2000,  # presumably milliseconds — confirm against the API docs
            "block_ads": True,
        },
        headers={"X-API-Key": SCREENSHOT_API_KEY},
        timeout=timeout,
    )
    resp.raise_for_status()
    return base64.b64encode(resp.content).decode("utf-8")
def analyze_page_with_gpt4v(url, question):
    """Screenshot ``url`` and return GPT-4o's answer to ``question`` about it.

    Args:
        url: Page to capture via ``screenshot_to_base64``.
        question: Prompt text sent after the image.

    Returns:
        The text content of the first completion choice.

    Raises:
        requests.HTTPError: If the OpenAI API answers with an error status.
        requests.Timeout: If the OpenAI API does not answer in time.
    """
    image_b64 = screenshot_to_base64(url)
    payload = {
        "model": "gpt-4o",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{image_b64}",
                            "detail": "high",
                        },
                    },
                    {"type": "text", "text": question},
                ],
            }
        ],
        "max_tokens": 1000,
    }
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {OPENAI_API_KEY}",
            "Content-Type": "application/json",
        },
        json=payload,
        # requests never times out by default; bound the wait so a stalled
        # connection cannot hang the caller.
        timeout=120,
    )
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]
# Example usage: extract a competitor's pricing tiers from its pricing page
result = analyze_page_with_gpt4v(
    url="https://competitor.com/pricing",
    question="List all pricing tiers, their monthly costs, and key features. Format as JSON.",
)
print(result)
Using Claude for Page Analysis
Claude's vision capabilities are well-suited for detailed web page analysis:
import anthropic
import base64
# Module-level Anthropic client; the key is read from ANTHROPIC_API_KEY at import time.
client = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])
def analyze_page_with_claude(url, question, model="claude-opus-4-6"):
    """Capture ``url`` as a screenshot and return Claude's answer to ``question``.

    The screenshot is sent as a base64 PNG image block followed by the
    question as a text block, in a single user message.
    """
    screenshot = screenshot_to_base64(url)
    image_block = {
        "type": "image",
        "source": {
            "type": "base64",
            "media_type": "image/png",
            "data": screenshot,
        },
    }
    text_block = {"type": "text", "text": question}
    reply = client.messages.create(
        model=model,
        max_tokens=1024,
        messages=[{"role": "user", "content": [image_block, text_block]}],
    )
    # The answer is taken from the first content block of the reply.
    return reply.content[0].text
# Example: have Claude audit a landing page's conversion elements
analysis = analyze_page_with_claude(
    url="https://saas-product.com",
    question=(
        "Analyze this landing page's conversion optimization. "
        "Identify: (1) primary CTA and its prominence, "
        "(2) social proof elements, "
        "(3) pricing visibility, "
        "(4) potential friction points for new visitors. "
        "Rate conversion readiness 1-10."
    ),
)
print(analysis)
Competitor Price Monitoring
A practical pipeline for tracking competitor pricing changes:
import json
import sqlite3
from datetime import datetime
def setup_monitoring(db_path="competitor_monitor.db"):
    """Open the monitoring database and ensure the snapshot table exists.

    Args:
        db_path: SQLite database location. Defaults to the original
            ``competitor_monitor.db`` in the working directory; pass
            ``":memory:"`` for a throwaway database.

    Returns:
        An open ``sqlite3.Connection``; the caller is responsible for
        closing it.
    """
    conn = sqlite3.connect(db_path)
    conn.execute("""
        CREATE TABLE IF NOT EXISTS price_snapshots (
            id INTEGER PRIMARY KEY,
            competitor TEXT,
            url TEXT,
            captured_at TIMESTAMP,
            extracted_pricing TEXT,
            changed INTEGER DEFAULT 0
        )
    """)
    conn.commit()
    return conn
def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown ``` fence, if present."""
    body = text.strip()
    if len(body) >= 6 and body.startswith("```") and body.endswith("```"):
        body = body[3:-3]
        tag, newline, rest = body.partition("\n")
        # The opening fence may carry a language tag such as "json".
        if newline and tag.strip().isalpha():
            body = rest
        return body.strip()
    return body


def extract_pricing(url):
    """Screenshot a pricing page and extract structured pricing data.

    Args:
        url: Pricing page to capture via ``screenshot_to_base64``.

    Returns:
        The parsed JSON reply (the prompt asks for a list of tier dicts,
        ``[]`` when no pricing is visible), or ``None`` when the reply is
        not valid JSON.

    Raises:
        requests.HTTPError: If the OpenAI API answers with an error status.
        requests.Timeout: If the OpenAI API does not answer in time.
    """
    image_b64 = screenshot_to_base64(url)
    # The pieces are joined by implicit concatenation, so the separating
    # space after the example JSON must be written out explicitly.
    prompt = (
        "Extract all pricing tiers from this page. Return ONLY valid JSON in this format: "
        '[{"tier": "Free", "price": "$0/mo", "key_features": ["feature1", "feature2"]}] '
        "If no pricing is visible, return []."
    )
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers={"Authorization": f"Bearer {OPENAI_API_KEY}"},
        json={
            "model": "gpt-4o",
            "messages": [{
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{image_b64}"},
                    },
                    {"type": "text", "text": prompt},
                ],
            }],
            "max_tokens": 500,
        },
        timeout=120,  # requests has no default timeout
    )
    # Surface API errors (auth, rate limit) as HTTPError rather than as a
    # KeyError on the missing "choices" key.
    response.raise_for_status()
    content = response.json()["choices"][0]["message"]["content"]
    try:
        # A reply wrapped in a ```json fence would otherwise fail to parse.
        return json.loads(_strip_code_fence(content))
    except json.JSONDecodeError:
        return None
def check_pricing_change(conn, competitor, url):
    """Capture current pricing and record whether it differs from the last snapshot.

    Args:
        conn: Open SQLite connection with a ``price_snapshots`` table.
        competitor: Name used to look up the previous snapshot.
        url: Pricing page passed to ``extract_pricing``.

    Returns:
        ``None`` when extraction failed; otherwise a dict with ``"changed"``
        (bool) and ``"pricing"`` (the extracted data). The first snapshot
        for a competitor is a baseline and is reported as unchanged,
        matching the alert below, which only fires when a previous snapshot
        exists.
    """
    current = extract_pricing(url)
    if current is None:
        return None
    # sort_keys makes the serialized form stable so string comparison works.
    current_str = json.dumps(current, sort_keys=True)
    last = conn.execute(
        "SELECT extracted_pricing FROM price_snapshots WHERE competitor=? ORDER BY captured_at DESC LIMIT 1",
        (competitor,)
    ).fetchone()
    # A change requires a previous snapshot to compare against.
    changed = last is not None and last[0] != current_str
    if changed:
        print(f"PRICING CHANGED: {competitor}")
        print(f" Previous: {last[0]}")
        print(f" Current: {current_str}")
    conn.execute(
        "INSERT INTO price_snapshots (competitor, url, captured_at, extracted_pricing, changed) VALUES (?,?,?,?,?)",
        (competitor, url, datetime.utcnow().isoformat(), current_str, int(changed))
    )
    conn.commit()
    return {"changed": changed, "pricing": current}
# Run monitoring
COMPETITORS = [
    {"name": "screenshotone", "url": "https://screenshotone.com/pricing"},
    {"name": "urlbox", "url": "https://urlbox.io/pricing"},
    {"name": "browserless", "url": "https://browserless.io/pricing"},
]
conn = setup_monitoring()
try:
    for c in COMPETITORS:
        result = check_pricing_change(conn, c["name"], c["url"])
        # A falsy result means no pricing could be extracted for this page.
        if result:
            status = "CHANGED" if result["changed"] else "unchanged"
            print(f"{status}: {c['name']} — {len(result['pricing'])} tiers found")
finally:
    # Release the SQLite handle even if a capture or API call raises.
    conn.close()
Portfolio Comparison Workflow
For comparing visual work across multiple portfolio sites (photography, design, architecture):
def compare_portfolios(portfolio_urls, comparison_question):
    """
    Compare multiple portfolio pages and return ranked analysis.

    Example question: "Which of these photography portfolios has the most
    professional appearance? Rank them 1-N with brief justification."

    Args:
        portfolio_urls: Pages to capture, in the order they are labelled
            ("Portfolio 1", "Portfolio 2", ...) in the prompt.
        comparison_question: Prompt text appended after all images.

    Returns:
        The text content of the first completion choice.

    Raises:
        requests.HTTPError: If the OpenAI API answers with an error status.
        requests.Timeout: If the OpenAI API does not answer in time.
    """
    # Build the multi-image prompt: a label, then the screenshot, per portfolio.
    content = []
    for position, url in enumerate(portfolio_urls, start=1):
        image_b64 = screenshot_to_base64(url, width=1280)
        content.append({"type": "text", "text": f"Portfolio {position}: {url}"})
        content.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:image/png;base64,{image_b64}",
                "detail": "low",  # use "low" for multi-image comparisons (cheaper, faster)
            },
        })
    content.append({"type": "text", "text": comparison_question})
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers={"Authorization": f"Bearer {OPENAI_API_KEY}"},
        json={
            "model": "gpt-4o",
            "messages": [{"role": "user", "content": content}],
            "max_tokens": 1500,
        },
        timeout=120,  # requests has no default timeout
    )
    # Surface API errors as HTTPError rather than as a KeyError on "choices".
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]
# Example: rank three dating-profile photographers by their portfolio pages
portfolios = [
    f"https://photographer-{letter}.com/portfolio"
    for letter in ("a", "b", "c")
]
analysis = compare_portfolios(
    portfolio_urls=portfolios,
    comparison_question=(
        "Compare these dating profile photography portfolios. "
        "For each, assess: photo quality, variety of environments, "
        "natural vs posed feel, and overall trustworthiness of the photographer. "
        "Which would you recommend for a dating app profile shoot?"
    ),
)
print(analysis)
Automated UI Review Pipeline
QA teams combine screenshots with an LLM to check landing pages for common issues before release:
# Every item is phrased as a problem so that "found": true always means an
# issue; review_page_ui counts each found item as an issue.
UI_CHECKLIST = """
Review this web page screenshot for the following issues.
For each item, answer YES (found) or NO (not found) with a brief note:
1. Broken layout (elements overlapping, text cut off, misaligned sections)
2. Missing images (broken image icons, empty image containers)
3. Weak CTA visibility (the primary action is not obvious or not above the fold)
4. Mobile-hostile elements (horizontal scrollbars, tiny text, unclickable buttons)
5. Missing trust signals (no testimonials, security badges, or company logos visible)
6. Hidden pricing (the user cannot understand the cost without clicking)
Return as JSON: [{"check": "Broken layout", "found": false, "note": "Layout appears intact"}]
"""
def review_page_ui(url, viewport_width=375):  # default to mobile
    """Run the ``UI_CHECKLIST`` review on a page at the given viewport width.

    Args:
        url: Page to capture via ``screenshot_to_base64``.
        viewport_width: Width passed to the screenshot call.

    Returns:
        The parsed list of check results when the reply is JSON of the
        expected shape; otherwise the raw reply text, so nothing is lost.

    Raises:
        requests.HTTPError: If the OpenAI API answers with an error status.
        requests.Timeout: If the OpenAI API does not answer in time.
    """
    image_b64 = screenshot_to_base64(url, width=viewport_width)
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers={"Authorization": f"Bearer {OPENAI_API_KEY}"},
        json={
            "model": "gpt-4o",
            "messages": [{
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
                    {"type": "text", "text": UI_CHECKLIST},
                ],
            }],
            "max_tokens": 800,
        },
        timeout=120,  # requests has no default timeout
    )
    # Surface API errors as HTTPError rather than as a KeyError on "choices".
    response.raise_for_status()
    content = response.json()["choices"][0]["message"]["content"]
    try:
        results = json.loads(content)
        issues = [r for r in results if r["found"]]
        print(f"UI review for {url} ({viewport_width}px):")
        print(f" {len(issues)} issues found out of {len(results)} checks")
        for issue in issues:
            print(f" ⚠ {issue['check']}: {issue['note']}")
        return results
    except (json.JSONDecodeError, KeyError, TypeError):
        # Only malformed or unexpectedly shaped replies fall back to the raw
        # text; unrelated errors are no longer swallowed.
        return content
Cost and Rate Limit Considerations
| Approach | Screenshot cost | LLM cost | Per analysis |
|---|---|---|---|
| GPT-4o (high detail) | ~$0.002/screenshot | ~$0.005-0.015/image | ~$0.02 |
| GPT-4o (low detail) | ~$0.002/screenshot | ~$0.001/image | ~$0.003 |
| Claude Haiku | ~$0.002/screenshot | ~$0.001/image | ~$0.003 |
| Claude Sonnet | ~$0.002/screenshot | ~$0.003/image | ~$0.005 |
For high-volume pipelines (100+ pages/day), use GPT-4o with "detail": "low" or Claude Haiku for bulk analysis. Reserve high-detail GPT-4o or Claude Opus for detailed single-page analysis.
Rate limits: The screenshot API's free tier allows 10 captures/day without authentication. For LLM vision pipelines, create a free API key to raise the limit to 50 captures/day. Most production pipelines need 50-500 screenshots/day — contact us for bulk pricing.
Caching Screenshots for LLM Reuse
Screenshots are expensive to generate (2-5 seconds each). If you're analyzing the same URL with multiple questions, cache the screenshot:
import hashlib
from pathlib import Path
# Screenshots are cached as PNG files in this directory, created on import.
CACHE_DIR = Path("screenshot_cache")
CACHE_DIR.mkdir(exist_ok=True)


def get_or_capture(url, max_age_hours=24):
    """Return a cached screenshot of ``url`` or capture a fresh one.

    Args:
        url: Page to capture; the first 16 hex characters of its SHA-256
            name the cache file.
        max_age_hours: A cached file younger than this is reused.

    Returns:
        The screenshot bytes, base64-encoded as ``str``.

    Raises:
        requests.HTTPError: If the screenshot API answers with an error status.
        requests.Timeout: If the screenshot API does not answer in time.
    """
    import time  # function-scope import keeps this snippet self-contained

    url_hash = hashlib.sha256(url.encode()).hexdigest()[:16]
    cache_file = CACHE_DIR / f"{url_hash}.png"
    if cache_file.exists():
        # st_mtime is seconds since the epoch, so compare it with time.time().
        # datetime.utcnow().timestamp() would treat the naive UTC value as
        # local time and skew the age by the machine's UTC offset.
        age_hours = (time.time() - cache_file.stat().st_mtime) / 3600
        if age_hours < max_age_hours:
            return base64.b64encode(cache_file.read_bytes()).decode("utf-8")
    # Capture fresh
    resp = requests.get(
        "https://hermesforge.dev/api/screenshot",
        params={"url": url, "width": 1440, "format": "png", "delay": 2000},
        headers={"X-API-Key": SCREENSHOT_API_KEY},
        timeout=60,  # requests has no default timeout
    )
    resp.raise_for_status()
    cache_file.write_bytes(resp.content)
    return base64.b64encode(resp.content).decode("utf-8")
def analyze_with_llm(image_b64, question):
    """Ask GPT-4o ``question`` about an already-captured base64 PNG.

    Unlike the URL-based helpers, this takes the image itself, so one
    cached screenshot can be reused for several questions.

    Raises:
        requests.HTTPError: If the OpenAI API answers with an error status.
        requests.Timeout: If the OpenAI API does not answer in time.
    """
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers={"Authorization": f"Bearer {OPENAI_API_KEY}"},
        json={
            "model": "gpt-4o",
            "messages": [{
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
                    {"type": "text", "text": question},
                ],
            }],
            "max_tokens": 1000,
        },
        timeout=120,  # requests has no default timeout
    )
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]


# Now run multiple analyses against the same screenshot
image_b64 = get_or_capture("https://competitor.com")  # captured once, cached
pricing_analysis = analyze_with_llm(image_b64, "Extract pricing information")
ux_review = analyze_with_llm(image_b64, "Review UX and conversion elements")
tech_stack = analyze_with_llm(image_b64, "What tech stack does this site appear to use?")
Summary
Screenshot APIs and LLMs are complementary: one captures rendered web pages reliably; the other interprets them intelligently. The combination unlocks automated analysis that neither can do alone.
The patterns here — single-page analysis, competitor monitoring, portfolio comparison, UI review — cover the most common production use cases. All run with a screenshot API key and an LLM API key, no browser infrastructure required on your end.
API reference: hermesforge.dev/api/docs. Free tier: 50 screenshots/day with a free API key, 10/day anonymous.