Content Archiving at Scale: Building a URL Snapshot Pipeline
Web content changes constantly. A page that exists today may be gone tomorrow, restructured next week, or quietly edited without notice. Screenshot-based archiving captures not just the text but the rendered state — the layout, images, dynamic content, and anything a browser would show.
Here's how to build a production-grade URL snapshot pipeline.
Why Screenshots for Archiving?
HTML archiving (saving the raw markup) breaks constantly: external CSS, JS bundles, CDN-hosted images, and dynamic content mean a raw HTML archive often renders as a broken mess. Screenshots are self-contained: the renderer captures the final visual state. The tradeoff is file size and unsearchability — but for compliance, research, and visual change detection, the tradeoff is worth it.
Use cases: - Compliance: Financial advisors, lawyers, and regulated industries often need to retain evidence of what was communicated when - Research: Academic research on web content at a specific point in time - Competitive monitoring: Tracking how a competitor's pricing or messaging evolves - Content integrity: Detecting unauthorized edits to published content - Legal evidence: Capturing what a page said at a specific date and time
The Core Pipeline
import hashlib
import json
import sqlite3
from datetime import UTC, datetime, timedelta
from pathlib import Path

import requests
SCREENSHOT_API_KEY = "your-api-key"  # replace with your real API key (or load from env)
SCREENSHOT_API_URL = "https://hermesforge.dev/api/screenshot"
ARCHIVE_DIR = Path("./archive")  # directory where screenshot PNGs are written
DB_PATH = Path("./archive_index.db")  # SQLite index of all captures
def init_db():
    """Open the SQLite archive index, creating the schema if it doesn't exist.

    Returns an open connection; the caller is responsible for closing it.
    """
    connection = sqlite3.connect(DB_PATH)
    ddl_statements = (
        """
        CREATE TABLE IF NOT EXISTS snapshots (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            url TEXT NOT NULL,
            captured_at TEXT NOT NULL,
            file_path TEXT NOT NULL,
            content_hash TEXT NOT NULL,
            file_size_bytes INTEGER,
            changed_from_previous INTEGER DEFAULT 0
        )
        """,
        "CREATE INDEX IF NOT EXISTS idx_url ON snapshots(url)",
        "CREATE INDEX IF NOT EXISTS idx_captured_at ON snapshots(captured_at)",
    )
    for ddl in ddl_statements:
        connection.execute(ddl)
    connection.commit()
    return connection
def capture_snapshot(url: str, conn: sqlite3.Connection) -> dict:
    """Capture a full-page screenshot snapshot of a URL.

    Compares the screenshot's SHA-256 hash against the most recent snapshot
    of the same URL to flag content changes, stores the PNG under
    ARCHIVE_DIR, and records the capture in the SQLite index.

    Args:
        url: Page to snapshot.
        conn: Open connection to the archive index database.

    Returns:
        On success: {"url", "status": "ok", "file", "changed",
        "size_bytes", "captured_at"}.
        On failure: {"url", "status": "failed", "error"}.
    """
    try:
        response = requests.get(
            SCREENSHOT_API_URL,
            params={
                "url": url,
                "format": "png",
                "width": 1440,
                "height": 900,
                "wait": "networkidle",
                "full_page": "true",
            },
            headers={"X-API-Key": SCREENSHOT_API_KEY},
            timeout=45,
        )
    except requests.RequestException as exc:
        # Network failures (timeouts, DNS errors, connection resets) must not
        # crash a batch run that is meant to be resumable; report them the
        # same way as HTTP-level failures.
        return {
            "url": url,
            "status": "failed",
            "error": f"request error: {exc}",
        }

    if response.status_code != 200:
        return {
            "url": url,
            "status": "failed",
            "error": f"HTTP {response.status_code}",
        }

    content = response.content
    content_hash = hashlib.sha256(content).hexdigest()
    captured_at = datetime.now(UTC).isoformat()

    # Changed if this is the first capture of the URL, or the hash differs
    # from the most recent previous snapshot.
    previous = conn.execute(
        "SELECT content_hash FROM snapshots WHERE url = ? ORDER BY captured_at DESC LIMIT 1",
        (url,)
    ).fetchone()
    changed = previous is None or previous[0] != content_hash

    # Store the file under a deterministic name: short URL digest (md5 used
    # only as a filename slug, not for security) + timestamp with ':' and '.'
    # replaced so the name is safe on all filesystems.
    url_slug = hashlib.md5(url.encode()).hexdigest()[:12]
    timestamp = captured_at.replace(":", "-").replace(".", "-")[:19]
    filename = f"{url_slug}_{timestamp}.png"
    ARCHIVE_DIR.mkdir(parents=True, exist_ok=True)
    (ARCHIVE_DIR / filename).write_bytes(content)

    # Index the capture.
    conn.execute(
        """INSERT INTO snapshots (url, captured_at, file_path, content_hash, file_size_bytes, changed_from_previous)
           VALUES (?, ?, ?, ?, ?, ?)""",
        (url, captured_at, filename, content_hash, len(content), 1 if changed else 0)
    )
    conn.commit()

    return {
        "url": url,
        "status": "ok",
        "file": filename,
        "changed": changed,
        "size_bytes": len(content),
        "captured_at": captured_at,
    }
Batch Archiving with Rate Limit Handling
For large URL lists, respect rate limits and retry on 429:
import time
def archive_urls(
    urls: list[str],
    delay_seconds: float = 1.0,
    max_retries: int = 3,
) -> list[dict]:
    """Archive a list of URLs with rate limit handling.

    Retries captures that fail with HTTP 429 using exponential backoff
    (2, 4, 8... seconds), and appends every result to a JSONL progress log
    so an interrupted run can be resumed without re-capturing URLs that
    already succeeded.

    Args:
        urls: URLs to snapshot.
        delay_seconds: Pause between consecutive captures.
        max_retries: Extra attempts allowed after a 429 (rate limited) response.

    Returns:
        One result dict per URL actually processed in this run.
    """
    conn = init_db()
    results = []
    progress_log = Path("./archive_progress.jsonl")

    # Load already-processed URLs (for resume). Only successful captures
    # count as done; failed ones are retried on the next run.
    processed = set()
    if progress_log.exists():
        with open(progress_log) as f:
            for line in f:
                try:
                    entry = json.loads(line)
                    if entry.get("status") == "ok":
                        processed.add(entry["url"])
                except json.JSONDecodeError:
                    # Tolerate a truncated final line from a crashed run.
                    pass

    remaining = [u for u in urls if u not in processed]
    print(f"Archiving {len(remaining)} URLs ({len(processed)} already done)")

    for i, url in enumerate(remaining, 1):
        print(f"[{i}/{len(remaining)}] {url[:80]}...", end=" ", flush=True)

        # Capture, retrying on 429 with exponential backoff as promised
        # in the docstring; other failures are reported immediately.
        result = capture_snapshot(url, conn)
        for attempt in range(1, max_retries + 1):
            if not (result["status"] == "failed" and "429" in result.get("error", "")):
                break
            time.sleep(2 ** attempt)
            result = capture_snapshot(url, conn)

        results.append(result)

        # Append to progress log so progress survives interruption.
        with open(progress_log, "a") as f:
            f.write(json.dumps(result) + "\n")

        status_indicator = "✓" if result["status"] == "ok" else "✗"
        changed_indicator = " [CHANGED]" if result.get("changed") else ""
        print(f"{status_indicator}{changed_indicator}")

        # Basic rate limiting between consecutive requests.
        if i < len(remaining):
            time.sleep(delay_seconds)

    conn.close()
    return results
Querying the Archive
With a SQLite index, you can query the archive history:
def get_change_history(url: str) -> list[dict]:
    """Return every snapshot recorded for *url*, newest first.

    Each entry includes a flag indicating whether the content differed
    from the previous capture.
    """
    query = """SELECT captured_at, file_path, file_size_bytes, changed_from_previous
               FROM snapshots WHERE url = ?
               ORDER BY captured_at DESC"""
    conn = sqlite3.connect(DB_PATH)
    try:
        snapshots = conn.execute(query, (url,)).fetchall()
    finally:
        conn.close()

    history = []
    for captured_at, file_path, size_bytes, changed_flag in snapshots:
        history.append({
            "captured_at": captured_at,
            "file": file_path,
            "size_bytes": size_bytes,
            "changed": bool(changed_flag),
        })
    return history
def get_recent_changes(since_hours: int = 24) -> list[dict]:
    """Get all URLs whose content changed in the last N hours.

    Args:
        since_hours: Lookback window in hours.

    Returns:
        List of {"url", "captured_at", "file"} dicts, newest first.
    """
    conn = sqlite3.connect(DB_PATH)
    # captured_at is stored as a Python ISO-8601 string ("...T...+00:00"),
    # which does NOT match SQLite's datetime() output format
    # ("YYYY-MM-DD HH:MM:SS"), so comparing against datetime('now', ...)
    # is unreliable. Build the cutoff in Python in the same stored format
    # and let SQLite do a plain string comparison.
    cutoff = (datetime.now(UTC) - timedelta(hours=since_hours)).isoformat()
    rows = conn.execute(
        """SELECT url, captured_at, file_path
           FROM snapshots
           WHERE changed_from_previous = 1
             AND captured_at > ?
           ORDER BY captured_at DESC""",
        (cutoff,),
    ).fetchall()
    conn.close()
    return [{"url": r[0], "captured_at": r[1], "file": r[2]} for r in rows]
Adding Timestamps for Legal Evidence
For compliance use cases, the screenshot alone isn't enough — you need proof that the capture happened at the claimed time. Options:
1. Hash the screenshot and log it externally:
import hashlib
def generate_evidence_record(file_path: str, url: str, captured_at: str) -> dict:
    """Build an independently verifiable evidence record for a screenshot.

    Hashes the stored file and writes a `.evidence.json` sidecar next to it
    so the capture's integrity can be checked later.
    """
    screenshot = Path(file_path)
    payload = screenshot.read_bytes()
    record = {
        "url": url,
        "captured_at": captured_at,
        "sha256": hashlib.sha256(payload).hexdigest(),
        "file_size_bytes": len(payload),
    }
    # Persist the record alongside the screenshot itself.
    sidecar = screenshot.with_suffix(".evidence.json")
    sidecar.write_text(json.dumps(record, indent=2))
    return record
2. Include the timestamp in the screenshot itself:
Request that the page render with a visible timestamp — if you control the target page, add a ?capture=true param that renders a banner with the capture time. The timestamp becomes part of the visual record.
Storage Considerations
Screenshots are large. A full-page screenshot of a content-heavy page can be 500KB-2MB. Plan storage accordingly:
| Archives/day | Avg size | Storage/month |
|---|---|---|
| 100 | 500KB | ~1.5 GB |
| 1,000 | 500KB | ~15 GB |
| 10,000 | 500KB | ~150 GB |
For long-term archiving, note that PNG is already internally compressed (DEFLATE), so gzipping old screenshots typically saves only a few percent — converting to WebP (below) or moving to cold storage is usually the better lever. If you still want to gzip:
# Compress screenshots older than 30 days
find ./archive -name "*.png" -mtime +30 -exec gzip {} \;
Or use WebP format (request format=webp from the API) for ~50% size reduction at minimal quality loss for archiving purposes.
Running as a Daily Cron
#!/usr/bin/env python3
# daily_archive.py — run daily via cron
# Pages whose visual state we want a daily record of.
URLS_TO_ARCHIVE = [
    "https://competitor.com/pricing",
    "https://competitor.com/features",
    "https://your-site.com/terms",
    "https://your-site.com/privacy",
]

if __name__ == "__main__":
    snapshot_results = archive_urls(URLS_TO_ARCHIVE)
    changed_pages = [entry for entry in snapshot_results if entry.get("changed")]
    if not changed_pages:
        print("\nNo changes detected.")
    else:
        print(f"\n{len(changed_pages)} pages changed:")
        for entry in changed_pages:
            print(f" {entry['url']}")
Add to crontab:
0 8 * * * cd /path/to/project && python3 daily_archive.py >> /var/log/archive.log 2>&1
hermesforge.dev — screenshot API. Free: 10/day. Starter: $4/30 days (200/day). Pro: $9 (1000/day). Business: $29 (5000/day).