Content Archiving at Scale: Building a URL Snapshot Pipeline

2026-04-17 | Tags: [tutorial, screenshot-api, archiving, python, automation]

Web content changes constantly. A page that exists today may be gone tomorrow, restructured next week, or quietly edited without notice. Screenshot-based archiving captures not just the text but the rendered state — the layout, images, dynamic content, and anything a browser would show.

Here's how to build a production-grade URL snapshot pipeline.

Why Screenshots for Archiving?

HTML archiving (saving the raw markup) breaks constantly: external CSS, JS bundles, CDN-hosted images, and dynamic content mean a raw HTML archive often renders as a broken mess. Screenshots are self-contained: the renderer captures the final visual state. The tradeoff is file size and unsearchability — but for compliance, research, and visual change detection, the tradeoff is worth it.

Use cases:

- Compliance: financial advisors, lawyers, and other regulated professionals often need to retain evidence of what was communicated, and when
- Research: academic study of web content as it appeared at a specific point in time
- Competitive monitoring: tracking how a competitor's pricing or messaging evolves
- Content integrity: detecting unauthorized edits to published content
- Legal evidence: capturing what a page said at a specific date and time

The Core Pipeline

import hashlib
import json
import sqlite3
from datetime import UTC, datetime, timedelta
from pathlib import Path

import requests

SCREENSHOT_API_KEY = "your-api-key"
SCREENSHOT_API_URL = "https://hermesforge.dev/api/screenshot"
ARCHIVE_DIR = Path("./archive")
DB_PATH = Path("./archive_index.db")

def init_db():
    """Create (if needed) and return a connection to the SQLite archive index.

    Ensures the ``snapshots`` table and its lookup indexes exist before
    returning, so callers can insert and query immediately.
    """
    connection = sqlite3.connect(DB_PATH)
    connection.execute("""
        CREATE TABLE IF NOT EXISTS snapshots (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            url TEXT NOT NULL,
            captured_at TEXT NOT NULL,
            file_path TEXT NOT NULL,
            content_hash TEXT NOT NULL,
            file_size_bytes INTEGER,
            changed_from_previous INTEGER DEFAULT 0
        )
    """)
    # Indexes back the two query paths: per-URL history and time-window scans.
    for index_ddl in (
        "CREATE INDEX IF NOT EXISTS idx_url ON snapshots(url)",
        "CREATE INDEX IF NOT EXISTS idx_captured_at ON snapshots(captured_at)",
    ):
        connection.execute(index_ddl)
    connection.commit()
    return connection


def capture_snapshot(url: str, conn: sqlite3.Connection) -> dict:
    """
    Capture a screenshot snapshot of a URL.

    Fetches a full-page PNG from the screenshot API, stores it under
    ARCHIVE_DIR, and records it in the SQLite index. Compares the new
    image's SHA-256 against the most recent snapshot of the same URL to
    flag content changes.
    """
    api_response = requests.get(
        SCREENSHOT_API_URL,
        params={
            "url": url,
            "format": "png",
            "width": 1440,
            "height": 900,
            "wait": "networkidle",
            "full_page": "true",
        },
        headers={"X-API-Key": SCREENSHOT_API_KEY},
        timeout=45,
    )

    if api_response.status_code != 200:
        return {
            "url": url,
            "status": "failed",
            "error": f"HTTP {api_response.status_code}",
        }

    image_bytes = api_response.content
    digest = hashlib.sha256(image_bytes).hexdigest()
    captured_at = datetime.now(UTC).isoformat()

    # Compare against the most recent stored hash; first capture counts as changed.
    latest = conn.execute(
        "SELECT content_hash FROM snapshots WHERE url = ? ORDER BY captured_at DESC LIMIT 1",
        (url,)
    ).fetchone()
    changed = latest is None or latest[0] != digest

    # Filesystem-safe name: short URL digest plus a second-resolution timestamp.
    url_slug = hashlib.md5(url.encode()).hexdigest()[:12]
    timestamp = captured_at.replace(":", "-").replace(".", "-")[:19]
    filename = f"{url_slug}_{timestamp}.png"

    ARCHIVE_DIR.mkdir(parents=True, exist_ok=True)
    (ARCHIVE_DIR / filename).write_bytes(image_bytes)

    # Record the snapshot in the index.
    conn.execute(
        """INSERT INTO snapshots (url, captured_at, file_path, content_hash, file_size_bytes, changed_from_previous)
           VALUES (?, ?, ?, ?, ?, ?)""",
        (url, captured_at, filename, digest, len(image_bytes), 1 if changed else 0)
    )
    conn.commit()

    return {
        "url": url,
        "status": "ok",
        "file": filename,
        "changed": changed,
        "size_bytes": len(image_bytes),
        "captured_at": captured_at,
    }

Batch Archiving with Rate Limit Handling

For large URL lists, respect rate limits and retry on 429:

import time

def archive_urls(urls: list[str], delay_seconds: float = 1.0, max_retries: int = 3) -> list[dict]:
    """
    Archive a list of URLs with rate limit handling.

    Retries captures that fail with HTTP 429 (rate limited), backing off
    exponentially between attempts. Logs progress and results to a JSONL
    file so an interrupted run can resume without re-capturing URLs that
    already succeeded.

    Args:
        urls: URLs to snapshot.
        delay_seconds: Pause between consecutive captures; also the base
            of the exponential backoff used after a 429.
        max_retries: Extra attempts for a URL that hits the rate limit.

    Returns:
        One result dict per URL processed in this run.
    """
    conn = init_db()
    results = []
    progress_log = Path("./archive_progress.jsonl")

    # Load already-processed URLs (for resume); malformed lines are skipped.
    processed = set()
    if progress_log.exists():
        with open(progress_log) as f:
            for line in f:
                try:
                    entry = json.loads(line)
                    if entry.get("status") == "ok":
                        processed.add(entry["url"])
                except json.JSONDecodeError:
                    pass

    remaining = [u for u in urls if u not in processed]
    print(f"Archiving {len(remaining)} URLs ({len(processed)} already done)")

    for i, url in enumerate(remaining, 1):
        print(f"[{i}/{len(remaining)}] {url[:80]}...", end=" ", flush=True)

        result = capture_snapshot(url, conn)
        # Retry only rate-limited failures, with exponential backoff
        # (delay * 2, delay * 4, ...). Other failures are recorded as-is.
        attempt = 0
        while result.get("error") == "HTTP 429" and attempt < max_retries:
            attempt += 1
            time.sleep(delay_seconds * (2 ** attempt))
            result = capture_snapshot(url, conn)
        results.append(result)

        # Append to progress log immediately so a crash loses at most one URL.
        with open(progress_log, "a") as f:
            f.write(json.dumps(result) + "\n")

        status_indicator = "✓" if result["status"] == "ok" else "✗"
        changed_indicator = " [CHANGED]" if result.get("changed") else ""
        print(f"{status_indicator}{changed_indicator}")

        # Baseline pacing between captures (skip after the last one).
        if i < len(remaining):
            time.sleep(delay_seconds)

    conn.close()
    return results

Querying the Archive

With a SQLite index, you can query the archive history:

def get_change_history(url: str) -> list[dict]:
    """Get all snapshots for a URL, flagging when content changed."""
    conn = sqlite3.connect(DB_PATH)
    snapshot_rows = conn.execute(
        """SELECT captured_at, file_path, file_size_bytes, changed_from_previous
           FROM snapshots WHERE url = ?
           ORDER BY captured_at DESC""",
        (url,)
    ).fetchall()
    conn.close()

    # Unpack each row into a caller-friendly dict, newest first.
    history = []
    for captured_at, file_name, size_bytes, changed_flag in snapshot_rows:
        history.append({
            "captured_at": captured_at,
            "file": file_name,
            "size_bytes": size_bytes,
            "changed": bool(changed_flag),
        })
    return history


def get_recent_changes(since_hours: int = 24) -> list[dict]:
    """Get all URLs whose content changed in the last ``since_hours`` hours.

    Args:
        since_hours: Size of the look-back window, in hours.

    Returns:
        Dicts with ``url``, ``captured_at`` and ``file`` keys, newest first.
    """
    # Compute the cutoff in Python so both sides of the comparison use the
    # same ISO-8601 format. The previous version built a `cutoff` it never
    # used and compared the stored isoformat() strings (e.g.
    # "2026-04-17T08:00:00+00:00") against SQLite's datetime() output
    # ("2026-04-17 08:00:00"); 'T' sorts after ' ', so that string
    # comparison gave wrong results.
    cutoff = (datetime.now(UTC) - timedelta(hours=since_hours)).isoformat()
    conn = sqlite3.connect(DB_PATH)
    rows = conn.execute(
        """SELECT url, captured_at, file_path
           FROM snapshots
           WHERE changed_from_previous = 1
             AND captured_at > ?
           ORDER BY captured_at DESC""",
        (cutoff,)
    ).fetchall()
    conn.close()

    return [{"url": r[0], "captured_at": r[1], "file": r[2]} for r in rows]

Proving When a Capture Happened

For compliance use cases, the screenshot alone isn't enough — you need proof that the capture happened at the claimed time. Two options:

1. Hash the screenshot and log it externally:

import hashlib

def generate_evidence_record(file_path: str, url: str, captured_at: str) -> dict:
    """Generate an evidence record that can be independently verified."""
    snapshot = Path(file_path)
    payload = snapshot.read_bytes()

    record = {
        "url": url,
        "captured_at": captured_at,
        "sha256": hashlib.sha256(payload).hexdigest(),
        "file_size_bytes": len(payload),
    }

    # Persist the record next to the screenshot itself.
    snapshot.with_suffix(".evidence.json").write_text(json.dumps(record, indent=2))

    return record

2. Include the timestamp in the screenshot itself: Request that the page render with a visible timestamp — if you control the target page, add a ?capture=true param that renders a banner with the capture time. The timestamp becomes part of the visual record.

Storage Considerations

Screenshots are large. A full-page screenshot of a content-heavy page can be 500KB-2MB. Plan storage accordingly:

| Archives/day | Avg size | Storage/month |
|--------------|----------|---------------|
| 100          | 500 KB   | ~1.5 GB       |
| 1,000        | 500 KB   | ~15 GB        |
| 10,000       | 500 KB   | ~150 GB       |

For long-term archiving, compress older screenshots:

# Compress screenshots older than 30 days
find ./archive -name "*.png" -mtime +30 -exec gzip {} \;

Or use WebP format (request format=webp from the API) for ~50% size reduction at minimal quality loss for archiving purposes.

Running as a Daily Cron

#!/usr/bin/env python3
# daily_archive.py — run daily via cron

# Pages whose visual state we want a daily record of.
URLS_TO_ARCHIVE = [
    "https://competitor.com/pricing",
    "https://competitor.com/features",
    "https://your-site.com/terms",
    "https://your-site.com/privacy",
]

if __name__ == "__main__":
    run_results = archive_urls(URLS_TO_ARCHIVE)
    changed_pages = [entry for entry in run_results if entry.get("changed")]

    if not changed_pages:
        print("\nNo changes detected.")
    else:
        print(f"\n{len(changed_pages)} pages changed:")
        for entry in changed_pages:
            print(f"  {entry['url']}")

Add to crontab:

0 8 * * * cd /path/to/project && python3 daily_archive.py >> /var/log/archive.log 2>&1

hermesforge.dev — screenshot API. Free: 10/day. Starter: $4/30 days (200/day). Pro: $9 (1000/day). Business: $29 (5000/day).