How to Build a Web Archiving and Page Snapshot System with a Screenshot API
Web archiving is the practice of capturing and preserving the state of web pages over time. The Wayback Machine does this at internet scale; you can do it at project scale with a screenshot API and a simple storage structure.
Visual archives complement HTML archives: they capture how a page actually rendered — fonts, images, layout — rather than just the raw markup. For compliance, research, and competitive intelligence, what the page looked like is often what matters.
Core Pattern: Timestamped Snapshot Storage
The fundamental structure is simple: a directory tree organized by date, with one screenshot per URL per capture run.
import hashlib
import os
import requests
from datetime import datetime, timezone
def snapshot_page(url: str, archive_dir: str, api_key: str) -> dict:
    """Capture a timestamped screenshot of a URL and store it in the archive.

    Args:
        url: Page to capture.
        archive_dir: Root directory of the archive tree.
        api_key: Hermesforge screenshot API key.

    Returns:
        On success, a dict with status "ok" plus url, timestamp, path, and
        size_bytes; on a non-200 response, status "failed" plus http_status.
    """
    # Read the clock exactly once so the filename timestamp and the date
    # directory can never disagree (two separate now() calls could straddle
    # a midnight boundary and file the snapshot under the wrong day).
    now = datetime.now(timezone.utc)
    timestamp = now.strftime("%Y%m%dT%H%M%SZ")
    # Short stable hash keeps filenames unique per URL without embedding
    # the (possibly very long) URL itself in the path.
    url_hash = hashlib.sha256(url.encode()).hexdigest()[:12]
    # Directory layout: archive/2026/05/22/
    output_dir = os.path.join(archive_dir, now.strftime("%Y/%m/%d"))
    os.makedirs(output_dir, exist_ok=True)
    filename = f"{timestamp}-{url_hash}.png"
    output_path = os.path.join(output_dir, filename)
    resp = requests.get(
        "https://hermesforge.dev/api/screenshot",
        params={
            "url": url,
            "width": 1440,
            "format": "png",
            "full_page": "true",
            "wait_for": "networkidle",  # let the page finish loading before capture
            "key": api_key
        },
        timeout=45
    )
    if resp.status_code == 200:
        with open(output_path, "wb") as f:
            f.write(resp.content)
        return {
            "url": url,
            "timestamp": timestamp,
            "path": output_path,
            "size_bytes": len(resp.content),
            "status": "ok"
        }
    return {
        "url": url,
        "timestamp": timestamp,
        "status": "failed",
        "http_status": resp.status_code
    }
Building an Archive Manifest
Track every capture in a manifest file for later retrieval:
import json
import os
# Append-only JSONL manifest: one JSON object per line, one line per capture.
MANIFEST_FILE = "archive/manifest.jsonl"


def append_to_manifest(record: dict):
    """Append a snapshot record to the JSONL manifest."""
    os.makedirs(os.path.dirname(MANIFEST_FILE), exist_ok=True)
    with open(MANIFEST_FILE, "a", encoding="utf-8") as f:
        f.write(json.dumps(record) + "\n")


def query_manifest(url: str = None, date: str = None) -> list[dict]:
    """Query the manifest for snapshots matching a URL and/or date prefix.

    Args:
        url: Exact URL to match; None matches any URL.
        date: Timestamp prefix (e.g. "20260522"); None matches any date.

    Returns:
        Matching records in append (chronological) order; [] when no
        manifest file exists yet.
    """
    if not os.path.exists(MANIFEST_FILE):
        return []
    results = []
    with open(MANIFEST_FILE, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (trailing newline, interrupted write)
                # instead of crashing with json.JSONDecodeError.
                continue
            record = json.loads(line)
            if url and record.get("url") != url:
                continue
            if date and not record.get("timestamp", "").startswith(date):
                continue
            results.append(record)
    return results
Combined usage:
import time
# Pages captured on every archive run.
URLS_TO_ARCHIVE = [
    "https://example.com/terms",
    "https://competitor.com/pricing",
    "https://news-site.com/article/12345",
]

for page_url in URLS_TO_ARCHIVE:
    result = snapshot_page(page_url, "archive", "YOUR_API_KEY")
    append_to_manifest(result)
    print(f"{result['status']}: {page_url}")
    time.sleep(1)  # pause between captures to stay polite to the API

# Later: retrieve all snapshots of a specific URL
history = query_manifest(url="https://competitor.com/pricing")
print(f"Found {len(history)} snapshots of competitor pricing page")
Use Case 1: Regulatory Compliance
For industries with record-keeping requirements — finance, legal, healthcare — archiving the state of disclosures, terms of service, and policy pages demonstrates compliance at a point in time.
# Pages whose state must be provable at a point in time for record-keeping.
COMPLIANCE_PAGES = [
    {"url": "https://yoursite.com/terms", "label": "terms-of-service"},
    {"url": "https://yoursite.com/privacy", "label": "privacy-policy"},
    {"url": "https://yoursite.com/disclosures", "label": "disclosures"},
]


def run_compliance_archive(pages: list, api_key: str):
    """Snapshot every compliance page into its own labelled archive tree."""
    for entry in pages:
        label = entry["label"]
        result = snapshot_page(entry["url"], f"compliance-archive/{label}", api_key)
        # Tag the manifest record so compliance captures are easy to query.
        append_to_manifest({**result, "label": label, "category": "compliance"})
        print(f"Archived {label}: {result['status']}")
        time.sleep(2)  # space out API calls
Run weekly. The timestamped screenshots provide an audit trail: if a terms-of-service page changed on a particular date, the archive shows what it contained before and after.
Use Case 2: Competitive Intelligence Timeline
Track competitor pricing, feature pages, and positioning over time:
COMPETITOR_WATCH = [
    "https://competitor-a.com/pricing",
    "https://competitor-a.com/features",
    "https://competitor-b.com/pricing",
]

# Run daily — the manifest tracks the full timeline.
# Fix: the original snippet referenced an undefined `api_key` name; use the
# "YOUR_API_KEY" placeholder literal, consistent with the other examples.
for url in COMPETITOR_WATCH:
    record = snapshot_page(url, "competitor-archive", "YOUR_API_KEY")
    append_to_manifest({**record, "category": "competitor"})
    time.sleep(2)

# Query the price page history for competitor-a
pricing_history = query_manifest(url="https://competitor-a.com/pricing")
print(f"Competitor A pricing page: {len(pricing_history)} captures")
for entry in pricing_history[-5:]:  # Show last 5
    print(f"  {entry['timestamp']}: {entry['path']}")
Use Case 3: Research Snapshot
For academic research, journalism, or fact-checking, capture a page at the moment it is relevant, so the archived version can be cited even if the original changes or disappears:
def archive_for_citation(url: str, notes: str, api_key: str) -> str:
    """Snapshot a page for citation purposes, returning the archive path.

    The manifest record is enriched with free-form notes and a
    citation_date mirroring the capture timestamp, so the snapshot can be
    cited even if the original page later changes or disappears.
    """
    entry = snapshot_page(url, "research-archive", api_key)
    entry["notes"] = notes
    entry["citation_date"] = entry.get("timestamp", "")
    append_to_manifest(entry)
    return entry.get("path", "")
# Example: archive a source article and keep the path for the citation.
citation_path = archive_for_citation(
    "https://news-site.com/article/breaking-story",
    notes="Primary source for Q1 2026 analysis",
    api_key="YOUR_API_KEY",
)
print(f"Archived to: {citation_path}")
Scheduling with Cron
For continuous archiving, run a script on a schedule. With a cron entry like:
0 8 * * * python3 /path/to/archive_runner.py >> /path/to/archive.log 2>&1
The system captures a daily snapshot of every page in the watch list, appends to the manifest, and logs results — entirely hands-off.
Storage Considerations
| Capture frequency | Pages | Daily storage estimate |
|---|---|---|
| Daily | 10 pages | ~5–15 MB/day (full-page PNG) |
| Daily | 10 pages | ~2–5 MB/day (WebP format) |
| Hourly | 5 pages | ~60–150 MB/day (PNG) |
| Weekly | 100 pages | ~50–150 MB/week |
Use format=webp for archives where storage matters — WebP is typically 40–60% smaller than PNG with no visible quality loss for screenshot purposes.
Retrieving a Snapshot by Date
def get_snapshot_on_date(url: str, date: str) -> dict | None:
    """Return the first snapshot of *url* captured on *date* (YYYY-MM-DD).

    When no capture exists on that exact day, falls back to the most
    recent snapshot taken before the date; returns None if the archive
    holds nothing at or before the date.
    """
    compact = date.replace("-", "")  # manifest timestamps are YYYYMMDDT...
    snapshots = query_manifest(url=url)
    same_day = [s for s in snapshots if s.get("timestamp", "").startswith(compact)]
    if same_day:
        return same_day[0]
    # No capture that day: sort the history and keep the latest entry
    # whose timestamp precedes the requested date.
    snapshots.sort(key=lambda s: s.get("timestamp", ""))
    earlier = [s for s in snapshots if s.get("timestamp", "") <= compact]
    return earlier[-1] if earlier else None
The Hermesforge Screenshot API captures full-page screenshots of any public URL. Get an API key — 50/day free, no credit card required.