Web Page Archiving for Compliance: GDPR, Legal Hold, and SOC 2 with the Screenshot API
Compliance teams and legal departments often need to prove that a webpage looked a specific way at a specific point in time: a privacy policy before a change, a consent notice before a campaign, a product page before a dispute. Screenshots paired with cryptographic hashes, timestamps, and chain-of-custody storage provide defensible evidence for courts and auditors without the cost of a dedicated archival service.
The Screenshot API makes this tractable: one HTTP call captures the visual state of any page. The compliance infrastructure around it — timestamping, signing, storage, metadata — can be built with standard Python libraries.
Core Archiving Pattern
Every compliance capture has four required properties:

1. Immutability: the capture must not be modifiable after creation
2. Timestamp: a verifiable record of when the capture was taken
3. Chain of custody: an unbroken audit trail from capture to storage
4. Retrievability: the capture must be findable by URL, date, and case ID
import requests
import hashlib
import json
import os
import time
from datetime import datetime, timezone
from pathlib import Path
import boto3
HERMES_API_KEY = os.environ["HERMES_API_KEY"]
ARCHIVE_BUCKET = os.environ["COMPLIANCE_BUCKET"]
s3 = boto3.client("s3")
def capture_and_archive(
url: str,
case_id: str,
reason: str,
full_page: bool = True,
wait_ms: int = 2000,
) -> dict:
"""
Capture a URL and archive it with compliance metadata.
Returns a manifest dict with capture details and storage path.
"""
captured_at = datetime.now(timezone.utc)
iso_ts = captured_at.isoformat()
# Capture
resp = requests.get(
"https://hermesforge.dev/api/screenshot",
headers={"X-API-Key": HERMES_API_KEY},
params={
"url": url,
"format": "png", # PNG for compliance (lossless)
"width": 1280,
"full_page": full_page,
"wait": wait_ms,
},
timeout=90,
)
resp.raise_for_status()
image_bytes = resp.content
# Cryptographic hash for integrity verification
sha256 = hashlib.sha256(image_bytes).hexdigest()
# Manifest — stored alongside the image
manifest = {
"url": url,
"case_id": case_id,
"reason": reason,
"captured_at": iso_ts,
"sha256": sha256,
"size_bytes": len(image_bytes),
"capture_params": {
"format": "png",
"width": 1280,
"full_page": full_page,
"wait_ms": wait_ms,
},
}
    # S3 key layout: compliance/case_id/date/url_hash/
    url_hash = hashlib.md5(url.encode()).hexdigest()[:12]  # short stable path key, not an integrity check
date_path = captured_at.strftime("%Y/%m/%d")
prefix = f"compliance/{case_id}/{date_path}/{url_hash}"
# Store image (immutable via Object Lock if enabled on bucket)
s3.put_object(
Bucket=ARCHIVE_BUCKET,
Key=f"{prefix}/screenshot.png",
Body=image_bytes,
ContentType="image/png",
Metadata={
"case-id": case_id,
"captured-at": iso_ts,
"source-url": url[:256], # S3 metadata max
"sha256": sha256,
"reason": reason,
},
)
# Store manifest as JSON
s3.put_object(
Bucket=ARCHIVE_BUCKET,
Key=f"{prefix}/manifest.json",
Body=json.dumps(manifest, indent=2).encode(),
ContentType="application/json",
)
manifest["s3_prefix"] = f"s3://{ARCHIVE_BUCKET}/{prefix}"
return manifest
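A quick usage sketch; the URL, case ID, and reason below are illustrative placeholders:

manifest = capture_and_archive(
    url="https://yourapp.com/privacy",
    case_id="review-2026-001",
    reason="Privacy policy review, baseline capture",
)
print(manifest["sha256"], manifest["s3_prefix"])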
GDPR: Right-to-Erasure Evidence
When a user submits a right-to-erasure request (the GDPR "right to be forgotten", often abbreviated RTBF), you must demonstrate that their data has been removed. Capturing the user's profile page before and after deletion provides visual proof:
def gdpr_rtbf_capture(user_id: str, profile_url: str) -> dict:
"""
Capture before-erasure evidence for a GDPR right-to-erasure request.
Call this BEFORE the deletion job runs.
"""
return capture_and_archive(
url=profile_url,
case_id=f"rtbf-{user_id}",
reason="GDPR Article 17 right-to-erasure — pre-deletion capture",
)
def gdpr_rtbf_verify(user_id: str, profile_url: str) -> dict:
"""
Capture post-erasure evidence. Call this AFTER the deletion job completes.
The capture should show a 404, empty profile, or 'account deleted' page.
"""
return capture_and_archive(
url=profile_url,
case_id=f"rtbf-{user_id}",
reason="GDPR Article 17 right-to-erasure — post-deletion verification",
)
# Example: hook into your deletion workflow
def process_erasure_request(user_id: str, profile_url: str):
# Pre-deletion capture
before = gdpr_rtbf_capture(user_id, profile_url)
print(f"Pre-deletion capture: {before['s3_prefix']}")
# Run your deletion logic here
# delete_user_data(user_id)
# Wait for propagation
time.sleep(10)
# Post-deletion capture
after = gdpr_rtbf_verify(user_id, profile_url)
print(f"Post-deletion capture: {after['s3_prefix']}")
return {"before": before, "after": after}
Legal Hold: Capturing External Pages for Disputes
Legal holds require preserving the exact state of external web content that may be relevant to litigation — competitor pricing pages, third-party claims, public social media posts:
from dataclasses import dataclass
@dataclass
class LegalHoldItem:
url: str
description: str
matter_id: str
def execute_legal_hold(items: list[LegalHoldItem]) -> list[dict]:
"""
Capture and archive a set of URLs for a legal hold.
Each capture gets a matter-scoped case ID.
"""
results = []
for item in items:
try:
manifest = capture_and_archive(
url=item.url,
case_id=f"legal-{item.matter_id}",
reason=item.description,
wait_ms=3000, # Extra wait for dynamic content
)
results.append({"status": "captured", **manifest})
print(f"Captured: {item.url}")
except Exception as e:
results.append({
"status": "failed",
"url": item.url,
"error": str(e),
"case_id": f"legal-{item.matter_id}",
"captured_at": datetime.now(timezone.utc).isoformat(),
})
print(f"Failed: {item.url} — {e}")
time.sleep(2) # Polite interval
return results
# Example: capturing competitor pricing pages for a dispute
hold_items = [
LegalHoldItem(
url="https://competitor.com/pricing",
description="Competitor pricing page — pricing dispute 2026-06",
matter_id="2026-DISP-0042",
),
LegalHoldItem(
url="https://competitor.com/product/widget",
description="Competitor product specification — feature claims",
matter_id="2026-DISP-0042",
),
]
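Running the hold returns one record per URL, including failures, so gaps in the evidence set are themselves documented. A minimal sketch using the items above:

results = execute_legal_hold(hold_items)
failed = [r for r in results if r["status"] == "failed"]
if failed:
    print(f"{len(failed)} capture(s) failed; retry or document before closing the hold")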
SOC 2: Periodic Policy Capture
SOC 2 audits require evidence that your privacy policy, terms of service, and security notices are current and accurately represent your practices. Capture these pages on a fixed schedule:
import schedule
import time as time_module
SOC2_PAGES = [
{"url": "https://yourapp.com/privacy", "label": "privacy-policy"},
{"url": "https://yourapp.com/terms", "label": "terms-of-service"},
{"url": "https://yourapp.com/security", "label": "security-notice"},
{"url": "https://yourapp.com/cookies", "label": "cookie-policy"},
{"url": "https://yourapp.com/dpa", "label": "data-processing-agreement"},
]
def monthly_soc2_capture():
"""Capture all policy pages for SOC 2 audit trail."""
audit_id = f"soc2-{datetime.now(timezone.utc).strftime('%Y-%m')}"
results = []
for page in SOC2_PAGES:
try:
manifest = capture_and_archive(
url=page["url"],
case_id=audit_id,
reason=f"SOC 2 monthly policy capture — {page['label']}",
)
results.append({"label": page["label"], "status": "ok", **manifest})
except Exception as e:
results.append({"label": page["label"], "status": "error", "error": str(e)})
# Write capture index
index = {
"audit_id": audit_id,
"captured_at": datetime.now(timezone.utc).isoformat(),
"pages": results,
}
s3.put_object(
Bucket=ARCHIVE_BUCKET,
Key=f"compliance/{audit_id}/index.json",
Body=json.dumps(index, indent=2).encode(),
ContentType="application/json",
)
print(f"SOC 2 capture complete: {audit_id} — {len(results)} pages")
return index
# The schedule library has no monthly interval, so check daily and fire only
# on the 1st of the month. Note that schedule uses the host's local clock,
# so run this process with the system clock set to UTC.
def run_monthly_soc2_capture():
    if datetime.now(timezone.utc).day == 1:
        monthly_soc2_capture()

schedule.every().day.at("00:00").do(run_monthly_soc2_capture)

if __name__ == "__main__":
    while True:
        schedule.run_pending()
        time_module.sleep(60)
Regulatory: Capturing Public Disclosures
For financial services, healthcare, and other regulated industries, regulators may require evidence that required disclosures were visible on specific dates:
def capture_regulatory_disclosure(
disclosure_url: str,
regulation: str,
effective_date: str,
filing_reference: str,
) -> dict:
"""
Capture a required regulatory disclosure with filing metadata.
"""
return capture_and_archive(
url=disclosure_url,
case_id=f"reg-{filing_reference}",
reason=(
f"Regulatory disclosure capture — {regulation} "
f"(effective {effective_date}, filing {filing_reference})"
),
wait_ms=2500,
)
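For example, with placeholder regulation and filing values:

manifest = capture_regulatory_disclosure(
    disclosure_url="https://yourapp.com/disclosures/fees",
    regulation="MiFID II Art. 24",
    effective_date="2026-01-01",
    filing_reference="2026-FIL-0007",
)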
Verifying Archive Integrity
Before presenting captures to auditors or courts, verify the SHA-256 hash against the stored manifest:
def verify_capture(s3_prefix: str) -> dict:
"""
Retrieve a compliance capture and verify its integrity.
Returns verification result with original and computed hashes.
"""
bucket = ARCHIVE_BUCKET
prefix = s3_prefix.replace(f"s3://{bucket}/", "")
# Fetch manifest
manifest_obj = s3.get_object(Bucket=bucket, Key=f"{prefix}/manifest.json")
manifest = json.loads(manifest_obj["Body"].read())
# Fetch image
image_obj = s3.get_object(Bucket=bucket, Key=f"{prefix}/screenshot.png")
image_bytes = image_obj["Body"].read()
# Verify hash
computed_sha256 = hashlib.sha256(image_bytes).hexdigest()
original_sha256 = manifest["sha256"]
integrity_ok = computed_sha256 == original_sha256
return {
"integrity_ok": integrity_ok,
"original_sha256": original_sha256,
"computed_sha256": computed_sha256,
"captured_at": manifest["captured_at"],
"url": manifest["url"],
"case_id": manifest["case_id"],
"size_bytes": manifest["size_bytes"],
}
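Before an audit, you will usually want to verify an entire case rather than a single capture. A sketch that walks the prefix layout produced by capture_and_archive above:

def verify_case(case_id: str) -> list[dict]:
    """Verify every capture stored under a case prefix."""
    paginator = s3.get_paginator("list_objects_v2")
    results = []
    for page in paginator.paginate(
        Bucket=ARCHIVE_BUCKET, Prefix=f"compliance/{case_id}/"
    ):
        for obj in page.get("Contents", []):
            if obj["Key"].endswith("/manifest.json"):
                prefix = obj["Key"].rsplit("/", 1)[0]
                results.append(verify_capture(f"s3://{ARCHIVE_BUCKET}/{prefix}"))
    return results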
Storage Configuration for Compliance
Enable S3 Object Lock to prevent modification or deletion of archived captures:
# Configure bucket with Object Lock (do this at bucket creation)
# aws s3api create-bucket --bucket YOUR_COMPLIANCE_BUCKET \
# --object-lock-enabled-for-bucket \
# --region us-east-1
# Apply a default retention policy (COMPLIANCE mode = even bucket owner cannot delete)
s3.put_object_lock_configuration(
Bucket=ARCHIVE_BUCKET,
ObjectLockConfiguration={
"ObjectLockEnabled": "Enabled",
"Rule": {
"DefaultRetention": {
"Mode": "COMPLIANCE",
"Years": 7, # Adjust per your regulatory requirement
}
},
},
)
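When a specific object needs retention different from the bucket default, the same boto3 put_object call accepts per-object Object Lock parameters (the bucket must still have Object Lock enabled). The key and body below are hypothetical:

from datetime import timedelta

s3.put_object(
    Bucket=ARCHIVE_BUCKET,
    Key="compliance/legal-2026-DISP-0042/hold-notice.json",  # hypothetical key
    Body=json.dumps({"matter_id": "2026-DISP-0042"}).encode(),
    ContentType="application/json",
    ObjectLockMode="COMPLIANCE",
    ObjectLockRetainUntilDate=datetime.now(timezone.utc) + timedelta(days=7 * 365),
    ObjectLockLegalHoldStatus="ON",  # independent of retention; blocks deletion until released
)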
Compliance Use Case Reference
| Use case | Regulation | Frequency | Retention |
|---|---|---|---|
| Privacy policy archive | GDPR Art. 13/14 | Monthly + on change | 5 years |
| Right-to-erasure evidence | GDPR Art. 17 | Per request | 3 years |
| Cookie consent notice | ePrivacy Directive | Monthly + on change | 5 years |
| Legal hold | Civil litigation | Per matter | Duration of matter + 5 years |
| Pricing page snapshot | Consumer protection | Monthly | 3 years |
| Financial disclosure | SEC/FCA/MiFID II | Per filing date | 7 years |
| Security notice | SOC 2 / ISO 27001 | Monthly | 7 years |
Free API key at hermesforge.dev. 50 captures/day, no credit card required.