Web Page Archiving for Compliance: GDPR, Legal Hold, and SOC 2 with the Screenshot API

2026-05-11 | Tags: [use-case, compliance, gdpr, legal-hold, soc2, archiving, screenshot-api]

Compliance teams and legal departments often need to prove that a webpage looked a specific way at a specific point in time: a privacy policy before a change, a consent notice before a campaign, a product page before a dispute. Screenshots with cryptographic timestamps and chain-of-custody storage provide defensible evidence that courts and auditors can evaluate — without the cost of dedicated archival services. (Whether evidence is ultimately admissible depends on jurisdiction and how custody is documented.)

The Screenshot API makes this tractable: one HTTP call captures the visual state of any page. The compliance infrastructure around it — timestamping, signing, storage, metadata — can be built with standard Python libraries.

Core Archiving Pattern

Every compliance capture has four required properties:

1. Immutability — the capture must not be modifiable after creation
2. Timestamp — a verifiable record of when the capture was taken
3. Chain of custody — an unbroken audit trail from capture to storage
4. Retrievability — the capture must be findable by URL, date, and case ID

import requests
import hashlib
import json
import os
import time
from datetime import datetime, timezone
from pathlib import Path

import boto3

# Screenshot API key — required; raises KeyError at import if unset.
HERMES_API_KEY = os.environ["HERMES_API_KEY"]
# Target S3 bucket for compliance archives (ideally created with Object Lock).
ARCHIVE_BUCKET = os.environ["COMPLIANCE_BUCKET"]
# Shared S3 client; credentials come from the default boto3 credential chain.
s3 = boto3.client("s3")


def capture_and_archive(
    url: str,
    case_id: str,
    reason: str,
    full_page: bool = True,
    wait_ms: int = 2000,
) -> dict:
    """
    Capture a URL via the Screenshot API and archive it to S3 with
    compliance metadata (timestamp, SHA-256, capture parameters).

    Args:
        url: Page to capture.
        case_id: Compliance case identifier; scopes the S3 key prefix.
        reason: Human-readable justification stored in the manifest.
        full_page: Capture the full scroll height rather than the viewport.
        wait_ms: Milliseconds to wait before capture (dynamic content).

    Returns:
        A manifest dict with capture details and the ``s3_prefix`` storage path.

    Raises:
        requests.HTTPError: If the Screenshot API returns an error status.
        botocore.exceptions.ClientError: If either S3 upload fails.
    """
    captured_at = datetime.now(timezone.utc)
    iso_ts = captured_at.isoformat()

    # Capture. The boolean is lowered explicitly: requests serializes Python
    # bools as "True"/"False" in query strings, which most HTTP APIs do not
    # recognize — they expect "true"/"false".
    resp = requests.get(
        "https://hermesforge.dev/api/screenshot",
        headers={"X-API-Key": HERMES_API_KEY},
        params={
            "url":       url,
            "format":    "png",   # PNG for compliance (lossless)
            "width":     1280,
            "full_page": str(full_page).lower(),
            "wait":      wait_ms,
        },
        timeout=90,
    )
    resp.raise_for_status()
    image_bytes = resp.content

    # Cryptographic hash for integrity verification
    sha256 = hashlib.sha256(image_bytes).hexdigest()

    # Manifest — stored alongside the image so each capture is self-describing.
    manifest = {
        "url":          url,
        "case_id":      case_id,
        "reason":       reason,
        "captured_at":  iso_ts,
        "sha256":       sha256,
        "size_bytes":   len(image_bytes),
        "capture_params": {
            "format":    "png",
            "width":     1280,
            "full_page": full_page,
            "wait_ms":   wait_ms,
        },
    }

    # S3 key layout: compliance/<case_id>/<YYYY/MM/DD>/<url_hash>/
    # MD5 is used only to derive a short, stable key component — never for
    # integrity (that is the SHA-256 above).
    url_hash = hashlib.md5(url.encode()).hexdigest()[:12]
    date_path = captured_at.strftime("%Y/%m/%d")
    prefix    = f"compliance/{case_id}/{date_path}/{url_hash}"

    # Store image (immutable via Object Lock if enabled on bucket)
    s3.put_object(
        Bucket=ARCHIVE_BUCKET,
        Key=f"{prefix}/screenshot.png",
        Body=image_bytes,
        ContentType="image/png",
        Metadata={
            "case-id":      case_id,
            "captured-at":  iso_ts,
            "source-url":   url[:256],  # S3 user metadata is size-limited
            "sha256":       sha256,
            "reason":       reason,
        },
    )

    # Store manifest as JSON next to the image
    s3.put_object(
        Bucket=ARCHIVE_BUCKET,
        Key=f"{prefix}/manifest.json",
        Body=json.dumps(manifest, indent=2).encode(),
        ContentType="application/json",
    )

    manifest["s3_prefix"] = f"s3://{ARCHIVE_BUCKET}/{prefix}"
    return manifest

GDPR: Right-to-Erasure Evidence

When a user submits a right-to-erasure (RTBF) request, you must demonstrate that their data has been removed. Capturing the user's profile page before and after deletion provides visual proof:

def gdpr_rtbf_capture(user_id: str, profile_url: str) -> dict:
    """
    Archive pre-erasure evidence for a GDPR right-to-erasure request.

    Invoke BEFORE the deletion job runs so the capture documents the
    profile as it existed prior to removal.
    """
    rtbf_case = f"rtbf-{user_id}"
    return capture_and_archive(
        url=profile_url,
        case_id=rtbf_case,
        reason="GDPR Article 17 right-to-erasure — pre-deletion capture",
    )


def gdpr_rtbf_verify(user_id: str, profile_url: str) -> dict:
    """
    Archive post-erasure evidence for a GDPR right-to-erasure request.

    Invoke AFTER the deletion job completes; the capture should show a
    404, an empty profile, or an 'account deleted' page.
    """
    rtbf_case = f"rtbf-{user_id}"
    return capture_and_archive(
        url=profile_url,
        case_id=rtbf_case,
        reason="GDPR Article 17 right-to-erasure — post-deletion verification",
    )


# Example: hook into your deletion workflow
def process_erasure_request(user_id: str, profile_url: str):
    """Wrap a user-data deletion with before/after evidence captures."""
    evidence = {}

    # Document the profile as it exists prior to deletion.
    evidence["before"] = gdpr_rtbf_capture(user_id, profile_url)
    print(f"Pre-deletion capture: {evidence['before']['s3_prefix']}")

    # Run your deletion logic here
    # delete_user_data(user_id)

    # Give caches and downstream systems time to propagate the deletion.
    time.sleep(10)

    # Document the post-deletion state.
    evidence["after"] = gdpr_rtbf_verify(user_id, profile_url)
    print(f"Post-deletion capture: {evidence['after']['s3_prefix']}")

    return evidence

Legal Hold: Preserving External Web Content

Legal holds require preserving the exact state of external web content that may be relevant to litigation — competitor pricing pages, third-party claims, public social media posts:

from dataclasses import dataclass


@dataclass
class LegalHoldItem:
    """A single URL to preserve under a legal hold."""

    # Public URL to capture.
    url: str
    # Why this page is relevant to the matter; stored as the capture reason.
    description: str
    # Litigation matter identifier; scopes the archive case ID.
    matter_id: str


def execute_legal_hold(items: list[LegalHoldItem]) -> list[dict]:
    """
    Capture and archive every URL in a legal hold.

    Each capture is scoped to a matter-based case ID. Failures are
    recorded in the result list rather than raised, so one bad URL does
    not abort the rest of the hold.
    """
    outcomes: list[dict] = []
    for hold_item in items:
        matter_case = f"legal-{hold_item.matter_id}"
        try:
            record = capture_and_archive(
                url=hold_item.url,
                case_id=matter_case,
                reason=hold_item.description,
                wait_ms=3000,  # Extra wait for dynamic content
            )
        except Exception as exc:
            outcomes.append({
                "status":      "failed",
                "url":         hold_item.url,
                "error":       str(exc),
                "case_id":     matter_case,
                "captured_at": datetime.now(timezone.utc).isoformat(),
            })
            print(f"Failed: {hold_item.url} — {exc}")
        else:
            outcomes.append({"status": "captured", **record})
            print(f"Captured: {hold_item.url}")
        time.sleep(2)  # Polite interval

    return outcomes


# Example: capturing competitor pricing pages for a dispute
# Both items share one matter ID, so they archive under the same case prefix.
hold_items = [
    LegalHoldItem(
        url="https://competitor.com/pricing",
        description="Competitor pricing page — pricing dispute 2026-06",
        matter_id="2026-DISP-0042",
    ),
    LegalHoldItem(
        url="https://competitor.com/product/widget",
        description="Competitor product specification — feature claims",
        matter_id="2026-DISP-0042",
    ),
]

SOC 2: Periodic Policy Capture

SOC 2 audits require evidence that your privacy policy, terms of service, and security notices are current and accurately represent your practices. Capture these pages on a fixed schedule:

import schedule
import time as time_module

# Policy pages captured every month for the SOC 2 audit trail.
# `label` appears in the capture reason and in the monthly index entries.
SOC2_PAGES = [
    {"url": "https://yourapp.com/privacy",   "label": "privacy-policy"},
    {"url": "https://yourapp.com/terms",     "label": "terms-of-service"},
    {"url": "https://yourapp.com/security",  "label": "security-notice"},
    {"url": "https://yourapp.com/cookies",   "label": "cookie-policy"},
    {"url": "https://yourapp.com/dpa",       "label": "data-processing-agreement"},
]


def monthly_soc2_capture():
    """Capture all policy pages for SOC 2 audit trail."""
    # One audit ID per calendar month groups all captures under one prefix.
    audit_id = f"soc2-{datetime.now(timezone.utc).strftime('%Y-%m')}"

    page_results = []
    for entry in SOC2_PAGES:
        label = entry["label"]
        try:
            record = capture_and_archive(
                url=entry["url"],
                case_id=audit_id,
                reason=f"SOC 2 monthly policy capture — {label}",
            )
        except Exception as exc:
            page_results.append({"label": label, "status": "error", "error": str(exc)})
        else:
            page_results.append({"label": label, "status": "ok", **record})

    # Write a capture index summarizing the whole run.
    index = {
        "audit_id":   audit_id,
        "captured_at": datetime.now(timezone.utc).isoformat(),
        "pages":      page_results,
    }
    s3.put_object(
        Bucket=ARCHIVE_BUCKET,
        Key=f"compliance/{audit_id}/index.json",
        Body=json.dumps(index, indent=2).encode(),
        ContentType="application/json",
    )
    print(f"SOC 2 capture complete: {audit_id} — {len(page_results)} pages")
    return index


# Run on the 1st of each month at 00:00.
# The `schedule` library has no month-based interval (`every().month` raises
# AttributeError), so schedule a daily job and gate it on the calendar day.
# NOTE(review): schedule's .at() uses local time; the day check below uses
# UTC — run this process with TZ=UTC if exact UTC timing matters.
def _soc2_capture_if_first_of_month():
    """Invoke the monthly SOC 2 capture only on the 1st of the month."""
    if datetime.now(timezone.utc).day == 1:
        monthly_soc2_capture()


schedule.every().day.at("00:00").do(_soc2_capture_if_first_of_month)


if __name__ == "__main__":
    while True:
        # Fixed NameError: the scheduler was imported as `schedule`,
        # not `schedule_module`.
        schedule.run_pending()
        time_module.sleep(60)

Regulatory: Capturing Public Disclosures

For financial services, healthcare, and other regulated industries, regulators may require evidence that required disclosures were visible on specific dates:

def capture_regulatory_disclosure(
    disclosure_url: str,
    regulation: str,
    effective_date: str,
    filing_reference: str,
) -> dict:
    """
    Archive a mandated public disclosure, tagged with the regulation and
    filing reference so auditors can match each capture to its filing.
    """
    reason_text = (
        f"Regulatory disclosure capture — {regulation} "
        f"(effective {effective_date}, filing {filing_reference})"
    )
    return capture_and_archive(
        url=disclosure_url,
        case_id=f"reg-{filing_reference}",
        reason=reason_text,
        wait_ms=2500,
    )

Verifying Archive Integrity

Before presenting captures to auditors or courts, verify the SHA-256 hash against the stored manifest:

def verify_capture(s3_prefix: str) -> dict:
    """
    Retrieve a compliance capture and verify its integrity.

    Args:
        s3_prefix: The ``s3://bucket/prefix`` path returned in the
            capture manifest.

    Returns:
        Verification result with the original and recomputed SHA-256
        hashes plus key manifest fields.
    """
    bucket = ARCHIVE_BUCKET
    # removeprefix, not replace: replace() would also substitute the
    # pattern anywhere later in the string, not just strip the leading
    # "s3://bucket/" scheme.
    prefix = s3_prefix.removeprefix(f"s3://{bucket}/")

    # Fetch manifest
    manifest_obj = s3.get_object(Bucket=bucket, Key=f"{prefix}/manifest.json")
    manifest     = json.loads(manifest_obj["Body"].read())

    # Fetch image
    image_obj   = s3.get_object(Bucket=bucket, Key=f"{prefix}/screenshot.png")
    image_bytes = image_obj["Body"].read()

    # Recompute the hash and compare against the value recorded at capture.
    computed_sha256 = hashlib.sha256(image_bytes).hexdigest()
    original_sha256 = manifest["sha256"]

    return {
        "integrity_ok":      computed_sha256 == original_sha256,
        "original_sha256":   original_sha256,
        "computed_sha256":   computed_sha256,
        "captured_at":       manifest["captured_at"],
        "url":               manifest["url"],
        "case_id":           manifest["case_id"],
        "size_bytes":        manifest["size_bytes"],
    }

Storage Configuration for Compliance

Enable S3 Object Lock to prevent modification or deletion of archived captures:

# Configure bucket with Object Lock (do this at bucket creation)
# aws s3api create-bucket --bucket YOUR_COMPLIANCE_BUCKET \
#   --object-lock-enabled-for-bucket \
#   --region us-east-1

# Apply a default retention policy (COMPLIANCE mode = even bucket owner cannot delete)
# One-time setup: apply a default retention policy to the bucket.
# COMPLIANCE mode means no principal — not even the bucket owner or the
# account root — can delete or overwrite locked object versions until the
# retention period expires.
s3.put_object_lock_configuration(
    Bucket=ARCHIVE_BUCKET,
    ObjectLockConfiguration={
        "ObjectLockEnabled": "Enabled",
        "Rule": {
            "DefaultRetention": {
                "Mode":  "COMPLIANCE",
                "Years": 7,  # Adjust per your regulatory requirement
            }
        },
    },
)

Compliance Use Case Reference

| Use case | Regulation | Frequency | Retention |
| --- | --- | --- | --- |
| Privacy policy archive | GDPR Art. 13/14 | Monthly + on change | 5 years |
| Right-to-erasure evidence | GDPR Art. 17 | Per request | 3 years |
| Cookie consent notice | ePrivacy Directive | Monthly + on change | 5 years |
| Legal hold | Civil litigation | Per matter | Duration of matter + 5 years |
| Pricing page snapshot | Consumer protection | Monthly | 3 years |
| Financial disclosure | SEC/FCA/MiFID II | Per filing date | 7 years |
| Security notice | SOC 2 / ISO 27001 | Monthly | 7 years |

Free API key at hermesforge.dev. 50 captures/day, no credit card required.