Web archiving methodology

Name: web-archiving
Rating: 5 (278 reviews)
Author: jamditis

Patterns for retrieving web pages that are no longer directly accessible, and for preserving web content for journalism, research, and legal purposes.

Archive service hierarchy

Try services in this order for maximum coverage:

┌─────────────────────────────────────────────────────────────────┐
│                    ARCHIVE RETRIEVAL CASCADE                     │
├─────────────────────────────────────────────────────────────────┤
│                                                                  │
│  1. Wayback Machine (archive.org)                               │
│     └─ 900B+ pages, historical depth, API access                │
│                         ↓ not found                              │
│  2. Archive.today (archive.is/archive.ph)                       │
│     └─ On-demand snapshots, paywall bypass                      │
│     └─ Caveat (2026): FBI subpoenaed registrar in Oct 2025;     │
│        Wikipedia deprecated as citation source in Feb 2026 —    │
│        prefer Wayback / Perma.cc for legal or citation use      │
│                         ↓ not found                              │
│  3. Memento Time Travel (aggregator)                            │
│     └─ Searches multiple archives simultaneously                │
│                                                                  │
│  Retired (do not use): Google Cache (`cache:` operator) was     │
│  shut down in Sept 2024; Bing Cache dropdown was removed in     │
│  the same year. Both formerly fed this cascade.                 │
│                                                                  │
└─────────────────────────────────────────────────────────────────┘

Wayback Machine API

Check if URL is archived

import requests
from typing import Optional
from datetime import datetime
from urllib.parse import quote, unquote

def check_wayback_availability(url: str) -> Optional[dict]:
    """Look up the closest Wayback Machine snapshot for a URL.

    Sends a GET request to the Wayback availability endpoint
    (``https://archive.org/wayback/available``) with a 10-second timeout.

    Args:
        url: The original URL to look up.

    Returns:
        A dict with the keys ``available``, ``url``, ``timestamp`` and
        ``status`` copied from the ``closest`` snapshot record in the
        response, or ``None`` when no such record is present, the request
        fails, the server answers with an HTTP error status, or the body
        is not a JSON object of the expected shape.
    """
    api_url = "https://archive.org/wayback/available"

    # Only the request and the JSON decoding sit inside the try, so the
    # handler cannot mask unrelated programming errors.
    try:
        response = requests.get(api_url, params={'url': url}, timeout=10)
        # Treat 4xx/5xx as "no answer" instead of parsing an error page.
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError):
        # Best-effort lookup: network, HTTP-status and JSON-decoding
        # failures all collapse to "not available".
        return None

    # Validate the shape explicitly rather than relying on a blanket
    # exception handler to absorb AttributeError on unexpected payloads.
    if not isinstance(data, dict):
        return None
    archived = data.get('archived_snapshots')
    closest = archived.get('closest') if isinstance(archived, dict) else None
    if not closest or not isinstance(closest, dict):
        return None

    return {
        'available': closest.get('available', False),
        'url': closest.get('url'),
        'timestamp': closest.get('timestamp'),
        'status': closest.get('status')
    }

def get_wayback_url(url: str, timestamp: Optional[str] = None) -> str:
    """Build the Wayback Machine replay URL for a page.

    Returns the canonical raw form (``.../web/<timestamp>/<url>``). The
    original URL is appended verbatim; no percent-encoding is applied
    here. If you intend to navigate to the returned link in a browser AND
    the target URL has ``#`` fragments, encode at the call site with
    ``urllib.parse.quote`` so the browser doesn't strip the fragment
    before request dispatch.

    Args:
        url: Original URL to retrieve.
        timestamp: Optional ``YYYYMMDDHHMMSS`` string. ``None`` (or any
            other falsy value, such as an empty string) omits the
            timestamp segment, which requests the latest capture.

    Returns:
        The replay URL as a string.
    """
    # Truthiness (not ``is None``) is deliberate: an empty timestamp must
    # not produce a ``/web//<url>`` path.
    if timestamp:
        return f"https://web.archive.org/web/{timestamp}/{url}"
    return f"https://web.archive.org/web/{url}"

Save page to Wayback Machine

def save_to_wayback(url: str, s3_keys: Optional[tuple[str, str]] = None) -> Optional[str]:
    """Ask the Wayback Machine's Save Page Now endpoint to capture a URL.

    Anonymous requests are rate-limited at roughly 15/minute. Supplying
    `s3_keys=(access_key, secret)` from an Internet Archive account
    raises the cap (to roughly 50/minute) and avoids silent drops on
    paywalled / heavily JS-rendered pages.

    Args:
        url: The page to capture.
        s3_keys: Optional `(access_key, secret)` pair; when given, it is
            sent as a `LOW access_key:secret` Authorization header.

    Returns:
        The final response URL when the request ends with HTTP 200,
        otherwise None (including when the request raises).
    """
    # Decode before encoding so existing %xx escapes are normalized
    # instead of being double-encoded into %25xx.
    encoded_target = quote(unquote(url), safe='')
    save_url = f"https://web.archive.org/save/{encoded_target}"

    request_headers = {'User-Agent': 'Mozilla/5.0 (research-archiver)'}
    if s3_keys:
        request_headers['Authorization'] = f'LOW {s3_keys[0]}:{s3_keys[1]}'

    try:
        reply = requests.get(save_url, headers=request_headers, timeout=60)
        # After redirects are followed, the final URL is the archive URL
        # in the common case (async captures may use the `Link` header).
        return reply.url if reply.status_code == 200 else None
    except Exception:
        return None

CDX API for historical snapshots

def get_all_snapshots(url: str, limit: int = 100) -> list[dict]:
    """List archived captures of a URL via the Wayback CDX API.

    Args:
        url: The original URL to query.
        limit: Value sent as the CDX `limit` parameter.

    Returns:
        One dict per capture, keyed by the header row of the response
        (the request asks for timestamp, original, statuscode, digest and
        length), plus a `wayback_url` replay link. An empty list is
        returned when there are no data rows or anything goes wrong.
    """
    query = {
        'url': url,
        'output': 'json',
        'limit': limit,
        'fl': 'timestamp,original,statuscode,digest,length'
    }

    try:
        reply = requests.get(
            "https://web.archive.org/cdx/search/cdx", params=query, timeout=30
        )
        rows = reply.json()

        # Row 0 holds the column names; fewer than two rows means no captures.
        if len(rows) < 2:
            return []

        column_names = rows[0]

        def to_snapshot(row):
            # Pair each value with its column name, then attach the replay link.
            record = dict(zip(column_names, row))
            record['wayback_url'] = (
                f"https://web.archive.org/web/{record['timestamp']}/{record['original']}"
            )
            return record

        return [to_snapshot(row) for row in rows[1:]]
    except Exception:
        return []

Archive.today integration

Save to Archive.today

import re
import requests
from urllib.parse import quote, unquote, urljoin

def save_to_archive_today(url: str) -> Optional[str]:
    """Submit a URL to Archive.today and return the snapshot URL.

    Archive.today applies rate limiting and CAPTCHAs, so this is suited
    to basic archiving; high-volume use may need manual intervention.

    Operational notes (2026): the FBI subpoenaed archive.today's
    registrar in October 2025; Wikipedia stopped accepting it as a
    citation source in February 2026 after the site shipped
    DDoS-attack code in January 2026. It remains useful for capturing
    content the Wayback Machine can't render, but treat it as secondary
    to Wayback / Perma.cc for legal or citation use.

    Args:
        url: The page to archive.

    Returns:
        The snapshot URL taken from the `Location` header (on a 30x
        response) or the `Refresh` header (on a 200 response), resolved
        against the response URL; None if neither is present or the
        request raises.
    """
    form_fields = {
        'url': url,
        'anyway': '1'  # Archive even if recent snapshot exists
    }

    try:
        # Redirects are not followed: doing so can land on /wip/ pages or
        # hide the canonical snapshot URL, so the headers are read directly.
        reply = requests.post(
            "https://archive.today/submit/",
            data=form_fields,
            timeout=60,
            allow_redirects=False,
            headers={'User-Agent': 'Mozilla/5.0 (research-archiver)'},
        )
        status = reply.status_code

        if status in (301, 302, 303, 307, 308):
            # Shape 1: 30x with Location (which may be a relative URL).
            redirect_target = reply.headers.get('Location')
            if redirect_target:
                return urljoin(reply.url, redirect_target)
        elif status == 200:
            # Shape 2: 200 with "Refresh: 0;url=<snapshot>"; the keyword
            # is matched case-insensitively.
            refresh_header = reply.headers.get('Refresh', '')
            found = re.search(r'\burl\s*=\s*(.+)', refresh_header, re.IGNORECASE)
            if found:
                snapshot_ref = found.group(1).strip().strip('\'"')
                return urljoin(reply.url, snapshot_ref)
        return None
    except Exception:
        return None

def search_archive_today(url: str) -> Optional[str]:
    """Search for existing Archive.today snapshot.

    Uses the /newest/<url> lookup which 302s to the most recent
    snapshot (or

web-archiving

How to add

Drop this on your repo README

Related skills

algorithmic-art

doc-coauthoring

blog-writing-guide

agents-md

Get new Escrita e Conteúdo skills every Monday