Web archiving methodology
Patterns for retrieving web pages that are no longer accessible and for preserving web content for journalism, research, and legal purposes.
Archive service hierarchy
Try services in this order for maximum coverage:
┌─────────────────────────────────────────────────────────────────┐
│ ARCHIVE RETRIEVAL CASCADE │
├─────────────────────────────────────────────────────────────────┤
│ │
│ 1. Wayback Machine (archive.org) │
│ └─ 900B+ pages, historical depth, API access │
│ ↓ not found │
│ 2. Archive.today (archive.is/archive.ph) │
│ └─ On-demand snapshots, paywall bypass │
│ └─ Caveat (2026): FBI subpoenaed registrar in Oct 2025; │
│ Wikipedia deprecated as citation source in Feb 2026 — │
│ prefer Wayback / Perma.cc for legal or citation use │
│ ↓ not found │
│ 3. Memento Time Travel (aggregator) │
│ └─ Searches multiple archives simultaneously │
│ │
│ Retired (do not use): Google Cache (`cache:` operator) was │
│ shut down in Sept 2024; Bing Cache dropdown was removed in │
│ the same year. Both formerly fed this cascade. │
│ │
└─────────────────────────────────────────────────────────────────┘
Wayback Machine API
Check if URL is archived
import requests
from typing import Optional
from datetime import datetime
from urllib.parse import quote, unquote
def check_wayback_availability(url: str, timestamp: Optional[str] = None) -> Optional[dict]:
    """Look up the closest Wayback Machine snapshot for a URL.

    Queries the Wayback availability endpoint and reports the snapshot the
    service marks as "closest".

    Args:
        url: Original URL to look up.
        timestamp: Optional target moment in YYYYMMDDhhmmss format. When
            given, it is forwarded to the service so the snapshot nearest
            that moment is returned; when omitted, the service default
            applies.

    Returns:
        A dict with the keys 'available', 'url', 'timestamp' and 'status'
        copied from the closest snapshot, or None when no snapshot is
        reported or the lookup fails for any reason. This function never
        raises for network or decoding problems.
    """
    api_url = "https://archive.org/wayback/available"
    params = {'url': url}
    if timestamp:
        params['timestamp'] = timestamp

    # Keep the try body limited to the network round-trip and decoding;
    # the lookup is deliberately best-effort, so any failure here simply
    # means "unknown" and is reported as None.
    try:
        response = requests.get(api_url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
    except Exception:
        return None

    # Validate the payload shape explicitly instead of relying on an
    # exception handler to absorb unexpected JSON structures.
    snapshots = data.get('archived_snapshots') if isinstance(data, dict) else None
    closest = snapshots.get('closest') if isinstance(snapshots, dict) else None
    if not closest or not isinstance(closest, dict):
        return None

    return {
        'available': closest.get('available', False),
        'url': closest.get('url'),
        'timestamp': closest.get('timestamp'),
        'status': closest.get('status'),
    }
def get_wayback_url(url: str, timestamp: Optional[str] = None) -> str:
    """Generate a Wayback Machine replay URL for a page.

    Returns the canonical raw form (`.../web/<timestamp>/<url>`) per
    Wayback's replay-URL convention. The URL is embedded verbatim; if you
    intend to navigate to the returned link in a browser AND the target
    URL has `#` fragments, encode at the call site with
    urllib.parse.quote so the browser doesn't strip the fragment before
    request dispatch.

    Args:
        url: Original URL to retrieve.
        timestamp: Optional YYYYMMDDHHMMSS string. None (or an empty
            string) produces the timestamp-less form, which addresses the
            latest capture.

    Returns:
        The replay URL as a string. No network request is made.
    """
    if timestamp:
        return f"https://web.archive.org/web/{timestamp}/{url}"
    return f"https://web.archive.org/web/{url}"
Save page to Wayback Machine
def save_to_wayback(url: str, s3_keys: Optional[tuple[str, str]] = None) -> Optional[str]:
    """Ask the Wayback Machine's Save Page Now endpoint to capture a URL.

    Anonymous requests are rate-limited at roughly 15/minute. Supplying
    `s3_keys=(access_key, secret)` from an Internet Archive account raises
    the cap to roughly 50/minute and avoids silent drops on paywalled or
    heavily JS-rendered pages.

    Args:
        url: Page to archive.
        s3_keys: Optional (access_key, secret) pair sent in the
            Authorization header.

    Returns:
        The final response URL when the request ends with HTTP 200,
        otherwise None. Request failures are swallowed and also yield
        None.
    """
    # Decode before encoding so escapes already present in the input
    # (%xx) are not encoded a second time into %25xx.
    encoded_target = quote(unquote(url), safe='')
    capture_endpoint = f"https://web.archive.org/save/{encoded_target}"

    request_headers = {'User-Agent': 'Mozilla/5.0 (research-archiver)'}
    if s3_keys:
        request_headers['Authorization'] = f'LOW {s3_keys[0]}:{s3_keys[1]}'

    try:
        reply = requests.get(capture_endpoint, headers=request_headers, timeout=60)
        # After redirects are followed, the final URL is where Save Page
        # Now exposes the archived copy in the common case (async
        # captures may instead advertise it via the `Link` header).
        return reply.url if reply.status_code == 200 else None
    except Exception:
        return None
CDX API for historical snapshots
def get_all_snapshots(url: str, limit: int = 100) -> list[dict]:
    """List archived captures of a URL via the Wayback CDX API.

    Args:
        url: URL whose capture history is requested.
        limit: Maximum number of rows to ask the service for.

    Returns:
        One dict per capture, keyed by the column names in the response's
        first row (the request asks for timestamp, original, statuscode,
        digest and length), plus a 'wayback_url' replay link. Returns an
        empty list when there are no captures or on any failure.
    """
    query = {
        'url': url,
        'output': 'json',
        'limit': limit,
        'fl': 'timestamp,original,statuscode,digest,length',
    }

    def _attach_replay_link(record: dict) -> dict:
        # Build the replay link from the capture's own timestamp and URL.
        record['wayback_url'] = (
            f"https://web.archive.org/web/{record['timestamp']}/{record['original']}"
        )
        return record

    try:
        reply = requests.get(
            "https://web.archive.org/cdx/search/cdx", params=query, timeout=30
        )
        payload = reply.json()
        # Row 0 holds the column names; anything shorter has no captures.
        if len(payload) < 2:
            return []
        columns = payload[0]
        return [
            _attach_replay_link(dict(zip(columns, row)))
            for row in payload[1:]
        ]
    except Exception:
        return []
Archive.today integration
Save to Archive.today
import re
import requests
from urllib.parse import quote, unquote, urljoin
def save_to_archive_today(url: str) -> Optional[str]:
    """Submit a URL to Archive.today and return the snapshot address.

    Archive.today applies rate limiting and CAPTCHA challenges, so this
    works for basic archiving but may need manual intervention for
    high-volume use.

    Operational notes (2026): the FBI subpoenaed archive.today's
    registrar in October 2025; Wikipedia stopped accepting it as a
    citation source in February 2026 after the site shipped DDoS-attack
    code in January 2026. It remains useful for capturing content the
    Wayback Machine can't render, but treat it as secondary to
    Wayback / Perma.cc for legal or citation use.

    Args:
        url: Page to archive.

    Returns:
        The absolute snapshot URL taken from the response's Location or
        Refresh header, or None when neither yields one or the request
        fails.
    """
    form_fields = {
        'url': url,
        'anyway': '1',  # Archive even if recent snapshot exists
    }
    redirect_codes = (301, 302, 303, 307, 308)

    try:
        # Redirects are not followed: doing so silently can land on
        # /wip/ pages or hide the canonical snapshot URL, so the headers
        # are inspected directly instead.
        reply = requests.post(
            "https://archive.today/submit/",
            data=form_fields,
            timeout=60,
            allow_redirects=False,
            headers={'User-Agent': 'Mozilla/5.0 (research-archiver)'},
        )
        status = reply.status_code

        if status in redirect_codes:
            # Shape 1: 30x with a Location header, which may be relative
            # (RFC 7231) and is therefore resolved against the request URL.
            location = reply.headers.get('Location')
            if location:
                return urljoin(reply.url, location)
        elif status == 200:
            # Shape 2: 200 with "Refresh: 0;url=<snapshot>"; the keyword
            # is matched case-insensitively.
            refresh_value = reply.headers.get('Refresh', '')
            found = re.search(r'\burl\s*=\s*(.+)', refresh_value, re.IGNORECASE)
            if found:
                snapshot_ref = found.group(1).strip().strip('\'"')
                return urljoin(reply.url, snapshot_ref)

        return None
    except Exception:
        return None
def search_archive_today(url: str) -> Optional[str]:
"""Search for existing Archive.today snapshot.
Uses the /newest/<url> lookup which 302s to the most recent
snapshot (or