Content access methodology
Ethical and legal approaches for accessing restricted web content for journalism and research.
Access hierarchy (most to least preferred)
┌─────────────────────────────────────────────────────────────────┐
│                CONTENT ACCESS DECISION HIERARCHY                │
├─────────────────────────────────────────────────────────────────┤
│                                                                 │
│  1. FULLY LEGAL (Always try first)                              │
│     ├─ Library databases (PressReader, ProQuest, JSTOR)         │
│     ├─ Open access tools (Unpaywall, CORE, PubMed Central)      │
│     ├─ Author direct contact                                    │
│     └─ Interlibrary loan                                        │
│                                                                 │
│  2. LEGAL (Browser features)                                    │
│     ├─ Reader Mode (Safari, Firefox, Edge)                      │
│     ├─ Wayback Machine archives                                 │
│     └─ Google Scholar "All versions"                            │
│                                                                 │
│  3. GREY AREA (Use with caution)                                │
│     ├─ Archive.is for individual articles                       │
│     ├─ Disable JavaScript (breaks functionality)                │
│     └─ VPNs for geo-blocked content                             │
│                                                                 │
│  4. NOT RECOMMENDED                                             │
│     ├─ Credential sharing                                       │
│     ├─ Systematic scraping                                      │
│     └─ Commercial use of bypassed content                       │
│                                                                 │
└─────────────────────────────────────────────────────────────────┘
Open access tools for academic papers
Unpaywall browser extension
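Unpaywall finds free, legal full-text copies of scholarly articles, drawing on an index of 50M+ open-access records. The browser extension surfaces these copies while you browse; the same index can be queried programmatically through the API shown below.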
# Unpaywall API (free, requires email for identification)
import requests
def find_open_access(doi: str, email: str) -> dict:
    """Find an open access version of a paper using the Unpaywall API.

    Args:
        doi: Digital Object Identifier (e.g., "10.1038/nature12373").
            Surrounding whitespace is ignored.
        email: Your email for API identification.

    Returns:
        One of three dict shapes:

        * Open access copy found: ``is_open_access`` is True, together with
          ``oa_url`` (PDF link when present, otherwise the landing URL),
          ``oa_status``, ``host_type`` and ``version``.
        * No open access copy: ``is_open_access`` is False, together with
          ``title`` and ``journal``.
        * Failure: ``{'error': <message>}`` when the DOI is blank or the
          API answers with a non-200 status.

    Exceptions raised by ``requests.get`` or ``response.json()`` are not
    caught here and propagate to the caller.
    """
    from urllib.parse import quote

    doi = (doi or '').strip()
    if not doi:
        # Nothing to look up: skip the request instead of hitting the bare
        # endpoint and reporting a confusing HTTP status.
        return {'error': 'DOI is required'}

    # Percent-encode the DOI (keeping "/") so characters such as "#" or "?"
    # cannot truncate the path, and pass the email through ``params`` so
    # characters such as "+" are encoded correctly.
    url = f"https://api.unpaywall.org/v2/{quote(doi, safe='/')}"
    response = requests.get(url, params={'email': email}, timeout=30)
    if response.status_code != 200:
        return {'error': f'Status {response.status_code}'}

    data = response.json()
    if data.get('is_oa'):
        # ``or {}`` also covers an explicit null, which a plain
        # ``.get(key, {})`` default would let through as None.
        best_location = data.get('best_oa_location') or {}
        return {
            'is_open_access': True,
            'oa_url': best_location.get('url_for_pdf') or best_location.get('url'),
            'oa_status': data.get('oa_status'),  # gold, green, bronze, hybrid
            'host_type': best_location.get('host_type'),  # publisher, repository
            'version': best_location.get('version'),  # publishedVersion, acceptedVersion
        }
    return {
        'is_open_access': False,
        'title': data.get('title'),
        'journal': data.get('journal_name'),
    }
# Usage: look up a sample DOI and print the free PDF link when one exists
result = find_open_access(
    doi="10.1038/nature12373",
    email="researcher@example.com",
)
if result.get('is_open_access'):
    print(f"Free PDF at: {result['oa_url']}")
CORE API (290M+ open-access works)
# CORE API - requires free API key from https://core.ac.uk/
import requests
class CORESearch:
    """Small client for the CORE v3 API at ``https://api.core.ac.uk/v3``.

    Every request sends the API key as a bearer token and uses a 30 second
    timeout. Exceptions raised by ``requests.get`` or ``response.json()``
    are not caught and propagate to the caller.
    """

    def __init__(self, api_key: str):
        """Store the API key and the base URL used by all requests."""
        self.api_key = api_key
        self.base_url = "https://api.core.ac.uk/v3"

    def _auth_headers(self) -> dict:
        """Return the Authorization header shared by every request."""
        return {'Authorization': f'Bearer {self.api_key}'}

    @staticmethod
    def _summarize(item: dict) -> dict:
        """Reduce one raw search result to the fields this client exposes.

        Missing and explicitly null ``authors`` / ``abstract`` values are
        treated the same way (empty list / empty string) so a null field
        cannot raise while iterating or slicing.
        """
        return {
            'title': item.get('title'),
            'authors': [a.get('name') for a in item.get('authors') or []],
            'year': item.get('yearPublished'),
            'doi': item.get('doi'),
            'download_url': item.get('downloadUrl'),
            # Abstracts are capped at 500 characters.
            'abstract': (item.get('abstract') or '')[:500],
        }

    def search(self, query: str, limit: int = 10) -> list:
        """Search CORE database for open access papers.

        Args:
            query: Search string sent as the ``q`` parameter.
            limit: Maximum number of results requested.

        Returns:
            A list of dicts with ``title``, ``authors``, ``year``, ``doi``,
            ``download_url`` and ``abstract``; an empty list when the API
            answers with a non-200 status.
        """
        response = requests.get(
            f"{self.base_url}/search/works",
            headers=self._auth_headers(),
            params={'q': query, 'limit': limit},
            timeout=30,
        )
        if response.status_code != 200:
            return []
        return [
            self._summarize(item)
            for item in response.json().get('results') or []
        ]

    def get_by_doi(self, doi: str) -> dict:
        """Get paper by DOI.

        Returns the decoded JSON body on a 200 response, otherwise an
        empty dict.
        """
        from urllib.parse import quote

        # NOTE(review): this requests /works/{doi}; confirm against the CORE
        # docs that the endpoint accepts a DOI as the identifier.
        # The DOI is percent-encoded (keeping "/") so characters such as
        # "#" or "?" cannot truncate the path.
        response = requests.get(
            f"{self.base_url}/works/{quote(doi, safe='/')}",
            headers=self._auth_headers(),
            timeout=30,
        )
        return response.json() if response.status_code == 200 else {}
Semantic Scholar API (220M+ papers)
# Semantic Scholar API - free, but request a key from
# https://www.semanticscholar.org/product/api for anything beyond
# ad-hoc calls. Unkeyed access has been tightened to a low shared
# rate limit and is no longer reliable for batch lookups.
import requests
def search_semantic_scholar(query: str, limit: int = 10,
                            api_key: str = None) -> list:
    """Search Semantic Scholar for papers with open access links.

    Args:
        query: Free-text search string. A blank query returns an empty
            list without contacting the API.
        limit: Maximum number of results requested.
        api_key: Optional Semantic Scholar API key, sent in the
            ``x-api-key`` header. When omitted the request is unkeyed.

    Returns:
        A list of dicts with ``title``, ``authors``, ``year``,
        ``citations``, ``open_access_url`` (None when the paper has no
        open access PDF) and ``abstract`` (capped at 500 characters).
        An empty list when the API answers with a non-200 status.

    Exceptions raised by ``requests.get`` or ``response.json()`` are not
    caught here and propagate to the caller.
    """
    if not query or not query.strip():
        return []

    url = "https://api.semanticscholar.org/graph/v1/paper/search"
    params = {
        'query': query,
        'limit': limit,
        'fields': 'title,authors,year,abstract,openAccessPdf,citationCount'
    }
    headers = {'x-api-key': api_key} if api_key else {}
    response = requests.get(url, params=params, headers=headers, timeout=30)
    if response.status_code != 200:
        return []

    results = []
    for paper in response.json().get('data') or []:
        # Each ``or`` fallback treats an explicit null like a missing field.
        oa_pdf = paper.get('openAccessPdf') or {}
        results.append({
            'title': paper.get('title'),
            'authors': [a.get('name') for a in paper.get('authors') or []],
            'year': paper.get('year'),
            'citations': paper.get('citationCount') or 0,
            'open_access_url': oa_pdf.get('url'),
            'abstract': (paper.get('abstract') or '')[:500]
        })
    return results
def get_paper_by_doi(doi: str, api_key: str = None) -> dict:
    """Get paper details by DOI.

    Args:
        doi: Digital Object Identifier of the paper. Surrounding
            whitespace is ignored; a blank DOI returns an empty dict
            without contacting the API.
        api_key: Optional Semantic Scholar API key, sent in the
            ``x-api-key`` header. When omitted the request is unkeyed.

    Returns:
        The decoded JSON body on a 200 response (requested fields: title,
        authors, year, abstract, openAccessPdf, references, citations),
        otherwise an empty dict.

    Exceptions raised by ``requests.get`` or ``response.json()`` are not
    caught here and propagate to the caller.
    """
    from urllib.parse import quote

    doi = (doi or '').strip()
    if not doi:
        return {}

    # Percent-encode the DOI (keeping "/") so characters such as "#" or "?"
    # cannot truncate the request path.
    url = f"https://api.semanticscholar.org/graph/v1/paper/DOI:{quote(doi, safe='/')}"
    params = {
        'fields': 'title,authors,year,abstract,openAccessPdf,references,citations'
    }
    headers = {'x-api-key': api_key} if api_key else {}
    response = requests.get(url, params=params, headers=headers, timeout=30)
    return response.json() if response.status_code == 200 else {}
OpenAlex API (250M+ scholarly works)
OpenAlex replaced Microsoft Academic Graph after MAG was retired and has become the de-facto open scholarly data backbone — many tools (Unpaywall companion data, Local Citation Network, OpenCitations) now resolve via OpenAlex.
Auth note (2026): OpenAlex moved to API-key-required access on February 13, 2026, with a credit-based rate model. Anonymous access to the website is still free; API access via key has metered limits that step up with paid tiers — verify the current model at https://docs.openalex.org/. Get a free key from your OpenAlex account.
# OpenAlex API client
# https://docs.openalex.org/
# Pricing & key issuance: https://openalex.org/
import requests
def search_openalex(query: str, api_key: str, limit: int = 25,
email: str = None) -> list:
"""Search OpenAlex for works.
Args:
query: free-text search string.
api_key: OpenAlex API key (required as of 2026-02-13).
limit: max results per page (1-200).
email: contact email for the polite pool — recommended even
with a key, since OpenAlex prioritizes requests with