cBioPortal Database
Overview
cBioPortal for Cancer Genomics is a public repository of cancer genomics data including TCGA, ICGC, and hundreds of curated studies spanning 100+ cancer types. It provides somatic mutation profiles, copy number alterations (CNA), gene expression, clinical data (survival, stage, treatment history), and methylation data for tens of thousands of patient samples. Data is accessible via a REST API at https://www.cbioportal.org/api/ with no authentication required.
When to Use
- Retrieving somatic mutation profiles (variant type, amino acid change) for a gene across TCGA studies
- Querying copy number alteration data (amplification, deep deletion) for candidate cancer driver genes
- Accessing clinical data — overall survival, disease-free survival, tumor stage — for survival curve analysis
- Identifying which cancer studies have molecular profiling data for a specific cancer type (e.g., breast, lung)
- Downloading gene expression (RNA-seq FPKM/RSEM) data from specific TCGA cohorts for differential expression analysis
- Correlating genomic alterations with clinical outcomes in a specific study
- Use `gnomad-database` instead when you need population-level variant allele frequencies in healthy individuals
- For drug-gene interaction lookups use `opentargets-database`; cBioPortal provides the genomic alteration data, not drug interaction annotations
Prerequisites
- Python packages: `requests`, `pandas`, `matplotlib`
- Data requirements: Hugo gene symbols (e.g., `TP53`) or Entrez gene IDs (e.g., `7157`), cBioPortal study IDs (e.g., `brca_tcga_pan_can_atlas_2018`), molecular profile IDs
- Environment: internet connection; no API key required
- Rate limits: no strict rate limits; use `time.sleep(0.2)` between batch requests for polite access
pip install requests pandas matplotlib
Quick Start
import requests
import pandas as pd
BASE_URL = "https://www.cbioportal.org/api"
def cbio_get(endpoint, params=None):
    """GET request to cBioPortal REST API, returns parsed JSON.

    Args:
        endpoint: Path relative to ``BASE_URL`` (e.g. ``"studies"``).
        params: Optional dict of query-string parameters.

    Returns:
        The decoded JSON response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(
        f"{BASE_URL}/{endpoint}",
        params=params,
        headers={"Accept": "application/json"},
        timeout=30,
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    return response.json()
# List available cancer types
cancer_types = cbio_get("cancer-types")
print(f"Total cancer types: {len(cancer_types)}")
# Total cancer types: 87

# Find TCGA breast cancer study.
# cBioPortal study IDs put the cancer code first ("brca_tcga..."), so the
# substring to look for is "brca_tcga"; "tcga_brca" never matches.
studies = cbio_get("studies", params={"keyword": "breast"})
brca = [s for s in studies if "brca_tcga" in s["studyId"]]
if brca:
    # Several TCGA breast cohorts exist; prefer the PanCancer Atlas one used
    # throughout this guide and fall back to the first match otherwise.
    s = next(
        (x for x in brca if x["studyId"] == "brca_tcga_pan_can_atlas_2018"),
        brca[0],
    )
    print(f"Study: {s['name']}")
    print(f" studyId: {s['studyId']}")
    print(f" Samples: {s['allSampleCount']}")
# Study: Breast Invasive Carcinoma (TCGA, PanCancer Atlas)
# studyId: brca_tcga_pan_can_atlas_2018
# Samples: 1084
Core API
Query 1: Cancer Types and Studies
List available cancer types and find studies by cancer type or keyword.
import requests
import pandas as pd
BASE_URL = "https://www.cbioportal.org/api"
def cbio_get(endpoint, params=None):
    """GET request to cBioPortal REST API, returns parsed JSON.

    Args:
        endpoint: Path relative to ``BASE_URL`` (e.g. ``"cancer-types"``).
        params: Optional dict of query-string parameters.

    Returns:
        The decoded JSON response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(
        f"{BASE_URL}/{endpoint}",
        params=params,
        headers={"Accept": "application/json"},
        timeout=30,
    )
    response.raise_for_status()
    return response.json()
# Get all cancer types
cancer_types = cbio_get("cancer-types")
ct_df = pd.DataFrame(cancer_types)[["cancerTypeId", "name", "dedicatedColor"]]
print(f"Cancer types: {len(ct_df)}")
print(ct_df.head(5).to_string(index=False))

# Find all studies for a cancer type
lung_studies = cbio_get("studies", params={"keyword": "lung adenocarcinoma"})
print(f"\nLung adenocarcinoma studies: {len(lung_studies)}")
for s in lung_studies[:3]:  # show only the first three matches
    print(f" {s['studyId']:40s} n={s['allSampleCount']}")

# Get detailed study metadata including available data types
study_id = "brca_tcga_pan_can_atlas_2018"
study = cbio_get(f"studies/{study_id}")
print(f"Study: {study['name']}")
# referenceGenome may be missing from the response, hence .get with a default
print(f" Reference genome: {study.get('referenceGenome', 'n/a')}")
print(f" All sample count: {study['allSampleCount']}")

# List molecular profiles for the study
profiles = cbio_get("molecular-profiles", params={"studyId": study_id})
print(f"\nMolecular profiles ({len(profiles)} total):")
for p in profiles:
    print(f" {p['molecularProfileId']:55s} [{p['molecularAlterationType']}]")
Query 2: Somatic Mutations
Retrieve mutation data for a gene or set of genes in a study's mutation profile.
import requests, json
import pandas as pd
BASE_URL = "https://www.cbioportal.org/api"
def cbio_post(endpoint, body):
    """POST request to cBioPortal REST API.

    Args:
        endpoint: Path relative to ``BASE_URL``.
        body: JSON-serializable filter object sent as the request body.

    Returns:
        The decoded JSON response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.post(
        f"{BASE_URL}/{endpoint}",
        json=body,
        headers={
            "Accept": "application/json",
            "Content-Type": "application/json",
        },
        timeout=60,
    )
    response.raise_for_status()
    return response.json()
def cbio_get(endpoint, params=None):
    """GET request to cBioPortal REST API, returns parsed JSON.

    Args:
        endpoint: Path relative to ``BASE_URL``.
        params: Optional dict of query-string parameters.

    Returns:
        The decoded JSON response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(
        f"{BASE_URL}/{endpoint}",
        params=params,
        headers={"Accept": "application/json"},
        timeout=30,
    )
    response.raise_for_status()
    return response.json()
# Get all samples for a study
study_id = "brca_tcga_pan_can_atlas_2018"
samples = cbio_get(f"studies/{study_id}/samples", params={"projection": "ID"})
sample_ids = [s["sampleId"] for s in samples]
print(f"Total samples: {len(sample_ids)}")

# Mutation profile ID follows pattern: {studyId}_mutations
profile_id = f"{study_id}_mutations"

# Fetch mutations for TP53 (Entrez gene ID = 7157)
body = {
    "sampleIds": sample_ids[:200],  # first 200 samples
    "entrezGeneIds": [7157],  # TP53
}
mutations = cbio_post(f"molecular-profiles/{profile_id}/mutations/fetch", body)
print(f"TP53 mutations in first 200 samples: {len(mutations)}")

# Summarize by mutation type
mut_df = pd.DataFrame(mutations)
print("\nMutation type distribution:")
if mut_df.empty:
    # An empty result yields a frame with no columns, so indexing
    # "mutationType" would raise KeyError.
    print("(no mutations returned)")
else:
    print(mut_df["mutationType"].value_counts().head(8).to_string())
# Missense_Mutation 102
# Nonsense_Mutation 28
# Splice_Site 14
# Frame_Shift_Del 12
Query 3: Copy Number Alterations
Fetch discrete CNA data (amplification = 2, gain = 1, diploid = 0, loss = -1, deep deletion = -2).
import requests
import pandas as pd
BASE_URL = "https://www.cbioportal.org/api"
def cbio_post(endpoint, body):
    """POST request to cBioPortal REST API.

    Args:
        endpoint: Path relative to ``BASE_URL``.
        body: JSON-serializable filter object sent as the request body.

    Returns:
        The decoded JSON response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.post(
        f"{BASE_URL}/{endpoint}",
        json=body,
        headers={
            "Accept": "application/json",
            "Content-Type": "application/json",
        },
        timeout=60,
    )
    response.raise_for_status()
    return response.json()
def cbio_get(endpoint, params=None):
    """GET request to cBioPortal REST API, returns parsed JSON.

    Args:
        endpoint: Path relative to ``BASE_URL``.
        params: Optional dict of query-string parameters.

    Returns:
        The decoded JSON response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(
        f"{BASE_URL}/{endpoint}",
        params=params,
        headers={"Accept": "application/json"},
        timeout=30,
    )
    response.raise_for_status()
    return response.json()
study_id = "brca_tcga_pan_can_atlas_2018"
# CNA profile: discrete copy number data
cna_profile_id = f"{study_id}_gistic"  # GISTIC-derived discrete CNA
samples = cbio_get(f"studies/{study_id}/samples", params={"projection": "ID"})
sample_ids = [s["sampleId"] for s in samples][:300]

# Fetch CNA for ERBB2 (Entrez 2064) and MYC (Entrez 4609)
body = {
    "sampleIds": sample_ids,
    "entrezGeneIds": [2064, 4609],  # ERBB2, MYC
}
cna_data = cbio_post(
    f"molecular-profiles/{cna_profile_id}/molecular-data/fetch", body
)
print(f"CNA records retrieved: {len(cna_data)}")
cna_df = pd.DataFrame(cna_data)

# CNA values: 2=amplification, 1=gain, 0=diploid, -1=loss, -2=deep deletion
cna_label = {2: "AMP", 1: "GAIN", 0: "DIPLOID", -1: "LOSS", -2: "HOMDEL"}


def _label_cna(value):
    """Return the label for a raw CNA value, or the value as text if unmapped.

    Values that cannot be converted with ``int()`` (for example a missing-data
    marker) are passed through as strings instead of raising.
    """
    try:
        return cna_label.get(int(value), str(value))
    except (TypeError, ValueError):
        return str(value)


print("\nERBB2 CNA distribution:")
if cna_df.empty:
    # An empty result yields a frame with no columns, so filtering on
    # "entrezGeneId" would raise KeyError.
    print("(no CNA records returned)")
else:
    erbb2 = cna_df[cna_df["entrezGeneId"] == 2064]
    erbb2_counts = erbb2["value"].map(_label_cna).value_counts()
    print(erbb2_counts.to_string())
# DIPLOID 210
# AMP 62
# GAIN 18
# LOSS 10
Query 4: Clinical Data
Retrieve per-sample or per-patient clinical attributes including survival, tumor stage, and treatment.
import requests
import pandas as pd
BASE_URL = "https://www.cbioportal.org/a