ClinVar Clinical Variants Database
Overview
ClinVar is NCBI's public archive of variant interpretations submitted by clinical laboratories, researchers, and expert panels. It contains 2M+ variants with clinical significance classifications (Pathogenic, Likely Pathogenic, VUS, Likely Benign, Benign) for over 6,000 conditions. Access through NCBI E-utilities is free and requires no authentication.
When to Use
- Checking whether a specific variant (rsID, HGVS, or genomic position) has a clinical significance classification
- Retrieving all pathogenic/likely-pathogenic variants in a gene of interest
- Identifying conflicting interpretations between submitting laboratories
- Pulling condition/phenotype associations for a variant (MIM, MeSH, HPO terms)
- Building variant filtering pipelines that prioritize clinically actionable variants
- For somatic cancer variants, also check cosmic-database; for GWAS associations use gwas-database
Prerequisites
- Python packages: requests, xml.etree.ElementTree (stdlib)
- Data requirements: gene symbols, rsIDs, HGVS strings, or ClinVar Variation IDs
- Environment: internet connection; NCBI Entrez email required (set the email parameter)
- Rate limits: 3 requests/second unauthenticated; 10/second with API key (free at https://www.ncbi.nlm.nih.gov/account/)
pip install requests
# No additional packages required; xml.etree is part of Python stdlib
Quick Start
import requests
EMAIL = "your@email.com" # required by NCBI policy
def clinvar_search(query, retmax=10):
    """Run an ESearch against ClinVar and return the matching Variation IDs.

    Args:
        query: Entrez query string, e.g. "BRCA1[gene] AND pathogenic[clinsig]".
        retmax: Maximum number of IDs to request from the server.

    Returns:
        The "idlist" entry of the ESearch JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    endpoint = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    search_params = {
        "db": "clinvar",
        "term": query,
        "retmax": retmax,
        "retmode": "json",
        "email": EMAIL,
    }
    response = requests.get(endpoint, params=search_params)
    response.raise_for_status()
    payload = response.json()
    return payload["esearchresult"]["idlist"]
# Find pathogenic BRCA1 variants (only the first five IDs are requested)
brca1_query = "BRCA1[gene] AND pathogenic[clinsig]"
ids = clinvar_search(brca1_query, retmax=5)
print(f"Found variation IDs: {ids}")
Core API
Query 1: Search Variants by Gene and Clinical Significance
Use ESearch to find ClinVar Variation IDs matching a structured query.
import requests
EMAIL = "your@email.com"
BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
def esearch(query, retmax=200):
    """Search ClinVar via ESearch and report both the IDs and the hit count.

    Args:
        query: Entrez query string (field tags such as [gene], [clinsig],
            [rs] and [dis] are used by the examples in this document).
        retmax: Maximum number of IDs to return in this call.

    Returns:
        A tuple ``(idlist, count)``: the IDs returned by this call and the
        total number of matches reported by the server, as an int.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(
        f"{BASE}/esearch.fcgi",
        params={
            "db": "clinvar",
            "term": query,
            "retmax": retmax,
            "retmode": "json",
            "email": EMAIL,
        },
    )
    response.raise_for_status()
    search_result = response.json()["esearchresult"]
    id_list = search_result["idlist"]
    total_hits = int(search_result["count"])
    return id_list, total_hits
# Gene-specific pathogenic variants
brca2_query = "BRCA2[gene] AND (pathogenic[clinsig] OR likely pathogenic[clinsig])"
ids, total = esearch(brca2_query)
print(f"Pathogenic/LP BRCA2 variants: {total} total, retrieved {len(ids)}")
print(f"First 5 IDs: {ids[:5]}")

# By rsID (the total count is not needed here)
rsid_query = "rs80357906[rs]"
ids, _ = esearch(rsid_query)
print(f"Variant IDs for rs80357906: {ids}")

# By condition name
condition_query = "breast cancer[dis] AND pathogenic[clinsig]"
ids, total = esearch(condition_query)
print(f"Pathogenic variants for breast cancer: {total}")
Query 2: Fetch Variant Summary Records
Retrieve structured summary data (JSON) for a list of Variation IDs.
import requests, json
EMAIL = "your@email.com"
BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
def esummary(ids):
    """Fetch ESummary records for a list of ClinVar Variation IDs.

    Args:
        ids: Iterable of Variation IDs; each item is converted with ``str``,
            so both strings and integers are accepted.

    Returns:
        The "result" object of the ESummary JSON response: a dict with a
        "uids" list plus one record per returned ID. For empty input,
        ``{"uids": []}`` is returned without contacting the server.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    id_param = ",".join(str(variation_id) for variation_id in ids)
    if not id_param:
        # Nothing to look up: skip the request and return an empty result
        # of the same shape so callers can iterate "uids" unconditionally.
        return {"uids": []}
    # POST keeps long ID lists out of the URL.
    r = requests.post(
        f"{BASE}/esummary.fcgi",
        data={"db": "clinvar", "id": id_param,
              "retmode": "json", "email": EMAIL},
    )
    r.raise_for_status()
    return r.json()["result"]
def esearch_func(q):
    """Return up to five ClinVar Variation IDs matching the Entrez query *q*.

    The returned list of ID strings can be passed directly to ``esummary``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    r = requests.get(
        f"{BASE}/esearch.fcgi",
        params={"db": "clinvar", "term": q, "retmax": 5,
                "retmode": "json", "email": EMAIL},
    )
    r.raise_for_status()
    return r.json()["esearchresult"]["idlist"]
# Manual example with known IDs
sample_ids = ["12375", "17684", "54270"]
result = esummary(sample_ids)
for vid in result.get("uids", []):
    rec = result[vid]
    # ClinVar 2024 schema: clinical_significance was replaced by germline_classification
    # (also: clinical_impact_classification, oncogenicity_classification — same shape, often empty)
    gc = rec.get("germline_classification", {})
    # "genes" can be present but empty; fall back to a blank entry so the
    # lookup below prints None instead of raising IndexError.
    genes = rec.get("genes") or [{}]
    print(f"\nVariation {vid}: {rec.get('title')}")
    print(f" ClinSig : {gc.get('description')}")
    print(f" Review : {gc.get('review_status')}")
    print(f" Gene : {genes[0].get('symbol')}")
Query 3: Fetch Full XML Records
Retrieve the complete variant record in XML for detailed submitter and condition data.
import requests
import xml.etree.ElementTree as ET
EMAIL = "your@email.com"
BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
def efetch_xml(variation_ids):
    """Fetch full ClinVar records as XML and return the parsed root element.

    Args:
        variation_ids: Iterable of ClinVar Variation IDs; each item is
            converted with ``str``, so strings and integers both work.

    Returns:
        The root ``xml.etree.ElementTree.Element`` of the EFetch response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        xml.etree.ElementTree.ParseError: If the response is not valid XML.
    """
    # ClinVar 2024 XML overhaul: "clinvarset" rettype returns an empty stub.
    # Use rettype="vcv" + is_variationid="true" to get the new <VariationArchive> records.
    r = requests.post(
        f"{BASE}/efetch.fcgi",
        data={"db": "clinvar",
              "id": ",".join(str(vid) for vid in variation_ids),
              "rettype": "vcv", "is_variationid": "true",
              "retmode": "xml", "email": EMAIL},
    )
    r.raise_for_status()
    # Parse the raw bytes so the XML parser applies the encoding declared in
    # the document instead of whatever charset requests guessed for r.text.
    return ET.fromstring(r.content)
root = efetch_xml(["17677"])  # BRCA1 c.5266dupC (rs80357906)

# Aggregate (germline) classification — one per VariationArchive
for va in root.iter("VariationArchive"):
    name = va.get("VariationName")
    gc = va.find("./ClassifiedRecord/Classifications/GermlineClassification")
    if gc is None:
        desc = rstat = None
    else:
        desc = gc.find("Description")
        rstat = gc.find("ReviewStatus")
    # Fall back to 'n/a' whenever an element is missing from the record.
    desc_text = 'n/a' if desc is None else desc.text
    rstat_text = 'n/a' if rstat is None else rstat.text
    print(f"{name}: {desc_text} ({rstat_text})")

    # Per-submitter assertions
    for ca in va.iter("ClinicalAssertion"):
        acc = ca.find("ClinVarAccession")
        cls = ca.find("Classification/GermlineClassification")
        if acc is None or cls is None:
            continue  # skip assertions lacking an accession or classification
        print(f" {acc.get('SubmitterName', '?')}: {cls.text}")
Query 4: ClinVar FTP Bulk Data
For large-scale queries, download and parse the full variant summary file.
import urllib.request
import gzip, csv, io
# Full summary (tab-separated, ~300 MB compressed)
URL = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz"

# Stream and parse without full download
pathogenic_brca1 = []
# variant_summary repeats a variant once per genome assembly (e.g. GRCh37 and
# GRCh38), so remember which VariationIDs were already collected to avoid
# counting the same variant twice.
seen_variation_ids = set()
with urllib.request.urlopen(URL) as resp:
    with gzip.open(resp, "rt", encoding="utf-8") as f:
        reader = csv.DictReader(f, delimiter="\t")
        for row in reader:
            if row["GeneSymbol"] != "BRCA1" or "Pathogenic" not in row["ClinicalSignificance"]:
                continue
            variation_id = row["VariationID"]
            if variation_id in seen_variation_ids:
                continue
            seen_variation_ids.add(variation_id)
            pathogenic_brca1.append({
                "name": row["Name"],
                "clinsig": row["ClinicalSignificance"],
                "condition": row["PhenotypeList"],
                "rsid": row["RS# (dbSNP)"],
            })

print(f"Pathogenic BRCA1 variants: {len(pathogenic_brca1)}")
for v in pathogenic_brca1[:3]:
    # The file uses -1 when a variant has no dbSNP rsID; show "n/a" rather
    # than a bogus "rs-1".
    rs_label = f"rs{v['rsid']}" if v["rsid"] not in ("", "-1") else "n/a"
    print(f" {v['name']} | {v['clinsig']} | {rs_label}")
Query 5: Review Status and Conflicting Interpretations
Filter variants by review status (evidence quality) and find conflicts.
import requests
EMAIL = "your@email.com"
BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
# Stars correspond to review levels:
# 0 = no assertion criteria, 1 = criteria provided (single),
# 2 = criteria provided (multiple), 3 = expert panel, 4 = practice guideline
def search_by_review_stars(gene, min_stars=2):
"""Search for variants with at least min_stars review status."""
star_terms = {1: "criteria provided, single submitter",
2: "criteria provided, multiple submitters, no conflicts",
3: "reviewed by expert panel",
4: "p