dbSNP Database
Overview
NCBI dbSNP is the primary public repository for short human genetic variants, cataloguing over 1 billion SNPs, indels, and MNVs with allele frequencies, functional annotations, and cross-references to ClinVar, gnomAD, and 1000 Genomes. Variants are identified by stable rsIDs (reference SNP cluster IDs). Access is free via two APIs: the legacy NCBI E-utilities and the newer NCBI Variation Services REST API, which returns structured JSON.
When to Use
- Looking up allele frequencies and variant class for a known rsID
- Searching all dbSNP variants in a gene or chromosomal region by name or coordinates
- Resolving rsIDs to genomic coordinates (GRCh38/GRCh37) and HGVS notation
- Checking whether a variant of interest has clinical significance links to ClinVar entries
- Batch-fetching hundreds of rsIDs efficiently using epost+efetch history server
- Cross-referencing a list of variant positions to dbSNP rsIDs for downstream annotation
- For clinical pathogenicity classifications, use `clinvar-database` instead; dbSNP provides IDs and frequencies but not curated clinical significance
- For population frequencies stratified by ancestry, use `gnomad-database` instead; dbSNP MAF is a single aggregate frequency
Prerequisites
- Python packages: `requests`, `pandas`, `matplotlib`, `xml.etree.ElementTree` (stdlib)
- Data requirements: rsIDs (e.g., `rs80357906`), gene symbols, or chromosomal coordinates
- Environment: internet connection; NCBI Entrez email required for E-utilities (set the `email` parameter)
- Rate limits: 3 requests/second without an API key; 10 requests/second with a free NCBI API key. Register at https://www.ncbi.nlm.nih.gov/account/ and add `&api_key=YOUR_KEY` to all requests
pip install requests pandas matplotlib
# xml.etree.ElementTree is part of Python stdlib — no additional install needed
Quick Start
import requests
import json
# Contact address for NCBI requests; replace the placeholder with your own.
EMAIL = "your@email.com" # required by NCBI policy
# Root URL of the legacy NCBI E-utilities endpoints.
BASE_EUTILS = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
# Root URL of the NCBI Variation Services REST API; the /refsnp/{id} lookup
# below is resolved against it.
BASE_VARIATION = "https://api.ncbi.nlm.nih.gov/variation/v0"
def fetch_snp_by_rsid(rsid: str) -> dict:
    """Fetch a dbSNP record by rsID using the NCBI Variation Services API (structured JSON).

    Args:
        rsid: Reference SNP identifier, with or without the ``rs`` prefix
            (``"rs1800497"``, ``"RS1800497"``, ``"1800497"`` or ``1800497``).

    Returns:
        The decoded JSON body of the ``/refsnp/{id}`` response.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
    """
    # str.lstrip("rs") removes any run of leading 'r'/'s' characters rather
    # than the literal prefix, and misses "RS" or whitespace-padded input, so
    # normalise first and drop the prefix explicitly.
    rs_num = str(rsid).strip().lower().removeprefix("rs")
    r = requests.get(f"{BASE_VARIATION}/refsnp/{rs_num}", timeout=15)
    r.raise_for_status()
    return r.json()
# Example: look up rs1800497 (DRD2 Taq1A) and print two fields of the record.
record = fetch_snp_by_rsid("rs1800497")
print(f"rsID: rs{record['refsnp_id']}")
# variant_type is read from the nested primary_snapshot_data object; .get()
# yields None instead of raising if the key is absent.
print(f"Variant type: {record['primary_snapshot_data'].get('variant_type')}")
# Top-level keys: citations, create_date, dbsnp1_merges, last_update_build_id,
# last_update_date, lost_obs_movements, mane_select_ids, present_obs_movements,
# primary_snapshot_data, refsnp_id. (No top-level `organism` field.)
# Expected output:
# rsID: rs1800497
# Variant type: snv
Core API
Query 1: rsID Lookup via E-utilities
Fetch the full SNP record for a single rsID using efetch with db=snp. Returns an XML document with alleles, placements, and frequency data.
import requests
import xml.etree.ElementTree as ET
# Contact address sent as the `email` parameter on each E-utilities request.
EMAIL = "your@email.com"
# Root URL against which efetch.fcgi and esummary.fcgi are resolved below.
BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
def efetch_snp_xml(rsid: str) -> ET.Element:
    """Fetch the dbSNP document summary for a single rsID as parsed XML.

    The request uses efetch with ``rettype="docsum"``. Note that
    ``rettype="xml"`` returns a namespaced ExchangeSet, whereas
    ``rettype="docsum"`` returns the simpler eSummaryResult/DocumentSummary
    tree without namespaces.

    Args:
        rsid: Reference SNP identifier, with or without the ``rs`` prefix
            (``"rs80357906"``, ``"RS80357906"`` or ``"80357906"``).

    Returns:
        The root element of the parsed XML response.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
        xml.etree.ElementTree.ParseError: If the body is not well-formed XML.
    """
    # str.lstrip("rs") removes any run of leading 'r'/'s' characters rather
    # than the literal prefix, and misses "RS" or whitespace-padded input, so
    # normalise first and drop the prefix explicitly.
    rs_num = str(rsid).strip().lower().removeprefix("rs")
    r = requests.get(
        f"{BASE}/efetch.fcgi",
        params={
            "db": "snp",
            "id": rs_num,
            "rettype": "docsum",
            "retmode": "xml",
            "email": EMAIL,
        },
        timeout=20,
    )
    r.raise_for_status()
    # Hand the raw bytes to the parser so the XML document's own encoding
    # declaration is honoured instead of the charset requests guesses for
    # r.text.
    return ET.fromstring(r.content)
root = efetch_snp_xml("rs80357906")
# Parse the DocumentSummary record (MAF/MAFALLELE were removed in 2024;
# GLOBAL_MAFS is a sub-tree — use ESummary JSON below for easier access)
for docsum in root.iter("DocumentSummary"):
    # The numeric rsID is carried in the element's `uid` attribute.
    rs_id = docsum.get("uid")
    # findtext() falls back to the given default when the child is absent.
    snp_class = docsum.findtext("SNP_CLASS", "Unknown")
    chr_pos = docsum.findtext("CHRPOS", "N/A")
    clin_sig = docsum.findtext("CLINICAL_SIGNIFICANCE", "N/A")
    print(f"rs{rs_id} | Class: {snp_class} | Position: {chr_pos}")
    print(f" ClinSig: {clin_sig}")
# Expected output:
# rs80357906 | Class: delins | Position: 17:43057062
# ClinSig: pathogenic,risk-factor,uncertain-significance
# Fetch using ESummary for structured JSON (preferred for batch)
def esummary_snp(rsid: str) -> dict:
    """Fetch the ESummary JSON record for a single rsID.

    Args:
        rsid: Reference SNP identifier, with or without the ``rs`` prefix
            (``"rs80357906"``, ``"RS80357906"`` or ``"80357906"``).

    Returns:
        The per-record dict stored under the numeric ID inside the response's
        ``result`` object, or an empty dict when that ID is not present.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
        KeyError: If the JSON body has no ``result`` object.
    """
    # str.lstrip("rs") removes any run of leading 'r'/'s' characters rather
    # than the literal prefix, and misses "RS" or whitespace-padded input, so
    # normalise first and drop the prefix explicitly.
    rs_num = str(rsid).strip().lower().removeprefix("rs")
    r = requests.get(
        f"{BASE}/esummary.fcgi",
        params={
            "db": "snp",
            "id": rs_num,
            "retmode": "json",
            "email": EMAIL,
        },
        timeout=15,
    )
    r.raise_for_status()
    result = r.json()["result"]
    # Records are keyed by the numeric ID string that was requested.
    return result.get(rs_num, {})
rec = esummary_snp("rs80357906")
print(f"rs{rec.get('snp_id')}:")
print(f" Class : {rec.get('snp_class')}")  # e.g., 'delins'
# `maf`/`mafallele` were removed from ESummary in 2024 — use `global_mafs`
# (list of {study, freq}) and pick a study (e.g., 'GnomAD_genomes') or the
# global aggregate ('TOPMED'/'1000Genomes').
# Only the first four entries are printed to keep the output short.
for m in rec.get('global_mafs', [])[:4]:
    print(f" MAF[{m['study']:18s}]: {m['freq']}")
print(f" ChrPos : {rec.get('chrpos')}")  # 17:43057062
print(f" ClinSig : {rec.get('clinical_significance')}")
print(f" FxnClass : {rec.get('fxn_class')}")
Query 2: Gene Variant Search
Search dbSNP for all variants in a gene using esearch. Returns a list of rsIDs matching the gene.
import requests
# Contact address sent as the `email` parameter on each E-utilities request.
EMAIL = "your@email.com"
# Root URL against which esearch.fcgi is resolved below.
BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
def esearch_snp(query: str, retmax: int = 100) -> tuple[list, int]:
    """Run an ESearch query against the dbSNP database.

    Args:
        query: Entrez search expression, e.g. ``"BRCA1[gene] AND human[orgn]"``.
        retmax: Value sent as the ``retmax`` request parameter.

    Returns:
        A ``(id_list, total_count)`` tuple: the ``idlist`` entry of the
        response and its ``count`` entry converted to ``int``.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
    """
    search_params = {
        "db": "snp",
        "term": query,
        "retmax": retmax,
        "retmode": "json",
        "email": EMAIL,
    }
    response = requests.get(
        f"{BASE}/esearch.fcgi", params=search_params, timeout=15
    )
    response.raise_for_status()

    payload = response.json()["esearchresult"]
    id_list = payload["idlist"]
    total_count = int(payload["count"])
    return id_list, total_count
# Example 1: all variants in BRCA1. `total` is the count reported by the
# search, while `ids` is the returned ID list for a request with retmax=20.
ids, total = esearch_snp("BRCA1[gene] AND human[orgn]", retmax=20)
print(f"BRCA1 variants in dbSNP: {total:,} total")
# IDs come back as bare numeric strings, so the "rs" prefix is added for display.
print(f"First 5 rsIDs: {['rs' + i for i in ids[:5]]}")
# Example 2: only clinical variants (linked to ClinVar), selected by adding
# the clinsig[filter] term to the same gene query.
ids_clin, total_clin = esearch_snp(
"BRCA1[gene] AND human[orgn] AND clinsig[filter]", retmax=50)
print(f"BRCA1 variants with clinical significance: {total_clin:,}")
Query 3: Chromosomal Region Search
Search for all variants in a genomic region using chromosome coordinates.
import requests
# Contact address sent as the `email` parameter on each E-utilities request.
EMAIL = "your@email.com"
# Root URL against which esearch.fcgi is resolved below.
BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
def search_region(chrom: str, start: int, stop: int,
                  assembly: str = "GRCh38", retmax: int = 200) -> tuple[list, int]:
    """Find all dbSNP variants in a chromosomal region.

    Args:
        chrom: Chromosome name placed in the ``[CHR]`` search field (e.g. ``"1"``).
        start: First position of the range.
        stop: Last position of the range.
        assembly: Genome build the coordinates refer to. ``"GRCh38"``/``"hg38"``
            (default) searches the ``[CHRPOS]`` field; ``"GRCh37"``/``"hg19"``
            searches ``[CHRPOS37]``. Matching is case-insensitive and ignores
            surrounding whitespace and patch suffixes such as ``".p14"``.
        retmax: Value sent as the ``retmax`` request parameter.

    Returns:
        A ``(id_list, total_count)`` tuple: the ``idlist`` entry of the
        response and its ``count`` entry converted to ``int``.

    Raises:
        ValueError: If ``assembly`` is not a recognised GRCh37/GRCh38 name.
        requests.HTTPError: If the service answers with a 4xx/5xx status.
    """
    # Previously anything other than the exact string "GRCh37" (e.g. "grch37"
    # or "hg19") silently fell through to the GRCh38 field and returned
    # variants from the wrong coordinates; resolve the build explicitly.
    build = str(assembly).strip().lower()
    if build.startswith(("grch37", "hg19")):
        pos_field = "CHRPOS37"
    elif build.startswith(("grch38", "hg38")):
        pos_field = "CHRPOS"
    else:
        raise ValueError(
            f"Unsupported assembly {assembly!r}; expected 'GRCh38' or 'GRCh37'"
        )

    query = f"{chrom}[CHR] AND {start}:{stop}[{pos_field}]"
    r = requests.get(
        f"{BASE}/esearch.fcgi",
        params={
            "db": "snp",
            "term": query,
            "retmax": retmax,
            "retmode": "json",
            "email": EMAIL,
        },
        timeout=20,
    )
    r.raise_for_status()
    result = r.json()["esearchresult"]
    return result["idlist"], int(result["count"])
# Example: PCSK9 exon 4 region (GRCh38). `assembly` is left at its default
# ("GRCh38"), so the range is matched against the [CHRPOS] field.
ids, total = search_region("1", 55039700, 55040200)
print(f"Variants in chr1:55039700-55040200: {total:,} total")
# IDs come back as bare numeric strings, so the "rs" prefix is added for display.
print(f"Retrieved {len(ids)} rsIDs: {['rs' + i for i in ids[:5]]}")
Query 4: Variant Summary — MAF, Alleles, Clinical Significance
Retrieve structured summary data for variant records using ESummary, extracting MAF, alleles, and database cross-links.
impo