GWAS Catalog Database — SNP-Trait Association Queries
Overview
The NHGRI-EBI GWAS Catalog is a curated collection of published genome-wide association studies, mapping SNP-trait associations with genomic context. The REST API provides programmatic access to studies, associations, variants, traits, genes, and summary statistics. All responses are HAL+JSON with embedded _links for pagination.
When to Use
- Finding genetic variants associated with a disease or trait (e.g., "which SNPs are linked to type 2 diabetes?")
- Retrieving genome-wide significant associations for a specific variant (rs ID)
- Exploring the genetic architecture of complex traits (number of loci, effect sizes)
- Checking variant pleiotropy (how many traits a single SNP affects)
- Downloading summary statistics for meta-analysis or polygenic risk score construction
- Identifying published GWAS studies by disease, gene, or PubMed ID
- Cross-referencing EFO trait ontology terms with GWAS evidence
- Building candidate gene lists from GWAS association regions
- For drug target validation from GWAS hits, use `opentargets-database` instead
- For variant functional annotation (consequence prediction, regulatory impact), use Ensembl VEP via `gget` instead
Prerequisites
pip install requests matplotlib numpy
API access:
- No authentication required -- fully open access
- Rate limits: no official limit, but add `time.sleep(0.2)` between requests to be courteous
- Base URL: `https://www.ebi.ac.uk/gwas/rest/api`
- Response format: HAL+JSON with `_embedded` data and `_links` for pagination
- Pagination: default 20 results per page; max 500 via the `size` parameter
Quick Start
import requests
import time
BASE = "https://www.ebi.ac.uk/gwas/rest/api"
def gwas_get(endpoint, params=None, timeout=30):
    """Send one GET request to the GWAS Catalog REST API and return the JSON body.

    The helper fetches a single page only; it does not follow pagination
    links. Pass ``page``/``size`` in ``params`` to request other pages.

    Args:
        endpoint: Path relative to ``BASE``, without a leading slash
            (e.g. ``"studies/search/findByDiseaseTrait"``).
        params: Optional dict of query-string parameters.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The decoded JSON response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    url = f"{BASE}/{endpoint}"
    resp = requests.get(url, params=params or {}, timeout=timeout)
    resp.raise_for_status()
    # Fixed courtesy pause after every successful call so that loops built
    # on this helper are throttled without extra code at the call site.
    time.sleep(0.2)
    return resp.json()
# Keyword search over studies. A study record carries no top-level `title`:
# the publication title is nested at `publicationInfo.title` and the trait
# label at `diseaseTrait.trait`.
data = gwas_get("studies/search/findByDiseaseTrait", {"diseaseTrait": "diabetes"})
studies = data["_embedded"]["studies"]
print(f"Found {len(studies)} studies for 'diabetes'")
for study in studies[:3]:
    # Either nested object may be null, so fall back to an empty dict.
    publication = study.get("publicationInfo") or {}
    disease_trait = study.get("diseaseTrait") or {}
    title = publication.get("title", "N/A")
    trait = disease_trait.get("trait", "N/A")
    print(f" {study['accessionId']} | {trait[:40]:<40} | {title[:60]}")
Core API
Module 1: Study Search
Search GWAS studies by disease trait keyword or PubMed ID.
# Keyword search on the disease-trait field of study records.
data = gwas_get("studies/search/findByDiseaseTrait", {"diseaseTrait": "breast cancer"})
studies = data["_embedded"]["studies"]
for study in studies[:5]:
    pi = study.get("publicationInfo") or {}
    accession = study['accessionId']
    pmid = pi.get('pubmedId', 'N/A')
    short_title = pi.get('title', '')[:60]
    print(f" {accession} | PMID:{pmid} | {short_title}")
time.sleep(0.2)
# Lookup by PubMed ID. NOTE: the older `findByPubmedId` search returns 404
# under /studies/; `findByPublicationIdPubmedId` is the endpoint that works.
data = gwas_get("studies/search/findByPublicationIdPubmedId", {"pubmedId": "25673413"})
studies = data["_embedded"]["studies"]
print(f"Studies from PMID 25673413: {len(studies)}")
for study in studies:
    disease_trait = study.get("diseaseTrait") or {}
    trait = disease_trait.get("trait", "N/A")
    print(f" {study['accessionId']}: {trait}")
Module 2: Association Queries
Retrieve SNP-trait associations filtered by trait (EFO term), variant, or p-value.
# Associations by EFO trait. The old path `efoTraits/{shortForm}/associations`
# also works *if* you have the current shortForm — but trait shortForms have
# been re-mapped to MONDO (e.g. EFO_0000249 → MONDO_0004975). The most reliable
# path is `associations/search/findByEfoTrait?efoTrait=<canonical trait name>`.
data = gwas_get("associations/search/findByEfoTrait",
                {"efoTrait": "type 2 diabetes mellitus", "size": 50})
assocs = data["_embedded"]["associations"]
print(f"Associations for 'type 2 diabetes mellitus': {len(assocs)}")
for a in assocs[:5]:
    pval = a.get("pvalue")
    # Normalise once: `loci` may be missing, null, or an empty list.
    loci = a.get("loci") or []
    # Author-reported gene names are gathered across every locus.
    genes = []
    for locus in loci:
        genes.extend(gene.get("geneName", "")
                     for gene in locus.get("authorReportedGenes") or [])
    # rsIDs are taken from the first locus only. `snps` on a risk allele may
    # be missing, null, or empty; fall back to an empty record so the lookup
    # yields "N/A" instead of raising IndexError/TypeError.
    first_locus = loci[0] if loci else {}
    snps = [(r.get("snps") or [{}])[0].get("rsId", "N/A")
            for r in first_locus.get("strongestRiskAlleles") or []]
    print(f" rs={snps} | p={pval} | genes={genes}")
# All associations reported for one variant. NOTE: an association record does
# not carry its `efoTraits` inline; they are reachable only through the
# `_links.efoTraits.href` HAL link, which has to be followed (and cached if
# needed) to obtain trait names.
variant_query = {"size": 5}
data = gwas_get("singleNucleotidePolymorphisms/rs7903146/associations", variant_query)
assocs = data["_embedded"]["associations"]
n_first_page = len(assocs)
print(f"Associations for rs7903146 (first page): {n_first_page}")
def association_traits(assoc):
    """Resolve trait names for an association by following its HAL link.

    Association records expose their EFO traits via ``_links.efoTraits.href``
    rather than embedding them, so one extra HTTP request is made per call.
    The lookup is best-effort: a missing link or a non-2xx response yields an
    empty list rather than an exception.

    Args:
        assoc: One association record (dict) as returned by the API.

    Returns:
        List of the ``trait`` values of the linked EFO traits; ``[]`` when
        the link is absent/null or the request does not succeed.
    """
    # Guard every level of the chain: `_links`, the `efoTraits` entry, and
    # (below) `_embedded` may each be missing or null.
    links = assoc.get("_links") or {}
    href = (links.get("efoTraits") or {}).get("href")
    if not href:
        return []
    r = requests.get(href, timeout=15)
    if not r.ok:
        return []
    embedded = r.json().get("_embedded") or {}
    return [t.get("trait") for t in embedded.get("efoTraits") or []]
for assoc in assocs[:5]:
    # Each iteration issues one extra request to resolve trait names.
    traits = association_traits(assoc)
    p_value = assoc.get('pvalue')
    odds_ratio = assoc.get('orPerCopyNum', 'N/A')
    print(f" p={p_value} | OR={odds_ratio} | traits={traits}")
    # Short pause between the per-association link lookups.
    time.sleep(0.1)
Module 3: Variant Lookup
Query variant details by rsID, chromosomal region, or gene name (the cytogenetic-band search endpoint is no longer available).
# Lookup single variant
data = gwas_get("singleNucleotidePolymorphisms/rs7903146")
# `locations` can be missing, null, or an empty list; fall back to an empty
# record so the '?' placeholders are printed instead of raising IndexError.
loc = (data.get("locations") or [{}])[0]
print(f"rs7903146: chr{loc.get('chromosomeName', '?')}:{loc.get('chromosomePosition', '?')}")
print(f" Functional class: {data.get('functionalClass', 'N/A')}")
print(f" Merged into: {data.get('merged', 'N/A')}")
time.sleep(0.2)
# Search variants by chromosomal region
# NOTE(review): these look like GRCh37 coordinates for the TCF7L2 region,
# while the Catalog is believed to report GRCh38 positions — confirm the
# intended assembly before relying on this window.
data = gwas_get("singleNucleotidePolymorphisms/search/findByChromBpLocationRange",
                {"chrom": "10", "bpStart": "114750000", "bpEnd": "114800000", "size": 50})
snps = data["_embedded"]["singleNucleotidePolymorphisms"]
print(f"Variants in chr10:114750000-114800000: {len(snps)}")
for v in snps[:5]:
    print(f" {v['rsId']}: {v.get('functionalClass', 'N/A')}")
# Search variants by gene name (the cytogenetic-band endpoint
# `findByCytogeneticBand` was removed — use gene or chromosome-range instead).
data = gwas_get("singleNucleotidePolymorphisms/search/findByGene",
                {"geneName": "TCF7L2", "size": 5})
snps = data["_embedded"]["singleNucleotidePolymorphisms"]
print(f"Variants in TCF7L2: {len(snps)}")
Module 4: Trait Search
Browse and search EFO-mapped traits in the GWAS Catalog.
# Search traits by exact name (the older `findByDescription` endpoint was
# removed — `efoTraits/search/findByEfoTrait` expects the canonical trait label).
data = gwas_get("efoTraits/search/findByEfoTrait", {"trait": "Alzheimer disease"})
traits = data["_embedded"]["efoTraits"]
print(f"Traits matching 'Alzheimer disease': {len(traits)}")
for t in traits[:5]:
    print(f" {t['shortForm']}: {t['trait']} (uri={t['uri']})")
time.sleep(0.2)
# Get specific trait by shortForm. NOTE: many legacy EFO IDs have been
# re-mapped to MONDO (e.g. old `EFO_0000249` for Alzheimer is now
# `MONDO_0004975` — `efoTraits/EFO_0000249` returns 404). Resolve via search
# above first, then use the current shortForm.
# An exact-label search can legitimately return nothing, so guard the
# `traits[0]` access instead of letting it raise IndexError.
if traits:
    short_form = traits[0]["shortForm"]  # e.g. 'MONDO_0004975'
    data = gwas_get(f"efoTraits/{short_form}")
    print(f"Trait: {data['trait']}")
    print(f" URI : {data['uri']}")
    print(f" shortForm : {data['shortForm']}")
else:
    print("No traits matched 'Alzheimer disease'; skipping shortForm lookup")
Module 5: Summary Statistics
Access study-level summary statistics for downstream analysis (m