statsmodels

Overview

Statsmodels provides classical statistical modeling with rigorous inference for Python. It covers linear models, generalized linear models, discrete choice, time series, and comprehensive diagnostics. Unlike scikit-learn (prediction-focused), statsmodels emphasizes coefficient interpretation, p-values, confidence intervals, and model diagnostics.

When to Use

Fitting linear regression (OLS, WLS, GLS) with detailed coefficient tables and diagnostics
Running logistic regression with odds ratios and marginal effects for clinical/epidemiological studies
Analyzing count data with Poisson or negative binomial regression
Time series forecasting with ARIMA, SARIMAX, or exponential smoothing
Performing ANOVA, t-tests, or non-parametric tests with proper corrections
Testing model assumptions (heteroskedasticity, autocorrelation, normality of residuals)
Model comparison using AIC/BIC or likelihood ratio tests
Using R-style formula interface (y ~ x1 + x2 + C(group)) for intuitive model specification
For prediction-focused ML with cross-validation and hyperparameter tuning, use scikit-learn instead
For Bayesian modeling with posterior inference, use pymc instead

Prerequisites

Python packages: statsmodels, numpy, pandas, scipy
Optional: matplotlib (for diagnostic plots), patsy (for formula API, included with statsmodels)
Data: Tabular data as pandas DataFrames or NumPy arrays

pip install statsmodels numpy pandas matplotlib

Quick Start

import statsmodels.api as sm
import statsmodels.formula.api as smf
import pandas as pd
import numpy as np

# Generate sample data
np.random.seed(42)
n = 100
df = pd.DataFrame({
    "x1": np.random.randn(n),
    "x2": np.random.randn(n),
    "group": np.random.choice(["A", "B"], n)
})
df["y"] = 2 + 3 * df["x1"] - 1.5 * df["x2"] + np.random.randn(n)

# OLS with formula API (R-style)
results = smf.ols("y ~ x1 + x2 + C(group)", data=df).fit()
print(results.summary())
print(f"R²: {results.rsquared:.3f}, AIC: {results.aic:.1f}")

Core API

Module 1: Linear Regression (OLS, WLS, GLS)

Standard linear models with comprehensive diagnostics.

import statsmodels.api as sm
import numpy as np

# Generate data
np.random.seed(42)
X = np.random.randn(200, 3)
y = 1 + 2*X[:, 0] - 0.5*X[:, 1] + np.random.randn(200)

# ALWAYS add constant for intercept
X_const = sm.add_constant(X)
results = sm.OLS(y, X_const).fit()

print(results.summary())
print(f"\nCoefficients: {results.params}")
print(f"P-values: {results.pvalues}")
print(f"R²: {results.rsquared:.4f}")

# Predictions with confidence intervals
pred = results.get_prediction(X_const[:5])
print(pred.summary_frame())

# Robust standard errors (heteroskedasticity-consistent)
results_robust = sm.OLS(y, X_const).fit(cov_type="HC3")
print("Robust SEs:", results_robust.bse)

# Weighted Least Squares
weights = 1 / np.abs(results.resid + 0.1)  # Example weights
results_wls = sm.WLS(y, X_const, weights=weights).fit()
print(f"WLS R²: {results_wls.rsquared:.4f}")

Module 2: Generalized Linear Models (GLM)

Extend regression to non-normal outcomes (binary, count, continuous-positive).

import statsmodels.api as sm
import numpy as np

# Poisson regression for count data
np.random.seed(42)
X = np.random.randn(200, 2)
X_const = sm.add_constant(X)
y_counts = np.random.poisson(np.exp(0.5 + 0.3*X[:, 0]))

model = sm.GLM(y_counts, X_const, family=sm.families.Poisson())
results = model.fit()
print(results.summary())

# Rate ratios
rate_ratios = np.exp(results.params)
print(f"Rate ratios: {rate_ratios}")

# Check overdispersion
overdispersion = results.pearson_chi2 / results.df_resid
print(f"Overdispersion ratio: {overdispersion:.2f}")
if overdispersion > 1.5:
    print("→ Consider Negative Binomial model")

Module 3: Discrete Choice Models (Logit, Probit, Count)

Binary, multinomial, and count outcome models.

import statsmodels.api as sm
import numpy as np

# Logistic regression
np.random.seed(42)
X = np.random.randn(300, 2)
X_const = sm.add_constant(X)
prob = 1 / (1 + np.exp(-(0.5 + X[:, 0] - 0.5*X[:, 1])))
y_binary = np.random.binomial(1, prob)

logit_results = sm.Logit(y_binary, X_const).fit()
print(logit_results.summary())

# Odds ratios
odds_ratios = np.exp(logit_results.params)
print(f"Odds ratios: {odds_ratios}")

# Marginal effects (at means)
margeff = logit_results.get_margeff()
print(margeff.summary())

# Predicted probabilities
probs = logit_results.predict(X_const[:5])
print(f"Predicted P(Y=1): {probs}")

Module 4: Time Series (ARIMA, SARIMAX)

Univariate and multivariate time series modeling and forecasting.

import statsmodels.api as sm
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
import numpy as np
import pandas as pd

# Generate time series
np.random.seed(42)
dates = pd.date_range("2020-01-01", periods=200, freq="D")
y = np.cumsum(np.random.randn(200)) + 50
ts = pd.Series(y, index=dates)

# Stationarity test
adf_result = adfuller(ts)
print(f"ADF statistic: {adf_result[0]:.4f}, p-value: {adf_result[1]:.4f}")
print("Stationary" if adf_result[1] < 0.05 else "Non-stationary → difference")

# Fit ARIMA
model = ARIMA(ts, order=(1, 1, 1))
results = model.fit()
print(results.summary())

# Forecast with confidence intervals
forecast = results.get_forecast(steps=30)
forecast_df = forecast.summary_frame()
print(f"30-day forecast:\n{forecast_df.head()}")

# Seasonal ARIMA (SARIMAX)
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Monthly data with yearly seasonality
model_sarima = SARIMAX(ts, order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))
results_sarima = model_sarima.fit(disp=False)
print(f"AIC: {results_sarima.aic:.1f}")

# Diagnostic plots
results_sarima.plot_diagnostics(figsize=(12, 8))

Module 5: Statistical Tests and Diagnostics

Assumption tests, hypothesis tests, and model validation.

import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_breuschpagan, acorr_ljungbox
from statsmodels.stats.stattools import jarque_bera
import numpy as np

# Fit a model first
np.random.seed(42)
X = sm.add_constant(np.random.randn(200, 2))
y = 1 + 2*X[:, 1] + np.random.randn(200) * X[:, 1]  # Heteroskedastic
results = sm.OLS(y, X).fit()

# Heteroskedasticity test (Breusch-Pagan)
bp_stat, bp_p, _, _ = het_breuschpagan(results.resid, X)
print(f"Breusch-Pagan p-value: {bp_p:.4f} {'→ heteroskedastic' if bp_p < 0.05 else '→ OK'}")

# Normality test (Jarque-Bera)
jb_stat, jb_p, _, _ = jarque_bera(results.resid)
print(f"Jarque-Bera p-value: {jb_p:.4f} {'→ non-normal' if jb_p < 0.05 else '→ OK'}")

# Autocorrelation test (Ljung-Box)
lb_result = acorr_ljungbox(results.resid, lags=[10], return_df=True)
print(f"Ljung-Box p-value (lag 10): {lb_result['lb_pvalue'].values[0]:.4f}")

# Variance Inflation Factor (multicollinearity)
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif_data = pd.DataFrame({
    "Variable": [f"x{i}" for i in range(X.shape[1])],
    "VIF": [variance_inflation_factor(X, i) for i in range(X.shape[1])]
})
print(vif_data)  # VIF > 10 suggests multicollinearity

Module 6: Formula API (R-style)

Intuitive model specification using formulas with automatic dummy coding.

import statsmodels.formula.api as smf
import pandas as pd
import numpy as np

np.random.seed(42)
df = pd.DataFrame({
    "y": np.random.randn(100),
    "x1": np.random.randn(100),
    "x2": np.random.randn(100),
    "group": np.random.choice(["A", "B", "C"], 100),
})

# Formula with categoricals (auto dummy-coded)
res = smf.ols("y ~ x1 + x2 + C(group)", data=df).fit()
print(res.summary())

# Interactions
res2 = smf.ols("y ~ x1 * x2", data=df).fit()  # x1 + x2 + x1:x2
print(f"Interaction term p-value: {res2.pvalues['x1:x2']:.4f}")

# Logit via formula
df["binary"] = (df["y"] > 0).astyp

statsmodels-statistical-modeling

How to add

Drop this on your repo README

Related skills

claude-api

skill-creator

oh-my-issues

claude-mem

Get new Desenvolvimento skills every Monday