\documentclass[]{beamer}
%\documentclass[handout]{beamer}
% ***************************************************************
% for handout, change only this...
% \documentclass[twocolumn]{article}
% \usepackage{beamerarticle}
% \setlength{\textwidth}{7.5in}
% \setlength{\textheight}{9.8in}
% \setlength{\topmargin}{-1in}
% \setlength{\oddsidemargin}{-.52in}
% \setlength{\evensidemargin}{-.52in}
%\usepackage{beamerprosper}
%\usetheme{Warsaw}
%\usecolortheme{orchid}
\usepackage{graphicx}
\usepackage{amsmath,amssymb,array,comment,eucal}
\input{macros}

\title{Bayesian Model Averaging}
\author{Hoff Chapter 9, Liang et al 2007, Hoeting et al (1999), Clyde \& George (2004) Statistical Science}
\date{\today}

%\Logo(-1.9,7.3){\includegraphics[width=.5in]{../eps/duke}}
% Optional: text to put in the bottom of each slide.
% By default, the title of the talk will be placed there.
%\slideCaption{\textit{October 28, 2005 }}

\begin{document}

% make the title slide
\maketitle

\begin{frame}\frametitle{Prior Distributions}
\begin{itemize}
\item Bayesian model choice requires proper prior distributions on parameters that are not common across models \pause
\item Vague but proper priors may lead to paradoxes! \pause
\item Conjugate Normal-Gamma priors lead to closed-form expressions for marginal likelihoods; Zellner's g-prior is the most popular. \pause
\end{itemize}
\end{frame}

\begin{frame}\frametitle{Prior Distributions}
\begin{itemize}
\item ``Spike and Slab'' -- Lempers (1971), Mitchell \& Beauchamp (1988) \pause
\item ``Spike and Bell'' -- Leamer (1978) in BMA \pause
\item Mixture of 2 normals -- concentrated and dispersed -- SSVS Gibbs sampler of George \& McCulloch (1993) \pause
\item Back to ``Spike and Bell'': $MC^3$ of Raftery, Madigan \& Hoeting (1997) and George \& McCulloch (1997), collapsed MCMC after integrating out $\bg$
\item Conjugate Normal-Gamma priors lead to closed-form expressions for marginal likelihoods; Zellner's g-prior is the most popular. \pause
\end{itemize}
\end{frame}

\begin{frame}\frametitle{Zellner's g-prior}
Centered model:
$$\Y = \1 \alpha + \X^c \b + \epsilon$$
where $\X^c$ is the centered design matrix, in which every predictor has had its mean subtracted \pause
\begin{itemize}
\item $p(\alpha) \propto 1$ \pause
\item $p(\sigma^2) \propto 1/\sigma^2$ \pause
\item $\b_\gamma \mid \alpha, \sigma^2, \g \sim \N(0, g \sigma^2 ({\Xg^c}^\prime \Xg^c)^{-1})$ \pause
%\item take $g=n$
\end{itemize}
which leads to a marginal likelihood of $\Mg$ that is proportional to
$$ p(\Y \mid \Mg) = C (1 + g)^{\frac{n-1-\pg}{2}} ( 1 + g (1 - R^2_\gamma))^{- \frac{(n-1)}{2}}$$
where $R^2_\gamma$ is the usual coefficient of determination for model $\Mg$.
\pause

Trade-off of model complexity versus goodness of fit

\bigskip
Lastly, assign a distribution to the space of models
\end{frame}
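
\begin{frame}[fragile]\frametitle{Complexity vs.\ Fit, Numerically}
A small numerical sketch of the trade-off above (not part of \texttt{BAS}; the function and the values of $n$, $\pg$, and $R^2_\gamma$ are made up for illustration):
<<>>=
# g-prior marginal likelihood of M_gamma, up to the constant C,
# as a function of R^2 and model size p.g (illustrative values only)
marg.g = function(R2, p.g, n = 41, g = n)
  (1 + g)^((n - 1 - p.g)/2) * (1 + g * (1 - R2))^(-(n - 1)/2)
marg.g(R2 = 0.60, p.g = 3)
marg.g(R2 = 0.61, p.g = 4)  # extra predictor, tiny gain in fit: goes down
marg.g(R2 = 0.70, p.g = 4)  # extra predictor, larger gain in fit: goes up
@
\end{frame}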
\begin{frame}{Sketch}
\begin{itemize}
\item Integrate out $\bg$ using sums of normals \pause
\item Find the inverse of $\I_n + g \P_{\Xg}$ (properties of projections) \pause
\item Find the determinant of $\phi (\I_n + g \P_{\Xg})$ \pause
\item Integrate out the intercept (normal) \pause
\item Integrate out $\phi$ (gamma) \pause
\item Algebra to simplify the quadratic forms in terms of $R^2_{\g}$
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Priors on Model Space}
$p(\Mg) \Leftrightarrow p(\g)$
\begin{itemize}
\item $p(\gamma_j = 1) = .5 \Rightarrow P(\Mg) = .5^p$: uniform over the space of models \pause with $\pg \sim \Bin(p, .5)$
\item If $\gamma_j \mid \pi \simiid \Ber(\pi)$ and $\pi \sim \Be(a,b)$, then $\pg \sim \BB_p(a, b)$:
$$ p(\pg \mid p, a, b) = \frac{ \Gamma(p + 1) \Gamma(\pg + a) \Gamma(p - \pg + b) \Gamma (a + b) }{\Gamma(\pg+1) \Gamma(p - \pg + 1) \Gamma(p + a + b) \Gamma(a) \Gamma(b)} $$
\item $\pg \sim \BB_p(1, 1)$, i.e.\ $\Un\{0, 1, \ldots, p\}$, uniform on the model size
\end{itemize}
\end{frame}

\begin{frame}[fragile]
\frametitle{USair Data}
<<>>=
library(BAS)
data(usair, package="HH")
poll.bma = bas.lm(log(SO2) ~ temp + log(mfgfirms) + log(popn) +
                    wind + precip + raindays,
                  data=usair,
                  prior="g-prior",
                  alpha=nrow(usair),      # g = n
                  n.models=2^6,           # enumerate all models
                  modelprior=uniform(),
                  method="deterministic")
@
% Marginal Posterior Inclusion Probabilities:
% Intercept   temp   log(mfgfirms)   log(popn)   wind   precip   raindays
%    1.0000  0.9755          0.7190      0.2757 0.7654   0.5994     0.3104
\end{frame}

\begin{frame}[fragile]\frametitle{Summary}
\begin{small}
<<>>=
poll.bma
@
\end{small}
\end{frame}

\begin{frame}[fragile]\frametitle{Plots}
\centering
<<>>=
par(mfrow=c(2,2))
plot(poll.bma, ask=F)
@
\end{frame}

\begin{frame}\frametitle{Posterior Distribution with Uniform Prior on Model Space}
<<echo=FALSE>>=
image(poll.bma, rotate=FALSE)
@
\end{frame}

\begin{frame}[fragile]\frametitle{Posterior Distribution with BB(1,1) Prior on Model Space}
<<>>=
poll.bb.bma = bas.lm(log(SO2) ~ temp + log(mfgfirms) + log(popn) +
                       wind + precip + raindays,
                     data=usair,
                     prior="g-prior",
                     alpha=nrow(usair),
                     n.models=2^6,     # enumerate
                     modelprior=beta.binomial(1,1))
@
\end{frame}

\begin{frame}\frametitle{BB(1,1) Prior on Model Space}
<<echo=FALSE>>=
image(poll.bb.bma, rotate=FALSE)
@
\end{frame}

\begin{frame}
\frametitle{Jeffreys Scale of Evidence}
\begin{itemize}
\item Bayes factor = ratio of marginal likelihoods \pause
\item Posterior odds = Bayes factor $\times$ prior odds \pause
\end{itemize}
$B = BF[\M_0 : \Mg]$ and $1/B = BF[\Mg : \M_0]$ \pause
\vspace{14pt}

\begin{tabular}{|r|l|}
\hline \hline
Bayes Factor & Interpretation \\
\hline
$B \geq 1$ & $H_0$ supported \\
$1 > B \geq 10^{-\frac{1}{2}}$ & minimal evidence against $H_0$ \\
$10^{-\frac{1}{2}} > B \geq 10^{-1}$ & substantial evidence against $H_0$ \\
$10^{-1} > B \geq 10^{-2}$ & strong evidence against $H_0$ \\
$10^{-2} > B$ & decisive evidence against $H_0$ \\
\hline \hline
\end{tabular}
\pause
\vspace{24pt}

In the context of testing a single hypothesis with equal prior odds; Kass \& Raftery (JASA 1995)
\end{frame}

\begin{frame}[fragile]
\frametitle{Coefficients}
<<>>=
beta = coef(poll.bma)
par(mfrow=c(2,3))
plot(beta, subset=2:7, ask=F)
@
\end{frame}

\begin{frame}
\frametitle{Bartlett's Paradox}
The Bayes factor for comparing $\Mg$ to the null model:
$$ BF(\Mg : \M_0) = (1 + g)^{(n - 1 - \pg)/2} (1 + g(1 - R^2))^{-(n-1)/2} $$
\pause
\begin{itemize}
\item For fixed sample size $n$ and $R^2$, consider taking values of $g$ that go to infinity \pause
\item Increasing vagueness in the prior \pause
\item What happens to the BF as $g \to \infty$?
\end{itemize}
\end{frame}
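
\begin{frame}[fragile]\frametitle{Bartlett's Paradox, Numerically}
A quick numerical answer to the question above (a made-up sketch, not \texttt{BAS} output; the values of $n$, $\pg$, and $R^2$ are illustrative):
<<>>=
# Bayes factor BF(M_gamma : M_0) under the g-prior for fixed n and R^2
bf.g = function(g, R2 = 0.6, n = 41, p.g = 3)
  (1 + g)^((n - 1 - p.g)/2) * (1 + g * (1 - R2))^(-(n - 1)/2)
# as g grows the Bayes factor eventually decays to 0, so an ever-vaguer
# prior ends up favoring the null model no matter what the data say
sapply(c(1, 10, 10^3, 10^6, 10^9), bf.g)
@
\end{frame}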
\begin{frame}
\frametitle{Information Paradox}
The Bayes factor for comparing $\Mg$ to the null model:
$$ BF(\Mg : \M_0) = (1 + g)^{(n - 1 - \pg)/2} (1 + g(1 - R^2))^{-(n-1)/2} $$
\pause
\begin{itemize}
\item Let $g$ be a fixed constant and take $n$ fixed. \pause
\item Let $F = \frac{R_{\g}^2/\pg}{(1 - R_{\g}^2)/(n - 1 - \pg)}$, the usual $F$ statistic for comparing model $\Mg$ to $\M_0$ \pause
\item As $R^2_{\g} \to 1$, $F \to \infty$, so the LR test would reject $\M_0$ \pause
\item The BF converges to the fixed constant $(1+g)^{(n - 1 - \pg)/2}$ (it does not go to infinity)
\end{itemize}
``Information inconsistency''; see Liang et al (JASA 2008)
\end{frame}

\begin{frame}
\frametitle{Mixtures of $g$ priors \& Information consistency}
Need $BF \to \infty$ as $R^2_{\g} \to 1$ $\Leftrightarrow$ $\E_g[(1 + g)^{(n - 1 - \pg)/2}]$ diverges for $\pg < n - 1$ (proof in Liang et al) \pause
\begin{itemize}
\item Zellner-Siow Cauchy prior \pause
\item hyper-g prior or hyper-g/n (Liang et al JASA 2008) \pause
\item robust prior (Bayarri et al, Annals of Statistics 2012) \pause
\end{itemize}
All have tails that behave like a Cauchy distribution
\end{frame}

\section{Mortality}

\begin{frame}[fragile]
\frametitle{Mortality \& Pollution}
\begin{itemize}
\item Data on 60 cities from the Statistical Sleuth, exercise 12.17 \pause
\item Response: Mortality \pause
\item 15 predictors, including measures of the pollutants HC, NOX, and SO2 \pause
\item Is pollution associated with mortality after adjusting for other socio-economic and meteorological factors? \pause
\item 15 predictor variables implies $2^{15} = 32{,}768$ possible models
\end{itemize}
<<>>=
data(ex1217, package="Sleuth3")
library(dplyr)
mortality = mutate(ex1217,
                   logHC = log(HC), logNOX = log(NOX), logSO2 = log(SO2)) %>%
  select(-CITY, -HC, -NOX, -SO2)
@
\end{frame}

\begin{frame}[fragile]{Jeffreys Zellner-Siow Cauchy Prior}
\begin{itemize}
\item Jeffreys ``reference'' prior on $\alpha$ and $\sigma^2$
\item Zellner-Siow Cauchy prior
\begin{align*}
1/g & \sim G(1/2, n/2) \\
\bg \mid g, \sigma^2 & \sim N(0, g \sigma^2 (\Xg^T\Xg)^{-1}) \\
\Rightarrow \bg \mid \sigma^2 & \sim C(0, n \sigma^2 (\Xg^T\Xg)^{-1})
\end{align*}
\end{itemize}
<<>>=
mort.bma = bas.lm(Mortality ~ ., data=mortality,
                  prior="JZS", alpha=1,
                  n.models=2^15, initprobs="eplogp",
                  method='BAS')
@
\end{frame}

\begin{frame}[fragile]{Posterior Plots}
<<>>=
par(mfrow=c(2,2))
plot(mort.bma, ask=FALSE)
@
\end{frame}
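
\begin{frame}[fragile]\frametitle{Posterior Inclusion Probabilities}
Before turning to the pollution question, the marginal posterior inclusion probabilities can be read off the fitted object (a sketch assuming the \texttt{probne0} and \texttt{postprobs} components of a \texttt{bas} fit):
<<>>=
# marginal P(gamma_j = 1 | Y) for each predictor, averaging over models
round(mort.bma$probne0, 2)
# posterior probabilities of the five highest-probability models
sort(mort.bma$postprobs, decreasing=TRUE)[1:5]
@
\end{frame}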
\begin{frame}[fragile]
\frametitle{Posterior Probabilities}
\begin{itemize}
\item What is the probability that there is no pollution effect? \pause
\item Sum the posterior model probabilities over all models that include at least one pollution variable \pause
<<>>=
models = list2matrix.which(mort.bma)
poll.inclusion = (models[, 14:16] %*% rep(1, 3)) > 0
prob.poll = sum(poll.inclusion * mort.bma$postprobs)
prob.poll
@
\pause
\item Posterior probability of no effect is \Sexpr{round(1 - prob.poll, 4)} \pause
\item Odds that there is an effect: \Sexpr{round(prob.poll, 4)}/\Sexpr{round(1 - prob.poll, 4)} = \Sexpr{round(prob.poll/(1 - prob.poll), 4)} \pause
\item Prior odds: $(1 - .5^3)/.5^3 = 7$ \pause
\end{itemize}
\end{frame}

\begin{frame}\frametitle{Model Space}
<<echo=FALSE>>=
image(mort.bma)
@
\end{frame}

\begin{frame}\frametitle{Coefficients}
\includegraphics[height=3.5in]{mort-beta1}
\end{frame}

\begin{frame}\frametitle{Coefficients}
<<echo=FALSE>>=
mort.beta = coef(mort.bma)
par(mfrow=c(2,3))
plot(mort.beta, subset=c(2, 4, 8, 14:16), ask=FALSE)
@
\end{frame}

\begin{frame}
\frametitle{Effect Estimation}
\begin{itemize}
\item Coefficients in each model are adjusted for the other variables in that model
\item OLS: leaving out a predictor with a non-zero coefficient biases the estimates!
\item Model selection in the presence of high correlation may leave out ``redundant'' variables;
\item improved MSE for prediction (bias--variance trade-off)
\item In BMA all variables are included, but coefficients are shrunk toward 0
\item Take care with ``causal'' questions and confounder adjustment!
\end{itemize}
\end{frame}

\begin{frame}\frametitle{Other Problems}
\begin{itemize}
\item Computational: \pause if $p > 35$, enumeration is difficult \pause
\begin{itemize}
\item Gibbs sampler or random-walk algorithm on $\g$ \pause
\item poor convergence/mixing with high correlations \pause
\item Metropolis--Hastings algorithms give more flexibility \pause
\item ``Stochastic Search'' (no guarantee that samples represent the posterior) \pause
\end{itemize}
\item Prior choice: choice of prior distributions on $\b$ and on $\g$ \pause
\end{itemize}
\bigskip
Model averaging versus model selection -- what are the objectives?
\end{frame}
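
\begin{frame}[fragile]\frametitle{Sampling Models Instead of Enumerating}
A sketch of the sampling approach mentioned on the previous slide (not part of the original analysis; it assumes the \texttt{method="MCMC"} and \texttt{MCMC.iterations} arguments of \texttt{bas.lm}, and the chunk is not evaluated here):
<<eval=FALSE>>=
# when p is too large to enumerate, sample models with MCMC instead
mort.mcmc = bas.lm(Mortality ~ ., data=mortality, prior="JZS", alpha=1,
                   method="MCMC", MCMC.iterations=10^5)
# rough convergence check: compare sampled inclusion probabilities
# with those from the enumerated fit
cbind(MCMC=round(mort.mcmc$probne0, 2), BAS=round(mort.bma$probne0, 2))
@
\end{frame}

\end{document}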