bic.glm <- function (x, y, glm.family, wt=rep(1,nrow(x)), strict=F, prior.param=c(rep(0.5,ncol(x))),OR=20, OR.fix=2, nbest=150, dispersion=NULL, factor.include.all=T, factor.prior.adjust=F) { ### S-PLUS function to apply Bayesian Model Averaging to variable selection for ### generalized linear models. Bayesian Model Averaging accounts for the model ### uncertainty inherent in the variable selection problem by averaging over the ### best models in themodel class according to posterior model probability. ### For more details see references at the end of the documentation. ### Developer: Chris Volinsky (volinsky@stat.washington.edu) ### ### This software is not formally maintained, but I will be happy to ### hear from people who have problems with it, although I cannot guarantee to ### to solve them! ### ### Permission is hereby given to StatLib to redistribute this software. ### The software can be freely used for non-commercial purposes, and can ### be freely distributed for non-commercial purposes only. ### The copyright is retained by the developer. ### Copyright 1996, 1997 Chris T. Volinsky ### n = total number of data points, p = number of independent variables ### ### Inputs ### x n x p data frame or matrix of independent variables (may include factors) ### y response variable (n-vector) ### glm.family distribution family, as defined by the S-PLUS function glm. ### Choices include gaussian, poisson, Gamma, inverse.gaussian, ### binomial and quasi. See Chambers/Hastie (1993) ### wt weight vector for the observations (length n) ### strict = T will return a more parsimonious set of models (Occam's Window). ### Any model with a more likely submodel will be eliminated. ### = F (default). Returns all models within 1/OR in posterior model prob. ### prior.param p-vector of prior probabilities that parameter is non-zero. Default puts ### a prior of .5 on all parameters. Setting to 1 forces variables into the ### model. ### OR maximum ratio for excluding models in the averaging window ### (default 20). The function will average over a ### set of models including the best model and all ### models with a posterior model probability of at ### least 1/OR of the best model. ### OR.fix Width of the window which keeps models after the leaps ### approximation is done. Because the leaps and bounds gives only an ### approximation to BIC, there is a need to increase the window at this ### first "cut" so as to assure that no good models are deleted. The ### level of this cut is at 1/(OR^OR.fix); the default value for OR.fix is 2. ### nbest sets the nbest parameter of the S-PLUS leaps function. The default is 150, ### this is a conservative value which makes the probability of "losing" a ### good model slim. Reducing this value could increase run time, but there ### is a slight risk of losing valuable models. ### dispersion = T Estimate the dispersion parameter ### = F Set the dispersion parameter = 1 ### Default = F for glm.family = Poisson or binomial, T otherwise. ### factor.include.all ### How should the function model categorical variables of class "factor"? ### A categorical variable with d levels is turned into (d-1) dummy variables ### using a treatment contrast. If factor.include.all = T, ### models will contain either all or none of these dummy variables. ### If factor.include.all = F, models are free to select the dummy ### variables independently. In this case, factor.prior.adjust ### determines the prior on these variables. ### factor.prior.adjust ### If factor.include.all=F, this parameter determines the prior on the d-1 ### dummy variables. When factor.prior.adjust=F, all dummy variables for ### variable i have prior equal to prior.param[i] (or pp[i]). Note that ### this makes the prior probability of *any* of these variables much ### higher than pp[i]. Setting factor.prior.adjust=T corrects for this so ### that the prior probability on the union of the dummies equal to pp[i], ### (and hence the deletion of the factor has a prior probability of 1-pp[i]). ### This adjustment changes the individual priors on each dummy variable to ### 1-(1-pp[i])^(1/(k-1)), where k is the number of levels of the factor. ### Outputs: ### postprob posterior probabilities of the models selected ### label labels identifying the models selected ### dev deviance for each model selected ### df degrees for each model selected ### bic BIC = dev/disp - df * log(n). Smaller BIC means higher posterior model ### probability. ### (Note: BIC returned does not take into account model priors, ### so rank order of BIC may not be the same as the rank ### order of postprob if non-uniform parameter priors are ### specified or if factors are used and factor.include.all = F) ### size the number of independent variables in each of the models ### which a logical matrix of size length(label) x size ### probne0 posterior probability that each variable is non-zero (in %) ### [Note: in the special case where factor.include.all=F, and ### factor.prior.adjust=T, probne0 also returns the posterior probability ### that the *factor* is nonzero, i.e. the posterior probability that any ### one of the factor variables is in the model.] ### postmean model-averaged posterior mean of the parameters (includes averaged-in ### zeros) ### postsd model-averaged posterior sd of the parameters (includes averaged-in zeros). ### mle n x p matrix giving the maximum likelihood estimate of each coefficient ### for each model. ### se n x p matrix giving the standard errors of each coefficient for each model. ### prior.param the priors on the parameters. Will be different than input if factors ### are used and factor.include.all =F. ### prior.model.weights ### If priors on variables are not all .5, then prior weights gives the ratio ### of the model prior to a uniform model prior, based on prior.param. A ### prior.model.weight > 1 means the model is weighed more than uniform ### family Distributional family used in the GLM. ### disp Estimated dispersion parameter (1 if dispersion = F) ### namesx string of variable names ### References: ### Raftery, Adrian E. (1995). Bayesian model selection in social research ### (with Discussion). Sociological Methodology 1995 (Peter V. Marsden, ed.), ### pp. 111-196, Cambridge, Mass.: Blackwells. ### An earlier version, issued as Working Paper 94-12, Center for Studies in Demography and ### Ecology, University of Washington (1994) is available as a Postscript file at ### http://www.stat.washington.edu/tech.reports/bic.ps ### It is also available via ftp using the following commands: ### ftp ftp.stat.washington.edu (or 128.95.17.34) ### login as anonymous ### enter your email address as your password ### ftp>cd /pub/tech.reports ### ftp>get bic.ps ### ftp>bye ### ### Volinsky, C.T., Madigan, D., Raftery, A.E. and Kronmal, R.A. (1997). ### "Bayesian Model Averaging in Proportional Hazard Models: Assessing the Risk ### of a Stroke." Applied Statistics (46). Available at: ### http://www.stat.washington.edu/volinsky/vmrk.ps ### ### Furnival, G.M. and R.W. Wilson (1974) "Regression by leaps and bounds" ### Technometrics 16, 499-511. leaps.glm_function(info,coef,names.arg,nbest=nbest){ names.arg <- names.arg ### evaluate names.arg before cbind if(is.null(names.arg)) names.arg <- c(as.character(1:9), LETTERS, letters)[1:ncol(info)] if(length(names.arg) < ncol(info)) stop("Too few names") ## Start changing things bIb <- coef %*% info %*% coef kx <- ncol(info) maxreg <- nbest * kx if(kx < 3) stop("Too few independent variables") ### if(kx >= 31) ### stop("Problem too large") imeth <- 1 # Corresponds to R^2 df <- kx + 1 # df not important for r2 Ib <- info %*% coef rr <- cbind(info, Ib) rr <- rbind(rr, c(Ib, bIb)) ans <- .Fortran("leaps", ### Fortran routine as.single(rr), as.integer(kx), as.integer(kx + 1), as.integer(df), as.integer(imeth), as.integer(nbest), as.single(0.0001), regid = integer(maxreg), Cp = single(maxreg), size = integer(maxreg), nreg = integer(1), single((nbest + 4) * (kx + 1) + (((kx + 1) * (kx + 2))/2) * ((2 * (kx + 3))/3 + 1)), integer(4 * (kx + 1)^2 + 8 * (kx + 1) + (nbest + 1) * (kx + 1)) )[c("Cp", "size", "nreg", "regid")] ### cleanup Fortran output; fix Cp, r2 and adjr2. nreg <- ans$nreg Cp <- ans$Cp length(Cp) <- nreg size <- ans$size length(size) <- nreg which <- matrix(T, nreg, kx) z <- ans$regid length(z) <- nreg ### Rick Becker's mask function to create which and then labels which <- matrix(as.logical((rep.int(z, kx) %/% rep.int(2^((kx - 1):0), rep.int(length(z), kx))) %% 2), byrow = F, ncol = kx) label <- character(nreg) sep <- if(all(nchar(names.arg) == 1)) "" else "," for(i in 1:nreg) label[i] <- paste(names.arg[which[i, ]], collapse = sep) ans <- list(Cp, size = size, label = label, which = which) names(ans)[1] <- "r2" ans } # end of leaps.glm options(contrasts=c("contr.treatment","contr.treatment")) x_data.frame(x) prior.weight.denom <- .5^ncol(x) names.arg <- names(x) if (is.null(names.arg)) names.arg <- paste("X",1:ncol(x),sep="") ### If there are more than 30 columns in x, reduce to 30 using backwards ### elimination. if(ncol(x) > 30) { while(ncol(x)>30) { x.df_data.frame(x=x,y=y,wt=wt) glm.out <- glm(y~.-wt,data=x.df,family=glm.family, weights=wt) coef <- glm.out$coef p_glm.out$rank R <- glm.out$R rinv <- diag(p) rinv <- backsolve(R, rinv) rowlen <- drop(((rinv^2) %*% rep(1, p))^0.5) sigx_rowlen %o% 1 pvec <- signif(1 - pchisq((coef/sigx)^2, 1), 3) x <- x[, - rev(order(pvec))[1]] } } ### Remove NA's x2 <- na.omit(x) used <- match(row.names(x), row.names(x2)) omitted <- seq(nrow(x))[is.na(used)] if(length(omitted) > 0) { wt <- wt[ - omitted] x <- x2 y <- y[ - omitted] n <- n[ - omitted] warning(paste("There were ",length(omitted),"records deleted due to NA's")) } leaps.x <- x output.names <- names(x) ### locate factors and count factor levels fn <- factor.names(x) factors_!all(unlist(lapply(fn,is.null))) x.df_data.frame(x=x,y=y,wt=wt) glm.out <- glm(y~.-wt,family=glm.family,weights=wt,data=x.df) glm.assign_glm.out$assign fac.levels <- unlist(lapply(glm.assign,length)[-1]) if(factors) { cdf <- cbind.data.frame(y=y,x) mm <- model.matrix(formula(cdf),data=cdf)[,-1,drop=F] mmm <- data.frame(matrix(mm,nrow=nrow(mm),byrow=F)) names(mmm) <- dimnames(mm)[[2]] output.names <- names(mmm) if (factor.include.all) { for (i in 1:length(names(x))){ if (!is.null(fn[[i]])){ nx <- names(x)[i] coefs_glm.out$coef[glm.out$assign[[i+1]]] old.vals_x[,i] new.vals_c(0,coefs) new.vec_as.vector(new.vals[match(old.vals,fn[[i]])]) leaps.x[,nx]_new.vec } } } else ## factor.include.all = F { new.prior_NULL for (i in 1:length(names(x))){ addprior_prior.param[i] if (!is.null(fn[[i]])) { k_length(fn[[i]]) if (factor.prior.adjust) addprior_rep(1-(1-prior.param[i])^(1/(k-1)),k-1) else addprior_rep(prior.param[i],k-1) } new.prior_c(new.prior,addprior) } prior.param_new.prior x <- leaps.x <- mmm } } if (!factor.include.all && !factor.prior.adjust) prior.weight.denom <- .5^ncol(x) ### Start function x.df_data.frame(x=leaps.x,y=y,wt=wt) glm.out <- glm(y~.-wt,family=glm.family,weights=wt,data=x.df) famname <- glm.out$family["name"] if (is.null(dispersion)){ if (famname=="Poisson"|famname=="Binomial") dispersion <- F else dispersion <- T } nobs <- length(y) resid <- resid(glm.out,"pearson") rdf <- glm.out$df.resid is.wt_!all(wt==rep(1,nrow(x))) if (is.wt){ wt <- wt^0.5 resid <- resid * wt excl <- wt==0 if (any(excl)) { warning(paste(sum(excl),"rows with zero wts not counted")) resid <-resid[!excl] } } phihat <- sum(resid^2)/rdf if (dispersion) disp <- phihat else disp <- 1 coef <- glm.out$coef[-1] p <- glm.out$rank R <- glm.out$R rinv <- diag(p) rinv <- backsolve(R, rinv) rowlen <- drop(((rinv^2) %*% rep(1, p))^0.5) sigx_rowlen %o% sqrt(disp) correl_rinv %*% t(rinv) * outer(1/rowlen,1/rowlen) cov_correl*sigx%*%t(sigx) info <- solve(cov[-1,-1]) if (ncol(x)>2) { a <- leaps.glm(info,coef,names.arg=names(leaps.x),nbest=nbest) a$r2 <- pmin(pmax(0, a$r2), 99.9) # Include the null model a$r2 <- c(0, a$r2)/100 a$size <- c(0, a$size) a$label <- c("NULL", a$label) a$which <- rbind(rep(F, ncol(x)), a$which) nmod <- length(a$size) prior.mat <- matrix(rep(prior.param, nmod), nmod, ncol(x), byrow = T) prior <- apply(a$which * prior.mat + (!a$which) * (1 - prior.mat), 1, prod) bIb <- as.numeric(coef %*% info %*% coef) lrt <- bIb - (a$r2 * bIb) bic <- lrt + (a$size) * log(nobs) - 2 * log(prior) occam <- bic - min(bic) < 2 * OR.fix * log(OR) # From MUS p.24 with OR=B_12 size <- a$size[occam] label <- a$label[occam] which <- a$which[occam, ,drop=F] bic <- bic[occam] prior <- prior[occam] } # end if statement else { # ncol <= 2 nmod <- switch(ncol(x),2, 4) bic <- label <- rep(0, nmod) which <- matrix(c(F, T, F, T, F, F, T, T), nmod, nmod/2) size <- c(0, 1, 1, 2)[1:nmod] sep <- if(all(nchar(names.arg) == 1)) "" else "," prior.mat <- matrix(rep(prior.param, nmod), nmod, ncol(x), byrow = T) prior <- apply(which * prior.mat + (!which) * (1 - prior.mat), 1, prod) for(k in 1:nmod) { if (k==1) label[k] <- "NULL" else label[k] <- paste(names.arg[which[k, ]], collapse = sep) } } ### Calculate BIC exactly for the remaining models using "glm", nmod <- length(label) model.fits <- as.list(rep(0, nmod)) dev <- rep(0,nmod) df <- rep(0,nmod) for(k in 1:nmod) { if(sum(which[k, ]) == 0) { glm.out <- glm(y~1,family=glm.family,weights=wt) } else { x.df <- data.frame(x=x[,which[k,]],y=y,wt=wt) glm.out <- glm(y~.-wt,data=x.df,family=glm.family,weights=wt) } dev[k] <- glm.out$deviance df[k] <- glm.out$df model.fits[[k]] <- matrix(0,nrow=length(glm.out$coef),ncol = 2) model.fits[[k]][, 1] <- glm.out$coef coef <- glm.out$coef p_glm.out$rank R <- glm.out$R rinv_diag(p) rinv <- backsolve(R, rinv) rowlen <- drop(((rinv^2) %*% rep(1, p))^0.5) sigx_rowlen %o% 1 correl_rinv %*% t(rinv) * outer(1/rowlen,1/rowlen) cov_correl*sigx%*%t(sigx) model.fits[[k]][,2] <- sqrt(diag(cov)) } bic <- dev/disp - df * log(nobs) - 2 * log(prior) ### Now apply the first Occam's window rule using the exact BIC occam <- bic-min(bic) < 2*log(OR) dev <- dev[occam] df <- df[occam] size <- size[occam] label <- label[occam] which <- which[occam, ,drop=F] bic <- bic[occam] prior <- prior[occam] model.fits <- model.fits[occam] postprob <- exp(-0.5*(bic-min(bic))) / sum(exp(-0.5*(bic-min(bic)))) ### Order models in descending order of posterior probability order.bic <- order(bic,size,label) dev <- dev[order.bic] df <- df[order.bic] size <- size[order.bic] label <- label[order.bic] which <- which[order.bic, ,drop=F] bic <- bic[order.bic] prior_prior[order.bic] postprob <- postprob[order.bic] model.fits <- model.fits[order.bic] ### Apply the second rule to eliminate models with better ones nested within them. nmod <- length(bic) if(strict &(nmod != 1)) { occam <- rep(T,nmod) for (k in (2:nmod)) { for (j in (1:(k-1))) { which.diff <- which[k,] - which[j,] if (all(which.diff>= 0)) occam[k] <- F }} dev <- dev[occam] df <- df[occam] size <- size[occam] label <- label[occam] which <- which[occam, ,drop=F] bic <- bic[occam] prior_prior[occam] postprob <- postprob[occam] postprob <- postprob/sum(postprob) model.fits <-model.fits[occam] # CTV 7/95 } #### Make BIC for null model = 0 # logprior.null <- sum(log(1-prior.param)) # bic <- bic + 2 * logprior.null- (glm.out$null/disp - (length(y)-1)*log(nobs)) bic <- bic + 2 * log(prior) ### Calculate Pr[beta_i != 0 | D] for each coefficient beta_i nmod <- length (bic) probne0 <- round (100* t(which)%*%as.matrix(postprob),1) nvar <- length(output.names) Ebi <- rep(0,nvar+1) SDbi <- rep(0,nvar+1) EbiMk <- matrix (0,nrow=nmod,ncol=nvar+1) dimnames(EbiMk)_list(NULL,c("Int",output.names)) sebiMk <- matrix (0,nrow=nmod,ncol=nvar+1) dimnames(sebiMk)_list(NULL,,c("Int",output.names)) if (factors==F || factor.include.all==F) { for(i in 1:(nvar+1)) if((i==1)||any(which[,(i-1)])) for(k in (1:nmod)) if (i == 1) # intercept { EbiMk[k,1] <- model.fits[[k]][1,1] sebiMk[k,1] <- model.fits[[k]][1,2] } else # i is not the intercept if(which[k, (i-1)] == T) { pos <- sum(which[k, 1:(i-1)])+1 EbiMk[k, i] <- model.fits[[k]][(pos), 1] sebiMk[k, i] <- model.fits[[k]][(pos), 2] } } else #factor.include.all=T { for(i in (1:(ncol(x)+1))) { # includes intercept whereisit_glm.assign[[i]] if((i==1)||any(which[,(i-1)])) for(k in (1:nmod)) if (i == 1) { EbiMk[k,1] <- model.fits[[k]][1,1] sebiMk[k,1] <- model.fits[[k]][1,2] } else { if(which[k, (i-1)] == T) { ##Find position of beta_i in model k spot_sum(which[k,(1:(i-1))]) posMk <- (c(0,cumsum(fac.levels[which[k, ]]))+2)[spot] posMk <- posMk:(posMk+fac.levels[(i-1)]-1) EbiMk[k,whereisit]<- model.fits[[k]][posMk, 1] sebiMk[k, whereisit] <- model.fits[[k]][posMk, 2] } } } } Ebi <- postprob %*% EbiMk Ebimat_matrix(rep(Ebi,nmod),nrow=nmod,byrow=T) SDbi <- sqrt(postprob%*%(sebiMk^2) + postprob%*%((EbiMk-Ebimat)^2)) dimnames(which)_list(NULL,names(x)) if (factors && !factor.type && factor.prior.adjust) { factor.probne0_NULL for (i in 1:(length(glm.assign)-1)) if (length(glm.assign[[i+1]])>1) { isitthere_apply(which[,unlist(glm.assign[i+1])],1,any) name_(glm.assign)[i+1] factor.probne0_c(factor.probne0,as.numeric(isitthere%*%postprob)) } names(factor.probne0)_names(fn)[!unlist(lapply(fn,is.null))] probne0_list(c(probne0),factor=c(factor.probne0)) } ### Output list(postprob=postprob,label=label,deviance=dev,df=df,bic=bic,size=(size), which=which,probne0=c(probne0),postmean=Ebi,postsd=SDbi,mle=EbiMk,se=sebiMk, prior.param=prior.param, prior.model.weights=prior/prior.weight.denom, family = famname, disp = disp, namesx = output.names) }