bic.glm <- function (x, y, glm.family, wt=rep(1,nrow(x)),
		     strict=F, prior.param=c(rep(0.5,ncol(x))),OR=20, 
		     OR.fix=2, nbest=150, dispersion=NULL, factor.include.all=T, 
		     factor.prior.adjust=F)
{	

### S-PLUS function to apply Bayesian Model Averaging to variable selection for 
### generalized linear models.  Bayesian Model Averaging accounts for the model 
### uncertainty inherent in the variable selection problem by averaging over the 
### best models in themodel class according to posterior model probability.  
### For more details see references at the end of the documentation.

###  Developer: Chris Volinsky (volinsky@stat.washington.edu)
### 
###  This software is not formally maintained, but I will be happy to
###  hear from people who have problems with it, although I cannot guarantee to
###  to solve them!
###
### Permission is hereby given to StatLib to redistribute this software.
### The software can be freely used for non-commercial purposes, and can
###  be freely distributed for non-commercial purposes only. 
### The copyright is retained by the developer.
### Copyright 1996, 1997 Chris T. Volinsky
  
### n = total number of data points, p = number of independent variables 
###
### Inputs
###  x		n x p data frame or matrix of independent variables (may include factors)
###  y 		response variable (n-vector)
###  glm.family	distribution family, as defined by the S-PLUS function glm.  
###		Choices include gaussian, poisson, Gamma, inverse.gaussian, 
###		binomial and quasi.  See Chambers/Hastie (1993)
###  wt		weight vector for the observations (length n)
###  strict	= T will return a more parsimonious set of models (Occam's Window).  
###              Any model with a more likely submodel will be eliminated.  
###             = F (default).  Returns all models within 1/OR in posterior model prob.
###  prior.param p-vector of prior probabilities that parameter is non-zero.  Default puts 
###              a prior of .5 on all parameters.  Setting to 1 forces variables into the
###              model.  
###  OR		maximum ratio for excluding models in the averaging window
###		 (default 20).  The function will average over a
###               set of models including the best model and all
###               models with a posterior model probability of at
###               least 1/OR of the best model.
###  OR.fix     Width of the window which keeps models after the leaps 
###              approximation is done.  Because the leaps and bounds gives only an
###              approximation to BIC, there is a need to increase the window at this 
###              first "cut" so as to assure that no good models are deleted.  The 
###              level of this cut is at 1/(OR^OR.fix); the default value for OR.fix is 2.
###  nbest 	sets the nbest parameter of the S-PLUS leaps function.  The default is 150,
###              this is a conservative value which makes the probability of "losing" a 
###              good model slim.  Reducing this value could increase run time, but there 
###              is a slight risk of losing valuable models. 
###  dispersion = T     Estimate the dispersion parameter
###		= F 	Set the dispersion parameter = 1
###		 Default = F for glm.family = Poisson or binomial, T otherwise. 
### factor.include.all 
###		 How should the function model categorical variables of class "factor"?
###              A categorical variable with d levels is turned into (d-1) dummy variables
###              using a treatment contrast.  If factor.include.all = T, 
###	         models will contain  either all or none of these dummy variables.
###              If factor.include.all = F, models are free  to select the dummy 
###              variables independently.  In this case, factor.prior.adjust 
###              determines the prior on these variables.
### factor.prior.adjust
###               If factor.include.all=F, this parameter determines the prior on the d-1
###               dummy variables.  When factor.prior.adjust=F, all dummy variables for
###               variable i have prior equal to prior.param[i] (or pp[i]).  Note that
###               this makes the prior probability of *any* of these variables much 
###               higher than pp[i].  Setting factor.prior.adjust=T corrects for this so
###               that the prior probability on the union of the dummies equal to pp[i],
###               (and hence the deletion of the factor has a prior probability of 1-pp[i]).  
###	  	  This adjustment changes the individual priors on each dummy variable to
###               1-(1-pp[i])^(1/(k-1)), where k is the number of levels of the factor.  

### Outputs:
### postprob	posterior probabilities of the models selected
### label	labels identifying the models selected
### dev		deviance for each model selected
### df		degrees for each model selected
### bic		BIC = dev/disp - df * log(n).  Smaller BIC means higher posterior model
###             probability.  
###		(Note:  BIC returned does not take into account model priors, 
###			so rank order of BIC may not be the same as the rank 
###			order of postprob if non-uniform parameter priors are
###			specified or if factors are used and factor.include.all = F) 
### size	the number of independent variables in each of the models
### which	a logical matrix of size length(label) x size
### probne0	posterior probability that each variable is non-zero (in %)
###  		 [Note: in the special case where factor.include.all=F, and
###	          factor.prior.adjust=T, probne0 also returns the posterior probability
###		  that the *factor* is nonzero, i.e. the posterior probability that any 
###   		  one of the factor variables is in the model.]
### postmean	model-averaged posterior mean of the parameters (includes averaged-in 
###		 zeros)
### postsd	model-averaged posterior sd of the parameters (includes averaged-in zeros).
### mle 	n x p matrix giving the maximum likelihood estimate of each coefficient 
###             for each model. 
### se 	  	n x p matrix giving the standard errors of each coefficient for each model. 
### prior.param the priors on the parameters.  Will be different than input if factors
###             are used and factor.include.all =F.
### prior.model.weights
###             If priors on variables are not all .5, then prior weights gives the ratio
###             of the model prior to a uniform model prior, based on prior.param.  A  
###             prior.model.weight > 1 means  the model is weighed more than uniform
###  family	Distributional family used in the GLM.
###  disp	Estimated dispersion parameter (1 if dispersion = F)
###  namesx	string of variable names

### References: 
###  Raftery, Adrian E. (1995). Bayesian model selection in social research
###    (with Discussion). Sociological Methodology 1995 (Peter V. Marsden, ed.),
###    pp. 111-196, Cambridge, Mass.: Blackwells.
  
###   An earlier version, issued as Working Paper 94-12, Center for Studies in Demography and 
### Ecology, University of Washington (1994) is available as a Postscript file at
###   http://www.stat.washington.edu/tech.reports/bic.ps
  
### It is also available via ftp using the following commands:
###   ftp ftp.stat.washington.edu (or 128.95.17.34)
###   login as anonymous
###   enter your email address as your password
###   ftp>cd /pub/tech.reports
###   ftp>get bic.ps
###   ftp>bye
### 
###   Volinsky, C.T., Madigan, D., Raftery, A.E. and Kronmal, R.A. (1997).
###    "Bayesian Model Averaging in Proportional Hazard Models:  Assessing the Risk 
###     of a Stroke."  Applied Statistics (46).  Available at:
###     http://www.stat.washington.edu/volinsky/vmrk.ps
###  
### Furnival, G.M. and R.W. Wilson (1974) "Regression by leaps and bounds"
###   Technometrics 16, 499-511.
  
  leaps.glm_function(info,coef,names.arg,nbest=nbest){
    names.arg <- names.arg	### evaluate names.arg before cbind
    if(is.null(names.arg))
      names.arg <- c(as.character(1:9), LETTERS, letters)[1:ncol(info)]
    if(length(names.arg) < ncol(info)) stop("Too few names")	
    ##  Start changing things
    bIb <- coef %*% info %*% coef
    kx <- ncol(info)
    maxreg <- nbest * kx
    if(kx < 3) stop("Too few independent variables")	
###	if(kx >= 31)
###		stop("Problem too large")
    imeth <- 1	# Corresponds to R^2
    df <- kx + 1	# df not important for r2
    Ib <- info %*% coef
    rr <- cbind(info, Ib)
    rr <- rbind(rr, c(Ib, bIb))
    ans <- .Fortran("leaps",
### Fortran routine
		    as.single(rr),
		    as.integer(kx),
		    as.integer(kx + 1),
		    as.integer(df),
		    as.integer(imeth),
		    as.integer(nbest),
		    as.single(0.0001),
		    regid = integer(maxreg),
		    Cp = single(maxreg),
		    size = integer(maxreg),
		    nreg = integer(1),
		    single((nbest + 4) * (kx + 1) + (((kx + 1) * (kx + 2))/2) * ((2 *
										  (kx + 3))/3 + 1)),
		    integer(4 * (kx + 1)^2 + 8 * (kx + 1) + (nbest + 1) * (kx + 1))
		    )[c("Cp", "size", "nreg", "regid")]	
###  cleanup Fortran output; fix Cp, r2 and adjr2.
    nreg <- ans$nreg
    Cp <- ans$Cp
    length(Cp) <- nreg
    size <- ans$size
    length(size) <- nreg
    which <- matrix(T, nreg, kx)
    z <- ans$regid
    length(z) <- nreg	
###  Rick Becker's mask function to create which and then labels
    which <- matrix(as.logical((rep.int(z, kx) %/% rep.int(2^((kx - 1):0), 
							   rep.int(length(z), kx))) %% 2), byrow = F, ncol = kx)
    label <- character(nreg)
    sep <- if(all(nchar(names.arg) == 1)) "" else ","
    for(i in 1:nreg)
      label[i] <- paste(names.arg[which[i,  ]], collapse = sep)
    ans <- list(Cp, size = size, label = label, which = which)
    names(ans)[1] <- "r2"
    ans
  } # end of leaps.glm
  
  options(contrasts=c("contr.treatment","contr.treatment"))
  x_data.frame(x)
  prior.weight.denom <- .5^ncol(x)
  
  names.arg <- names(x)
  if (is.null(names.arg))  names.arg <- paste("X",1:ncol(x),sep="")
  
  
  
### If there are more than 30 columns in x, reduce to 30 using backwards
###  elimination.
  if(ncol(x) > 30) {
    while(ncol(x)>30) {
      x.df_data.frame(x=x,y=y,wt=wt)
      glm.out <- glm(y~.-wt,data=x.df,family=glm.family,
		     weights=wt)
      coef <- glm.out$coef
      p_glm.out$rank
      R <- glm.out$R
      rinv <- diag(p)
      rinv <- backsolve(R, rinv)
      rowlen <- drop(((rinv^2) %*% rep(1, p))^0.5)
      sigx_rowlen %o% 1
      pvec <- signif(1 - pchisq((coef/sigx)^2, 1), 3)
      x <- x[,  - rev(order(pvec))[1]]
    }
  }
  
### Remove NA's

  x2 <- na.omit(x)
  used <- match(row.names(x), row.names(x2))
  omitted <- seq(nrow(x))[is.na(used)]
  if(length(omitted) > 0) {
    wt <- wt[ - omitted]
    x <- x2
    y <- y[ - omitted]
    n <- n[ - omitted]
    warning(paste("There were ",length(omitted),"records deleted due to NA's"))
  }
  

  leaps.x <- x
  output.names <- names(x)

### locate factors and count factor levels

  fn <- factor.names(x)
  factors_!all(unlist(lapply(fn,is.null)))
  x.df_data.frame(x=x,y=y,wt=wt)
  glm.out <- glm(y~.-wt,family=glm.family,weights=wt,data=x.df)
  glm.assign_glm.out$assign
  fac.levels <- unlist(lapply(glm.assign,length)[-1])
  if(factors)
    {
      cdf <- cbind.data.frame(y=y,x)
      mm <- model.matrix(formula(cdf),data=cdf)[,-1,drop=F]
      mmm <- data.frame(matrix(mm,nrow=nrow(mm),byrow=F))
      names(mmm) <- dimnames(mm)[[2]]
      output.names <- names(mmm)
      if (factor.include.all)
	{
	  for (i in 1:length(names(x))){
	    if (!is.null(fn[[i]])){
	      nx <- names(x)[i]
	      coefs_glm.out$coef[glm.out$assign[[i+1]]]
	      old.vals_x[,i]
	      new.vals_c(0,coefs)
	      new.vec_as.vector(new.vals[match(old.vals,fn[[i]])])
	      leaps.x[,nx]_new.vec
	    }
	  }
	}
	else ## factor.include.all = F
	  {
	    new.prior_NULL
	    for (i in 1:length(names(x))){
	      addprior_prior.param[i]
	      if (!is.null(fn[[i]]))
		{
		  k_length(fn[[i]])
		  if (factor.prior.adjust) addprior_rep(1-(1-prior.param[i])^(1/(k-1)),k-1)
		    else addprior_rep(prior.param[i],k-1)
		}
	      new.prior_c(new.prior,addprior)
	    }
	    prior.param_new.prior
	    x <- leaps.x <- mmm
	  }
    }

  if (!factor.include.all && !factor.prior.adjust) prior.weight.denom <- .5^ncol(x)
  
### Start function

  x.df_data.frame(x=leaps.x,y=y,wt=wt)
  glm.out <- glm(y~.-wt,family=glm.family,weights=wt,data=x.df)
  famname <- glm.out$family["name"]
  if (is.null(dispersion)){
    if (famname=="Poisson"|famname=="Binomial") dispersion <- F
      else dispersion <- T
  }
  nobs <- length(y)
  resid <- resid(glm.out,"pearson")
  rdf <- glm.out$df.resid
  is.wt_!all(wt==rep(1,nrow(x)))
  if (is.wt){
    wt <- wt^0.5
    resid <- resid * wt
    excl <- wt==0
    if (any(excl)) {
      warning(paste(sum(excl),"rows with zero wts not counted"))
      resid <-resid[!excl]
    }
  }
  phihat <- sum(resid^2)/rdf
  if (dispersion) disp <- phihat
    else disp <- 1
  coef <- glm.out$coef[-1]
  p <- glm.out$rank
  R <- glm.out$R
  rinv <- diag(p)
  rinv <- backsolve(R, rinv)
  rowlen <- drop(((rinv^2) %*% rep(1, p))^0.5)
  sigx_rowlen %o% sqrt(disp)
  correl_rinv %*% t(rinv) * outer(1/rowlen,1/rowlen)
  cov_correl*sigx%*%t(sigx)
  info <- solve(cov[-1,-1])
  if (ncol(x)>2)
    {
      a <- leaps.glm(info,coef,names.arg=names(leaps.x),nbest=nbest)
      a$r2 <- pmin(pmax(0, a$r2), 99.9)	# Include the null model
      a$r2 <- c(0, a$r2)/100
      a$size <- c(0, a$size)
      a$label <- c("NULL", a$label)
      a$which <- rbind(rep(F, ncol(x)), a$which)
      nmod <- length(a$size)
      prior.mat <- matrix(rep(prior.param, nmod), nmod, ncol(x), byrow = T)
      prior <- apply(a$which * prior.mat + (!a$which) * (1 - prior.mat), 1, 
		     prod)
      bIb <- as.numeric(coef %*% info %*% coef)
      lrt <- bIb - (a$r2 * bIb)
      bic <- lrt + (a$size) * log(nobs) - 2 * log(prior)
      occam <- bic - min(bic) < 2 * OR.fix * log(OR)	# From MUS p.24 with OR=B_12
      size <- a$size[occam]
      label <- a$label[occam]
      which <- a$which[occam, ,drop=F]
      bic <- bic[occam]
      prior <- prior[occam]
    }      # end if statement
    else 
      { # ncol  <= 2
	nmod <- switch(ncol(x),2, 4)
	bic <- label <- rep(0, nmod)
	which <- matrix(c(F, T, F, T, F, F, T, T), nmod, nmod/2)
	size <- c(0, 1, 1, 2)[1:nmod]
	sep <- if(all(nchar(names.arg) == 1)) "" else ","
	prior.mat <- matrix(rep(prior.param, nmod), nmod, ncol(x), 
			    byrow = T)
	prior <- apply(which * prior.mat + (!which) * (1 - prior.mat), 
		       1, prod)
	for(k in 1:nmod) {
	  if (k==1) label[k] <- "NULL"
	    else label[k] <- paste(names.arg[which[k,  ]], collapse = sep)
	}
      }
  
  
### Calculate BIC exactly for the remaining models using "glm",  
 
  nmod <- length(label)
  model.fits <- as.list(rep(0, nmod))
  dev <- rep(0,nmod)
  df <- rep(0,nmod)
  for(k in 1:nmod) 
    {
      if(sum(which[k,  ]) == 0) {
	glm.out <- glm(y~1,family=glm.family,weights=wt)
      }
	else 
	  {
	    x.df <- data.frame(x=x[,which[k,]],y=y,wt=wt)
	    glm.out <- glm(y~.-wt,data=x.df,family=glm.family,weights=wt)
	  }
      dev[k] <- glm.out$deviance
      df[k] <- glm.out$df
      model.fits[[k]] <- matrix(0,nrow=length(glm.out$coef),ncol = 2)
      model.fits[[k]][, 1] <- glm.out$coef
      coef <- glm.out$coef
      p_glm.out$rank
      R <- glm.out$R
      rinv_diag(p) 
      rinv <- backsolve(R, rinv)
      rowlen <- drop(((rinv^2) %*% rep(1, p))^0.5)
      sigx_rowlen %o% 1
      correl_rinv %*% t(rinv) * outer(1/rowlen,1/rowlen)
      cov_correl*sigx%*%t(sigx)
      model.fits[[k]][,2] <- sqrt(diag(cov)) 		  
    }
  bic <- dev/disp - df * log(nobs) - 2 * log(prior)
  
### Now apply the first Occam's window rule using the exact BIC
  
  occam <- bic-min(bic) < 2*log(OR)
  dev <- dev[occam]
  df <- df[occam]
  size <- size[occam]
  label <- label[occam]
  which <- which[occam, ,drop=F]
  bic <- bic[occam]
  prior <- prior[occam]
  model.fits <- model.fits[occam]
  postprob <- exp(-0.5*(bic-min(bic))) / sum(exp(-0.5*(bic-min(bic))))
  
### Order models in descending order of posterior probability
  order.bic <- order(bic,size,label)
  dev <- dev[order.bic]
  df <- df[order.bic]
  size <- size[order.bic]
  label <- label[order.bic]
  which <- which[order.bic, ,drop=F]
  bic <- bic[order.bic]
  prior_prior[order.bic]
  postprob <- postprob[order.bic]
  model.fits <- model.fits[order.bic]

### Apply the second rule to eliminate models with better ones nested within them.
  
  nmod <- length(bic)
  if(strict &(nmod != 1)) {
    occam <- rep(T,nmod)
    for (k in (2:nmod)) {
      for (j in (1:(k-1))) {
	which.diff <- which[k,] - which[j,]
	if (all(which.diff>= 0)) occam[k] <- F
      }}
    dev <- dev[occam]
    df <- df[occam]
    size <- size[occam]
    label <- label[occam]
    which <- which[occam, ,drop=F]
    bic <- bic[occam]
    prior_prior[occam]
    postprob <- postprob[occam]
    postprob <- postprob/sum(postprob)
    model.fits <-model.fits[occam]			# CTV 7/95
  }
  
#### Make BIC for null model = 0
  
#  logprior.null <- sum(log(1-prior.param))
#  bic <- bic + 2 * logprior.null- (glm.out$null/disp - (length(y)-1)*log(nobs))

 bic <- bic + 2 * log(prior)
 
### Calculate Pr[beta_i != 0 | D] for each coefficient beta_i
  
  nmod <- length (bic)
  probne0 <- round (100* t(which)%*%as.matrix(postprob),1) 
  nvar <- length(output.names)
  Ebi <- rep(0,nvar+1)
  SDbi <- rep(0,nvar+1)
  EbiMk <- matrix (0,nrow=nmod,ncol=nvar+1)
  dimnames(EbiMk)_list(NULL,c("Int",output.names))
  sebiMk <- matrix (0,nrow=nmod,ncol=nvar+1)
  dimnames(sebiMk)_list(NULL,,c("Int",output.names))
  if (factors==F || factor.include.all==F)
   {
      for(i in 1:(nvar+1)) 
	if((i==1)||any(which[,(i-1)])) 
	  for(k in (1:nmod)) 
	    if (i == 1) # intercept
	      {
		EbiMk[k,1] <- model.fits[[k]][1,1]
		sebiMk[k,1] <- model.fits[[k]][1,2]
	      }
	      else  # i is not the intercept
		if(which[k, (i-1)] == T) 
		  {
		    pos <- sum(which[k, 1:(i-1)])+1
		    EbiMk[k, i] <- model.fits[[k]][(pos), 1]
		    sebiMk[k, i] <- model.fits[[k]][(pos), 2]
		  }
    }
    else  #factor.include.all=T
      {
	for(i in (1:(ncol(x)+1))) {		# includes intercept
	  whereisit_glm.assign[[i]]
	  if((i==1)||any(which[,(i-1)])) 
	    for(k in (1:nmod)) 
	      if (i == 1)
		{
		  EbiMk[k,1] <- model.fits[[k]][1,1]
		  sebiMk[k,1] <- model.fits[[k]][1,2]
		}
		else 
		  {
		    if(which[k, (i-1)] == T) {
		      ##Find position of beta_i in model k
		      spot_sum(which[k,(1:(i-1))])
		      posMk <- (c(0,cumsum(fac.levels[which[k, ]]))+2)[spot]
		      posMk <- posMk:(posMk+fac.levels[(i-1)]-1)
		      EbiMk[k,whereisit]<- model.fits[[k]][posMk, 1]
		      sebiMk[k, whereisit] <- model.fits[[k]][posMk, 2]
		    }
		  }
	}
      }

  Ebi <- postprob %*% EbiMk
  Ebimat_matrix(rep(Ebi,nmod),nrow=nmod,byrow=T)
  SDbi <- sqrt(postprob%*%(sebiMk^2) + postprob%*%((EbiMk-Ebimat)^2))

  dimnames(which)_list(NULL,names(x))

  if (factors && !factor.type && factor.prior.adjust)
    {
      factor.probne0_NULL
      for (i in 1:(length(glm.assign)-1))
	if (length(glm.assign[[i+1]])>1)
	  {
	    isitthere_apply(which[,unlist(glm.assign[i+1])],1,any)
	    name_(glm.assign)[i+1]
	    factor.probne0_c(factor.probne0,as.numeric(isitthere%*%postprob))
	  }
      names(factor.probne0)_names(fn)[!unlist(lapply(fn,is.null))]
      probne0_list(c(probne0),factor=c(factor.probne0))
    }
 
### Output
list(postprob=postprob,label=label,deviance=dev,df=df,bic=bic,size=(size),
     which=which,probne0=c(probne0),postmean=Ebi,postsd=SDbi,mle=EbiMk,se=sebiMk,
     prior.param=prior.param, prior.model.weights=prior/prior.weight.denom,
     family = famname, disp = disp, namesx = output.names)
}