bayesreg <- function(x, y, wt=rep(1, length(y)),
                     prior=c("gprior","ZSnull","ZSfull","ZSnullLAP","ZSfullLAP","BIC"),
                     nbest=1, nvmax=8,
                     regmeth=c("exhaustive","backward","forward","seqrep"),
                     occam=FALSE, strict=FALSE) {
## MODIFIED VERSION OF bicreg (SPLUS VERSION) FOR R-----------------------
## UPDATED BY CHRISTINE KOHNEN, DUKE UNIVERSITY & SAMSI-------------------
## LAST UPDATED: MARCH 3, 2003--------------------------------------------
##
## INPUTS-----------------------------------------------------------------
## x          matrix of independent variables
## y          dependent variable
## wt         weights for regression
## prior      prior method - gprior, ZSnull, ZSfull, ZSnullLAP, ZSfullLAP, BIC
## nbest      number of subsets of each size to report
## nvmax      largest subset size to be examined
## regmeth    search method - "exhaustive", "forward", "backward" or "seqrep"
## occam      = TRUE will use Occam's window (default ratio OR = 20) as the
##              maximum ratio for excluding models
##            = FALSE (default) will return all models
## strict     = TRUE will return a more parsimonious set of models
##              (Occam's window): any model with a more likely submodel
##              will be eliminated
##            = FALSE (default) returns all models whose posterior model
##              probability is within a factor 1/OR of the best
##
## OUTPUT------------------------------------------------------------------
## post.order ordering of the posterior probabilities in decreasing order
## postprob   posterior probabilities of the models selected
## label      labels identifying the models selected (no intercept)
## r2         R2 values for the models
## bic        BIC values for the models
## size       number of independent variables in each of the models (no int.)
## which      a logical matrix with one row per model and one column per
##            variable indicating whether that variable is in the model
## which.int  similar to which, but has a column for the intercept
## margs      log marginal values or log Bayes factors (depends on prior)
## probne0    posterior probability that each variable is non-zero (in %)
## postmean   posterior mean of each coefficient (from model mixing)
## postsd     posterior SD of each coefficient (from model mixing)
## ols        matrix (like the matrix which) giving the OLS estimate of
##            each coefficient for each model
## se         matrix (like ols) giving the corresponding SE for each model
##
## (A usage sketch with simulated data appears at the end of this file.)
##
## CODE STARTS HERE--------------------------------------------------------
  x <- data.frame(x)
  if(is.null(dimnames(x)))
    dimnames(x) <- list(NULL, paste("X", 1:ncol(x), sep=""))
  y <- as.numeric(y)
  options(contrasts=c("contr.treatment", "contr.treatment"))

  ## REMOVING THE NA'S------------------------------------------------------
  x2 <- na.omit(data.frame(x))
  used <- match(row.names(data.frame(x)), row.names(x2))
  omitted <- seq(nrow(x))[is.na(used)]
  if(length(omitted) > 0) {
    x <- x2
    y <- y[ - omitted]
    wt <- wt[ - omitted]   # keep the weights aligned with y
    warning(paste("There were", length(omitted), "records deleted due to NA's"))
  }

  ## CREATING THE DESIGN MATRIX----------------------------------------------
  cdf <- cbind.data.frame(y=y, x)
  mm <- model.matrix(formula(cdf), data=cdf)[, -1, drop=FALSE]
  x <- mm

  ## EXECUTE THE LEAPS OR REGSUBSETS ALGORITHM -- DEPENDS ON DIM(X)
  ## Load the leaps package if not already attached
  library(leaps)
  nvar <- ncol(x)
  n <- length(y)
  r2full <- summary(lm(y ~ x))$r.squared

  if(nvar > 30){
    a <- regsubsets(x, y, nbest=nbest, nvmax=nvmax, method=regmeth[1],
                    really.big=TRUE)
    b <- summary(a)
    b$rsq <- pmin(pmax(0, b$rsq), .999)
    label <- rep(0, nrow(b$which))
    for(i in 1:nrow(b$which)){
      label[i] <- paste(dimnames(x)[[2]][which(b$which[i, 2:(nvar+1)] == TRUE)],
                        collapse=",")
    }
    ## Add values corresponding to the null model (intercept only)
    which.int <- rbind(c(TRUE, rep(FALSE, nvar)), b$which)
    r2 <- round(c(0, b$rsq), 3)
    size <- c(1, apply(b$which, 1, sum))        # intercept is included in the sum
    label <- c("NULL", label)
    which <- rbind(rep(FALSE, nvar), b$which[, 2:(nvar+1)])
    bic <- c(0, b$bic)
  }
  else if(nvar < 31 & nvar > 2){
    a <- leaps(x, y, wt=wt, method="r2", names=dimnames(x)[[2]], nbest=nbest)
    a$r2 <- pmin(pmax(0, a$r2), .999)
    label <- rep(0, nrow(a$which))
    for(i in 1:nrow(a$which)){
      label[i] <- paste(dimnames(x)[[2]][which(a$which[i, ] == TRUE)], collapse=",")
    }
    ## Add values corresponding to the null model (intercept only)
    r2 <- round(c(0, a$r2), 3)
    size <- c(1, a$size)                        # intercept is included in a$size
    label <- c("NULL", label)
    which <- rbind(rep(FALSE, nvar), a$which)   # a$which does not include the intercept
    bic <- n * log(1 - r2) + (size - 1) * log(n)
    which.int <- cbind(rep(TRUE, nrow(which)), which)
  }
  else{
    r2 <- bic <- NULL
    nmod <- switch(ncol(x), 2, 4)
    bic <- label <- rep(0, nmod)
    model.fits <- as.list(rep(0, nmod))
    which <- matrix(c(F,T,F,T,F,F,T,T), nmod, nmod/2)
    which.int <- cbind(rep(TRUE, nmod), which)  # intercept column, needed below
    size <- c(1, 2, 2, 3)[1:nmod]
    sep <- if(all(nchar(dimnames(x)[[2]]) == 1)) "" else ","
    for(k in 1:nmod) {
      if (k == 1) {
        label[k] <- "NULL"
        lm1 <- lm(y ~ 1, weights=wt)
      }
      else {
        label[k] <- paste(dimnames(x)[[2]][which[k, ]], collapse=sep)
        x.lm <- cbind.data.frame(y=y, x=x[, which[k, , drop=FALSE]], wt=wt)
        lm1 <- lm(y ~ . - wt, data=x.lm, weights=wt)
      }
      r2[k] <- round(summary(lm1)$r.sq, digits=3)
    }
    bic <- n * log(1 - r2) + (size - 1) * log(n)
  }

  ## CALCULATE THE LOG MARGINALS OR LOG BAYES FACTORS FOR EACH MODEL--------
  margs <- rep(0, nrow(which))
  for(i in 1:nrow(which)){
    margs[i] <- switch(1 + pmatch(prior[1],
                                  c("gprior","ZSnull","ZSfull","ZSnullLAP","ZSfullLAP","BIC"),
                                  nomatch=0),
                       stop(paste("Ambiguous or unrecognized prior name:", prior)),
                       log(margy(n, n, sum(which[i, ]), r2[i])),
                       ZSnull(r2[i], n, sum(which[i, ]), c=n/2),
                       ZSfull(r2full, r2[i], n, nvar, sum(which[i, ])),
                       ZSnullLAP(r2[i], n, sum(which[i, ])),
                       ZSfullLAP(r2full, r2[i], n, nvar, sum(which[i, ])),
                       -.5 * bic[i])
  }

  ## APPLY THE FIRST OCCAM'S WINDOW (OW) RULE: KEEP MODELS WHOSE BIC IS
  ## WITHIN 2*log(OR) OF THE BEST MODEL-------------------------------------
  if(occam){
    OR <- 20
    occam <- bic - min(bic) < 2 * log(OR)
    r2 <- r2[occam]          # only keep the r2 values for which occam=TRUE
    size <- size[occam]      # only keep the size values for which occam=TRUE
    label <- label[occam]    # only keep the labels for which occam=TRUE
    which <- which[occam, , drop=FALSE]
    which.int <- which.int[occam, , drop=FALSE]
    bic <- bic[occam]
    margs <- margs[occam]
    postprob <- exp(-0.5 * bic)/sum(exp(-0.5 * bic))
    postprob[is.na(postprob)] <- 1

    ## ORDER MODELS IN DESCENDING ORDER OF POSTERIOR PROBABILITY------------
    order.bic <- order(bic, size, label)
    r2 <- r2[order.bic]
    size <- size[order.bic]
    label <- label[order.bic]
    which <- which[order.bic, , drop=FALSE]
    which.int <- which.int[order.bic, , drop=FALSE]
    bic <- bic[order.bic]
    margs <- margs[order.bic]
    postprob <- postprob[order.bic]

    ## APPLY THE SECOND OW RULE: ELIMINATE MODELS WITH A BETTER ONE NESTED WITHIN
    if(strict){
      nmod <- length(bic)
      if (nmod > 1) {
        occam <- rep(TRUE, nmod)
        for(k in (2:nmod)) {
          for(j in (1:(k - 1))) {
            which.diff <- which[k, ] - which[j, ]
            if(all(which.diff >= 0)) occam[k] <- FALSE
          }
        }
        r2 <- r2[occam]
        size <- size[occam]
        label <- label[occam]
        nmod <- sum(occam)
        which <- which[occam, , drop=FALSE]
        which.int <- which.int[occam, , drop=FALSE]
        bic <- bic[occam]
        margs <- margs[occam]
        postprob <- postprob[occam]
        postprob <- postprob/sum(postprob)
      }
    }
  }
  else{
    postprob <- exp(margs)/sum(exp(margs))
    postprob[is.na(postprob)] <- 1
  }

  ## ORDER THE POSTERIOR MODEL PROBABILITIES IN DESCENDING ORDER------------
  post.order <- order(postprob, decreasing=TRUE)

  ## CALCULATE PR[BETA_i != 0 | D] FOR EACH COEFFICIENT BETA_i (IN %)-------
  probne0 <- round(100 * t(which.int) %*% as.matrix(postprob), 1)
  nmod <- length(bic)
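
  ## The mixing step below follows the standard BMA identities (a summary of
  ## what the loop computes, not new functionality): with posterior model
  ## probabilities p_k = postprob[k],
  ##   E[b_i | D]   = sum_k p_k * E[b_i | M_k, D]
  ##   Var[b_i | D] = sum_k p_k * ( se_ik^2 + (E[b_i | M_k, D] - E[b_i | D])^2 )
  ## where E[b_i | M_k, D] and se_ik are the OLS estimate and standard error of
  ## coefficient i in model M_k (the rows of EbiMk and sebiMk).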
  ## CALCULATE E[BETA_i|D] AND SD[BETA_i|D] FOR EACH COEFFICIENT BETA_i-----
  model.fits <- as.list(rep(0, nmod))
  for(k in 1:nmod) {
    if(sum(which[k, ]) != 0){
      model.fits[[k]] <- summary(lm(y ~ x[, which[k, ], drop=FALSE], weights=wt))$coef
    }
    else model.fits[[k]] <- summary(lm(y ~ 1, weights=wt))$coef
  }
  Ebi <- rep(0, (nvar+1))
  SDbi <- rep(0, (nvar+1))
  EbiMk <- matrix(0, nmod, nvar+1)
  sebiMk <- matrix(0, nmod, nvar+1)
  for(i in 1:(nvar+1)) {
    if((i == 1) || (sum(which.int[, i]) != 0)) {
      for(k in 1:nmod){
        if((i == 1) || (which.int[k, i] == TRUE)){
          ## FIND POSITION OF BETA_i IN MODEL k--------------------------------
          if(i == 1) pos <- 1
          else pos <- sum(which.int[k, (1:i)])
          EbiMk[k, i] <- model.fits[[k]][pos, 1]
          sebiMk[k, i] <- model.fits[[k]][pos, 2]
        }
      }
      Ebi[i] <- as.numeric(sum(postprob * EbiMk[, i]))
      SDbi[i] <- sqrt(postprob %*% (sebiMk[, i]^2) + postprob %*% ((EbiMk[, i] - Ebi[i])^2))
    }
  }

  ## ATTACH DIMNAMES AND RETURN; post.order GIVES THE MODELS IN DESCENDING
  ## ORDER OF POSTERIOR MODEL PROBABILITY-----------------------------------
  dimnames(which) <- list(NULL, dimnames(x)[[2]])
  dimnames(which.int) <- list(NULL, c("Int", dimnames(x)[[2]]))
  dimnames(EbiMk) <- dimnames(sebiMk) <- list(NULL, c("Int", dimnames(x)[[2]]))

  return(list(post.order = post.order,
              postprob = postprob,
              label = label,
              r2 = r2,
              bic = bic,
              size = size - 1,
              which = which,
              which.int = which.int,
              margs = margs,
              probne0 = c(probne0),
              postmean = Ebi,
              postsd = SDbi,
              ols = EbiMk,
              se = sebiMk))
}

##
## Zellner-Siow prior for the linear model; here we compute the Bayes
## factor to the full model exactly and using a Laplace approximation.
##
## The current model has $k$ covariates of a total of $p$ possible and
## there is a common intercept in all models being compared.
## The sample size is $n$.
##
## Originally we are considering
##
##   Y = 1 alpha + X1 beta1 + X2 beta2
##
## where 1'X1 = 1'X2 = 0, rank(X1) = k, rank(X2) = p - k, and we are
## interested in testing H0: beta2 = 0.
##
## We then reparametrize to
##
##   Y = 1 alpha + X1 eta + V beta2
##
## where eta = beta1 + (X1'X1)^-1 X1'X2 beta2
##       V   = [I - X1 (X1'X1)^-1 X1'] X2
##
##   R^2full    = 1 - y'(I - P* - P_X1 - P_V)y / y'(I - P*)y
##   R^2current = 1 - y'(I - P* - P_X1)y / y'(I - P*)y
##
##   P*   = 1 (1'1)^(-1) 1' = 1 1'/n, where 1 is the nx1 vector of ones
##   P_X1 = X1 (X1'X1)^(-1) X1'
##   P_V  = V (V'V)^(-1) V'
##   I    = nxn identity matrix
##

########################################
# Exact Bayes factor to the full model #
########################################

# Calculates the Bayes factor by numerical integration
ZSfull <- function(r2full, r2curr, n, p, k){
  if(k == p) out <- 0
  else{
    eps <- (1 - r2full)/(1 - r2curr)
    aux <- function(u){
      return(integrandfull(u, n, p, k, eps))
    }
    out <- -log(integrate(aux, 0, 1)$value)
  }
  return(out)
}

# Calculates the integrand in the Bayes factor; change of variables 1/(1+g) = u
integrandfull <- function(u, n, p, k, eps){
  aux <- -(n - (k+1))*log(eps + u*(1 - eps))/2
  aux <- aux + (p - (k+1))*log(u)/2
  aux <- aux - 3*log(1 - u)/2
  aux <- aux - (n*u)/(2*(1 - u))
  aux <- aux + log(n/2)/2 - lgamma(1/2)
  return(exp(aux))
}

###########################
# A Laplace approximation #
###########################

# This returns the Laplace approximation to the Bayes factor to the full model
ZSfullLAP <- function(r2full, r2curr, n, p, k){
  if(k == p) aux <- 0
  else{
    eps <- (1 - r2full)/(1 - r2curr)
    hat <- pmodefull(eps, n, p, k)
    if(length(hat) > 1) stop("more than one root")
    else{
      if(length(hat) == 0) stop("no roots found")
      else if(hat[1] == 0) stop("root at zero")
      else{
        ## Laplace approximation of -log( integral of exp(likfull(g)) dg ),
        ## matching the sign convention of the exact ZSfull() above
        aux <- -(likfull(hat, eps, n, p, k) +
                 (log(2*pi) - log(-infofull(hat, eps, n, p, k)))/2)
      }
    }
  }
  return(aux)
}

# This returns log(likelihood times prior)
likfull <- function(g, eps, n, p, k){
  aux <- (n - (p+1))*log(1 + g) - (n - (k+1))*log(1 + eps*g) - 3*log(g) - n/g
  aux <- aux/2
  aux <- aux + log(n/2)/2 - lgamma(1/2)
  return(aux)
}

# This returns the first derivative of log(likfull())
scorefull <- function(g, eps, n, p, k){
  aux <- (n - (p+1))/(1 + g) - (n - (k+1))*eps/(1 + eps*g) - 3/g + n/g^2
  aux <- aux/2
  return(aux)
}

# And this returns the second derivative of log(likfull())
infofull <- function(g, eps, n, p, k){
  aux <- -(n - (p+1))/(1 + g)^2 + (n - (k+1))*eps^2/(1 + eps*g)^2 + 3/g^2 - 2*n/g^3
  aux <- aux/2
  return(aux)
}

# This gives the posterior mode
pmodefull <- function(eps, n, p, k){
  # aux <- zapsmall(polyroot(c(n, n*(eps+1)-3, n-p+eps*(k-2)-4, -eps*(p-k+3))))
  aux <- rootsfull(c(n, n*(eps+1) - 3, n - p + eps*(k-2) - 4, -eps*(p-k+3)))
  sol <- c()
  for(i in 1:length(aux)){
    if(aux[i] >= 0) sol <- append(sol, aux[i])
  }
  return(sol)
}

# Solves the cubic coeff[4]*x^3 + coeff[3]*x^2 + coeff[2]*x + coeff[1] = 0
# (real roots only, via the trigonometric/Cardano formulas)
rootsfull <- function(coeff){
  c <- coeff[1]/coeff[4]
  b <- coeff[2]/coeff[4]
  a <- coeff[3]/coeff[4]
  Q <- (a^2 - 3*b)/9
  R <- (2*a^3 - 9*a*b + 27*c)/54
  Q3 <- Q^3
  disc <- R^2 - Q3
  if(disc >= 0){
    A <- -sign(R)*(abs(R) + sqrt(disc))^(1/3)
    if(A == 0.) B <- 0. else B <- Q/A
    return(c((A + B) - a/3))
  }
  else{
    theta <- acos(R/sqrt(Q3))
    x <- c(0, 0, 0)
    aux <- -2*sqrt(Q)
    x[1] <- aux*cos(theta/3)
    x[2] <- aux*cos((theta + 2*pi)/3)
    x[3] <- aux*cos((theta - 2*pi)/3)
    aux <- a/3
    for(i in 1:3) x[i] <- x[i] - aux
    return(x)
  }
}

##
## Zellner-Siow prior for the linear model; here we compute the Bayes
## factor to the null model exactly and using a Laplace approximation.
##
## The current model has $k$ covariates and there is a
## common intercept in all models being compared.
## The sample size is $n$.
##
##   R^2 = 1 - y'(I - P* - P)y / y'(I - P*)y   ( = 1 - epsilon )
##
##   P* = 1 (1'1)^(-1) 1' = 1 1'/n, where 1 is the nx1 vector of ones
##   P  = X (X'X)^(-1) X', where X is the nxk design matrix of the
##        current model, excluding the column of ones
##   I  = nxn identity matrix
##

##################################
# Exact Bayes factor to the null #
##################################

# Calculates the Bayes factor by numerical integration
ZSnull <- function(r2, n, k, c=n/2){
  if(k == 0) out <- 0
  else{
    aux <- function(u){
      return(integrand(u, n, k, r2, c=c))
    }
    out <- log(integrate(aux, 0, 1)$value)
  }
  return(out)
}

# Calculates the integrand in the Bayes factor; change of variables 1/(1+g) = u
# (see the numerical sanity checks at the end of this file)
integrand <- function(u, n, k, r2, c){
  aux <- -(n - 1)*log((1 - r2) + u*r2)/2
  aux <- aux + (k - 1)*log(u)/2
  aux <- aux - 3*log(1 - u)/2
  aux <- aux - c*u/(1 - u)
  aux <- aux + log(c)/2 - lgamma(1/2)
  return(exp(aux))
}

###########################
# A Laplace approximation #
###########################

# This returns the Laplace approximation to the Bayes factor to the null model
ZSnullLAP <- function(r2, n, k){
  if(k == 0) aux <- 0
  else{
    hat <- pmode(r2, n, k)
    if(length(hat) > 1) stop("more than one root")
    else{
      if(length(hat) == 0) stop("no roots found")
      else if(hat[1] == 0) stop("root at zero")
      else{
        aux <- lik(hat, r2, n, k) + (log(2*pi) - log(-info(hat, r2, n, k)))/2
      }
    }
  }
  return(aux)
}

# This returns log(likelihood times prior)
lik <- function(g, r2, n, k){
  aux <- (n - 1 - k)*log(1 + g) - (n - 1)*log(1 + (1 - r2)*g) - 3*log(g) - n/g
  aux <- aux/2
  aux <- aux + log(n/2)/2 - lgamma(1/2)
  return(aux)
}

# This returns the first derivative of log(lik())
score <- function(g, r2, n, k){
  aux <- (n - 1 - k)/(1 + g) - (n - 1)*(1 - r2)/(1 + (1 - r2)*g) - 3/g + n/g^2
  aux <- aux/2
  return(aux)
}

# And this returns the second derivative of log(lik())
info <- function(g, r2, n, k){
  aux <- -(n - 1 - k)/(1 + g)^2 + (n - 1)*(1 - r2)^2/(1 + (1 - r2)*g)^2 + 3/g^2 - 2*n/g^3
  aux <- aux/2
  return(aux)
}

# Returns the second derivative of the log-likelihood only
info2 <- function(g, r2, n, k){
  aux <- -(n - 1 - k)/(1 + g)^2 + (n - 1)*(1 - r2)^2/(1 + (1 - r2)*g)^2
  aux <- aux/2
  return(aux)
}

# This gives the posterior mode
pmode <- function(r2, n, k){
  # aux <- zapsmall(polyroot(c(n, n*r2-3, n-4-k-2*(1-r2), -(1-r2)*(k+3))))
  aux <- roots(c(n, n*(2 - r2) - 3, n - 4 - k - 2*(1 - r2), -(1 - r2)*(k + 3)))
  sol <- c()
  for(i in 1:length(aux)){
    if(aux[i] >= 0) sol <- append(sol, aux[i])
  }
  return(sol)
}

# Solves the cubic coeff[4]*x^3 + coeff[3]*x^2 + coeff[2]*x + coeff[1] = 0
# (real roots only, via the trigonometric/Cardano formulas)
roots <- function(coeff){
  c <- coeff[1]/coeff[4]
  b <- coeff[2]/coeff[4]
  a <- coeff[3]/coeff[4]
  Q <- (a^2 - 3*b)/9
  R <- (2*a^3 - 9*a*b + 27*c)/54
  Q3 <- Q^3
  disc <- R^2 - Q3
  if(disc >= 0){
    A <- -sign(R)*(abs(R) + sqrt(disc))^(1/3)
    if(A == 0.) B <- 0. else B <- Q/A
    return(c((A + B) - a/3))
  }
  else{
    theta <- acos(R/sqrt(Q3))
    x <- c(0, 0, 0)
    aux <- -2*sqrt(Q)
    x[1] <- aux*cos(theta/3)
    x[2] <- aux*cos((theta + 2*pi)/3)
    x[3] <- aux*cos((theta - 2*pi)/3)
    aux <- a/3
    for(i in 1:3) x[i] <- x[i] - aux
    return(x)
  }
}

## Calculates the marginal distribution of Y under a g-prior
margy <- function(g, n, k, r2){
  # If the null model, force the marginal to zero
  if(k == 0) out <- 0
  else{
    # computed via exp() of the log expression to avoid overflow of
    # (1+g)^((n-k-1)/2) for large n
    out <- exp(0.5*(n - k - 1)*log(1 + g) - 0.5*(n - 1)*log(1 + g*(1 - r2)))
  }
  return(out)
}
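
## ---------------------------------------------------------------------------
## Usage sketch (illustration only, not part of the original routine). The
## simulated data set, variable names and chosen prior below are assumptions
## made for the demo; wrap in if(FALSE) so sourcing this file does not run it.
## ---------------------------------------------------------------------------
if(FALSE){
  set.seed(1)
  n.obs <- 100
  X <- matrix(rnorm(n.obs * 4), n.obs, 4,
              dimnames = list(NULL, c("x1", "x2", "x3", "x4")))
  y.sim <- 1 + 2*X[, 1] - X[, 3] + rnorm(n.obs)

  fit <- bayesreg(X, y.sim, prior = "ZSnull")

  ## models listed in decreasing posterior probability
  fit$label[fit$post.order]
  fit$postprob[fit$post.order]

  ## model-averaged coefficient summaries (intercept first)
  cbind(postmean = fit$postmean, postsd = fit$postsd)
}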
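
## ---------------------------------------------------------------------------
## Numerical sanity checks (illustration only). The values r2 = 0.5, n = 50,
## k = 3 are arbitrary demo choices. They check that (i) the u-scale integral
## in ZSnull() matches a direct integral of exp(lik(g)) over g (the change of
## variables u = 1/(1+g)), (ii) the Laplace approximation ZSnullLAP() is close
## to the exact value, and (iii) roots() agrees with base R's polyroot() on
## the cubic used by pmode().
## ---------------------------------------------------------------------------
if(FALSE){
  r2 <- 0.5; n <- 50; k <- 3

  ## (i) exact log Bayes factor on the u scale vs. the g scale
  lbf.u <- ZSnull(r2, n, k)
  lbf.g <- log(integrate(function(g) exp(lik(g, r2, n, k)), 0, Inf)$value)
  c(u.scale = lbf.u, g.scale = lbf.g)

  ## (ii) exact vs. Laplace approximation
  c(exact = lbf.u, laplace = ZSnullLAP(r2, n, k))

  ## (iii) real root(s) of the posterior-mode cubic vs. polyroot()
  cf <- c(n, n*(2 - r2) - 3, n - 4 - k - 2*(1 - r2), -(1 - r2)*(k + 3))
  pr <- polyroot(cf)   # all complex roots; coefficients in increasing degree
  c(roots = sort(roots(cf)), polyroot = sort(Re(pr[abs(Im(pr)) < 1e-8])))
}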