#####################################
# Mixture of exponentials model for #
# survival data. Unknown number of  #
# components. Use R.J. MCMC for     #
# inference.                        #
# Last modified: 04/15/02           #
#####################################

#Simulate data from a 5 component model:
l<-c(0.5,0.75,1,1.25,1.5)
p<-rep(1/5,5)
n<-200
ll<-sample(l,size=n,replace=T,prob=p)
table(ll)
#ll
# 0.5 0.75    1 1.25  1.5
#  40   43   42   37   38
x<-rexp(n,rate=ll)

#right censor about 30% of the sample at t=1.5
sum(x>1.5)
#[1] 60
d<-1-(x>1.5)
table(d)
#d
#  0   1
# 60 140
x[x>1.5]<-1.5

#------------- plot the hazard for this model: ------------#
hm<-function(x,l,p){ #vector arguments x, l and p
  h<-t(outer(exp(-l),x,FUN="^"))    #exp(-l_k * x_i), one column per component
  h<-sweep(h,2,p,FUN="*")
  hd<-apply(h,1,sum)                #mixture survival function at each x_i
  hn<-sweep(h,2,l,FUN="*")          #mixture density terms
  hn<-sweep(hn,1,hd,FUN="/")
  return(apply(hn,1,sum))           #hazard = density / survival
}

postscript("hf.ps",horizontal=F)
grid<-seq(0,10,by=0.01)
plot(grid,hm(grid,l,p),type="l",xlab="time",ylab="hazard",
     main="5 Component Exponential Hazard Function")
dev.off()

#------------ some useful functions ----------------#

#cell probabilities for imputing the latent component
cl<-function(l,p,x,d){
  f<-t(outer(l,d,FUN="^"))          #l_k^d_i (density contribution if uncensored)
  f<-f*t(outer(exp(-l),x,FUN="^"))  #times exp(-l_k * x_i)
  f<-sweep(f,2,p,FUN="*")
  f<-sweep(f,1,apply(f,1,sum),FUN="/")
  return(f)
}

#-------------- 5 component model sampling: ------------------------#
lk<-c(0,l,Inf)
II<-50000
samp<-matrix(0,nrow=II,ncol=10)

#prior on lambda_k is Gamma(a,b) for all k
a<-b<-0.5

for (i in 1:II){
  #new latent indicators:
  pk<-cl(lk[2:6],p,x,d)
  #each row of pk is passed to sample() as its prob argument
  k<-apply(pk,1,FUN="sample",x=5,size=1,replace=TRUE)
  #new mixing fractions:
  p<-rgamma(5,shape=table(factor(k,levels=1:5))+1)
  p<-p/sum(p) #now Dirichlet
  #new lambdas:
  for (kk in 1:5){
    #inverse cdf method (need to observe the order constraints)
    pg<-pgamma(lk[c(kk,kk+2)],shape=sum(d[k==kk])+a,rate=sum(x[k==kk])+b)
    u<-runif(1)
    qg<-pg[1]+(pg[2]-pg[1])*u
    lk[kk+1]<-qgamma(qg,shape=sum(d[k==kk])+a,rate=sum(x[k==kk])+b)
  }
  samp[i,]<-c(lk[2:6],p)
}

#--------------------- "Output Analysis" -----------------------------#
postscript("trace.lambda.5mixexp.ps",horizontal=F)
par(mfrow=c(5,1))
for (i in 1:5) plot(samp[,i],main=paste("Trace of Lambda_",i,sep=""),
                    xlab="Iteration number",ylab="Lambda")
dev.off()

postscript("trace.pi.5mixexp.ps",horizontal=F)
par(mfrow=c(5,1))
for (i in 6:10) plot(samp[,i],main=paste("Trace of Pi_",i-5,sep=""),
                     xlab="Iteration number",ylab="Pi")
dev.off()

postscript("hist.5mixexp.ps",horizontal=F)
par(mfrow=c(5,2))
for (i in c(1,6,2,7,3,8,4,9,5,10)) hist(samp[,i],nclass=50)
dev.off()

apply(samp,2,mean)
#II=50K:
# [1] 0.1842993 0.4667053 0.7687178 1.1231460 1.8866952
# [6] 0.1031825 0.1848607 0.2385244 0.2557332 0.2176993

library(coda,lib.loc="/usr/home/fac/iversen/lib/osf1/R")
chain<-mcmc(samp) #make type mcmc for coda
raftery.diag(chain,q=0.025,r=0.01,s=0.95)
#Quantile (q)    = 0.025
#Accuracy (r)    = +/- 0.01
#Probability (s) = 0.95
#
# Burn-in  Total  Lower bound  Dependence
# (M)      (N)    (Nmin)       factor (I)
#  6        2078   937          2.22
# 24        7314   937          7.81
# 35       10955   937         11.70
# 52       14183   937         15.10
# 24        7572   937          8.08
# 15        5480   937          5.85
# 24        6810   937          7.27
# 40       11336   937         12.10
# 40       11280   937         12.00
# 24        9592   937         10.20

#----------- unknown # components model sampling: -----------------#

#------------ some useful functions ----------------#

#mixture log-likelihood
mll<-function(l,p,x,d){
  f<-t(outer(l,d,FUN="^"))
  f<-f*t(outer(exp(-l),x,FUN="^"))
  f<-sweep(f,2,p,FUN="*")
  f<-sum(log(apply(f,1,sum)))
  return(f)
}

#log prior log Pr(pi,lambda,c)
lprior<-function(a,b,l,rho){
  #note: not a function of p b/c p ~ Dir(1)
  lp<-sum(dgamma(l,shape=a,rate=b,log=T))
  return(lp+dpois(length(l),lambda=rho,log=T))
}

#jacobian of the split transformation (inverted for a join)
J<-function(p,u,m="split"){
  ifelse(m=="split",return(p/(1-u)),return((1-u)/p))
}

#reciprocal of the (u,v) proposal density for a split; inverted for a join
qratio<-function(l1,l2,u,m="split"){
  r<-(l2-l1)/(6*u*(1-u))
  ifelse(m=="split",return(r),return(1/r))
}

#split move: map (u,v,lambda,p) to two new components
fs<-function(u,v,l,p){
  return(list(l1=v,l2=(l-(v*u))/(1-u),p1=u*p,p2=(1-u)*p))
}

#join move: inverse of fs()
fj<-function(l1,l2,p1,p2){
  return(list(u=p1/(p1+p2),v=l1,l=((l1*p1/(p1+p2))+(l2*p2/(p1+p2))),p=p1+p2))
}
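#--- illustrative sanity check of the split/join maps (sketch; not required by the sampler) ---#
#The *.chk values below are arbitrary test inputs.  fs() and fj() should be mutual
#inverses, a split should preserve the total mixing fraction, and J() should agree
#with a finite-difference Jacobian determinant of the split map (u,v,l,p)->(l1,l2,p1,p2).
u.chk<-0.3; v.chk<-0.8; l.chk<-1.2; p.chk<-0.25
s.chk<-fs(u.chk,v.chk,l.chk,p.chk)
j.chk<-fj(s.chk$l1,s.chk$l2,s.chk$p1,s.chk$p2)
all.equal(unlist(j.chk),c(u=u.chk,v=v.chk,l=l.chk,p=p.chk)) #TRUE: fj() undoes fs()
all.equal(s.chk$p1+s.chk$p2,p.chk)                          #TRUE: mixing fraction preserved
g.chk<-function(th) unlist(fs(th[1],th[2],th[3],th[4]))     #split map as a vector function
th0<-c(u.chk,v.chk,l.chk,p.chk)
eps<-1e-6
Jnum<-sapply(1:4,function(j){e<-numeric(4);e[j]<-eps;(g.chk(th0+e)-g.chk(th0))/eps})
abs(det(Jnum))              #numerical Jacobian determinant of the split map
J(p.chk,u.chk,m="split")    #value used in the acceptance ratio: p/(1-u)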
#Simulate data from a 5 component model:
l<-c(0.2,1,5,25,125)
p<-rep(1/5,5)
n<-1000
ll<-sample(l,size=n,replace=T,prob=p)
table(ll)
#ll
#0.2   1   5  25 125
#194 209 194 197 206
x<-rexp(n,rate=ll)
#note: do not censor
d<-rep(1,1000)

#-------------- sampling: --------------------------#
II<-100000

#prior on lambda_k is Gamma(a,b) for all k
a<-1
b<-0.1
rho<-5        #Poisson prior mean for the number of components
numk<-5       #current number of components
max.numk<-20  #storage limit on the number of components
samp<-matrix(0,nrow=II,ncol=2*max.numk)
samp.numk<-numeric(II)

#random starting values; these rates also regenerate the data below
l<-sort(rgamma(5,shape=1,rate=0.1))
#[1]  1.485107  1.639263  2.215048  5.073247 16.823258
p<-rgamma(5,shape=1,rate=1)
p<-p/sum(p)
#[1] 0.24062197 0.04646496 0.19900917 0.03409506 0.47980884
n<-1000
ll<-sample(l,size=n,replace=T,prob=p)
table(ll)
#ll
# 1.485107  1.639263  2.215048  5.073247 16.823258
#      244        40       200        32       484
x<-rexp(n,rate=ll)
#note: do not censor
d<-rep(1,1000)

lk<-c(0,l,Inf)

for (i in 1:II){

  if ((i/10)==round(i/10,0)) cat(paste("Iter ",i,"\n",sep=""))

  numk.i<-length(p)
  ud<-runif(1)

  if (ud<1/3){ #do not jump dimensions
    #new latent indicators:
    pk<-cl(lk[2:(numk+1)],p,x,d)
    k<-apply(pk,1,FUN="sample",x=numk,size=1,replace=TRUE)
    #new mixing fractions:
    p<-rgamma(numk,shape=table(factor(k,levels=1:numk))+1)
    p<-p/sum(p) #now Dirichlet
    #new lambdas:
    for (kk in 1:numk){
      #inverse cdf method (need to observe the order constraints)
      pg<-pgamma(lk[c(kk,kk+2)],shape=sum(d[k==kk])+a,rate=sum(x[k==kk])+b)
      u<-runif(1)
      qg<-pg[1]+(pg[2]-pg[1])*u
      lk[kk+1]<-qgamma(qg,shape=sum(d[k==kk])+a,rate=sum(x[k==kk])+b)
    }
    #cat(paste("Iter ",i,": stay\n",sep=""))
    samp[i,c(1:numk,(max.numk+1):(max.numk+numk))]<-c(lk[2:(numk+1)],p)
    samp.numk[i]<-numk
  } # end of sample same dimensional model

  if (((numk.i>1)&&(ud>=1/3)&&(ud<2/3))||
      ((numk.i==1)&&(ud>=1/3))){ #split: move up a dimension
    kk<-sample(1:numk,size=1)
    u1<-rbeta(1,2,2)
    v1<-runif(1,min=lk[kk],max=lk[kk+1])
    f<-fs(u1,v1,lk[kk+1],p[kk])
    if (f$l1>f$l2) cat("ERROR in fs()\n")
    l.prop<-c(lk[1:kk],f$l1,f$l2,lk[(kk+2):(numk+2)])
    l.prop<-l.prop[-c(1,numk+3)]
    p.prop<-c(0,p,0)
    p.prop<-c(p.prop[1:kk],f$p1,f$p2,p.prop[(kk+2):(numk+2)])
    p.prop<-p.prop[-c(1,numk+3)]
    #browser()
    alpha<-mll(l.prop,p.prop,x,d)-mll(lk[2:(numk+1)],p,x,d)
    alpha<-alpha+lprior(a,b,l.prop,rho)-lprior(a,b,lk[2:(numk+1)],rho)
    alpha<-exp(alpha)*qratio(lk[kk],lk[kk+1],u1,m="split")*J(p[kk],u1,m="split")
    if (numk.i==1) alpha<-2*alpha
    #cat(paste("Iter ",i,": split, alpha = ",alpha,"\n",sep=""))
    if (runif(1)<alpha){ #accept the split
      lk<-c(0,l.prop,Inf)
      p<-p.prop
      numk<-numk+1
    }
    samp[i,c(1:numk,(max.numk+1):(max.numk+numk))]<-c(lk[2:(numk+1)],p)
    samp.numk[i]<-numk
  } # end of split move

  if ((numk.i>1)&&(ud>=2/3)){ #join: drop a dimension
    kk<-sample(1:(numk-1),size=1)
    f<-fj(lk[kk+1],lk[kk+2],p[kk],p[kk+1])
    l.prop<-lk[-(kk+2)]
    l.prop[kk+1]<-f$l
    l.prop<-l.prop[2:numk]
    p.prop<-p[-(kk+1)]
    p.prop[kk]<-f$p
    #browser()
    alpha<-mll(l.prop,p.prop,x,d)-mll(lk[2:(numk+1)],p,x,d)
    alpha<-alpha+lprior(a,b,l.prop,rho)-lprior(a,b,lk[2:(numk+1)],rho)
    alpha<-exp(alpha)*qratio(lk[kk],f$l,f$u,m="join")*J(f$p,f$u,m="join")
    #cat(paste("Iter ",i,": join, alpha = ",alpha,"\n",sep=""))
    if (runif(1)<alpha){ #accept the join
      lk<-c(0,l.prop,Inf)
      p<-p.prop
      numk<-numk-1
    }
    samp[i,c(1:numk,(max.numk+1):(max.numk+numk))]<-c(lk[2:(numk+1)],p)
    samp.numk[i]<-numk
  } # end of join move

} # end of RJMCMC loop

#--------------------- "Output Analysis" -----------------------------#
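#posterior on the number of components: samp.numk records the dimension visited at
#each iteration, so its relative frequencies estimate Pr(c=k | data).  This block is
#an illustrative sketch; the 1000-iteration burn-in and output file name are arbitrary choices.
burn<-1000
keep<-(burn+1):II
k.post<-prop.table(table(samp.numk[keep]))
round(k.post,3)
postscript("rj.numkpost.ps",horizontal=F)
barplot(k.post,xlab="Number of Components",ylab="Estimated Posterior Probability",
        main="Posterior Distribution of Model Dimension")
dev.off()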
#> l
#[1]  1.485107  1.639263  2.215048  5.073247 16.823258
#> p
#[1] 0.40906301 0.25482845 0.04423823 0.29187031

apply(samp[samp.numk==2,c(1:2,(max.numk+1):(max.numk+2))],2,mean)
#[1]  1.6362581 14.7150359  0.4507706  0.5492294

apply(samp[samp.numk==3,c(1:3,(max.numk+1):(max.numk+3))],2,mean)
# [1]  1.4492121  6.8556491 18.5952831
# [4]  0.3394099  0.2787735  0.3818166

apply(samp[samp.numk==4,c(1:4,(max.numk+1):(max.numk+4))],2,mean)
#[1]  1.3215290  4.3594756 11.1575737 20.9709428
#[5]  0.2683048  0.2254216  0.2260962  0.2801773

postscript("rj.numktrace.ps")
par(las=1)
plot(samp.numk,xlab="Iteration Number",ylab="Number of Components",
     main="Trace Plot of Model Dimension")
dev.off()
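#posterior mean hazard: an illustrative sketch that averages the mixture hazard hm()
#(defined above) over retained draws, conditional on the most frequently visited
#dimension.  The burn-in, thinning, time grid and output file name are arbitrary choices.
burn<-1000
keep<-(burn+1):II
kmode<-as.numeric(names(which.max(table(samp.numk[keep]))))
idx<-keep[samp.numk[keep]==kmode]
idx<-idx[seq(1,length(idx),by=max(1,length(idx)%/%2000))] #thin to at most ~2000 draws
tgrid<-seq(0.01,2,by=0.01)
hbar<-rowMeans(sapply(idx,function(ii)
  hm(tgrid,samp[ii,1:kmode],samp[ii,(max.numk+1):(max.numk+kmode)])))
postscript("rj.posthazard.ps",horizontal=F)
plot(tgrid,hbar,type="l",xlab="time",ylab="hazard",
     main=paste("Posterior Mean Hazard given ",kmode," Components",sep=""))
dev.off()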