% Simple example: Compute raw correlations of ER gene with others, % and look in various ways at the expression patterns % % Includes simple image plots, and multiple regression via least squares fit % % Use variants of this code to look at how regression coeffs vary as different % subsets of genes are fitted: compare coeffs with raw correlations % % Can also use variants of this code to select genes most highly correlated % with an outcome, even a binary outcome % %% --- read in data first, then create expression matrix x x=AD7129; Des=Descriptions; x(x<=1)=1; x=log2(x); [N,n]=size(x) y=x(5524,:); % selects estrogen receptor gene as a response Des(5524,:) clf scatter(1:n,y) [ind,sERG]=select_genes_cor(x,y,N); % vector sERG holds correlations hist(sERG,100) % explore most highly correlated genes, say 50 of them % via image plot of expression levels plot(sERG(ind(1:50))) imagecf(x(ind(1:50),:),'gr') % let's reorder columns (arrays) to have ER+ first, ER- next [a,j]=sort(C_ER); imagesc(x(ind(1:50),j)); colormap hot b=sum(C_ER); line(0.5+[b b],[0 51]) % separates +/- arrays c=0.5+sum(sERG(ind(1:50))>0); line([0 n+1],[c c]+.5); % separates pos/neg genes imagecf(x(ind(1:50),j),'hot'); colormap hot b=sum(C_ER); line(0.5+[b b],[0 51]) % separates +/- arrays c=0.5+sum(sERG(ind(1:50))>0); line([0 n+1],[c c]+.5); % separates pos/neg genes % ----------------------------------------------------------------------------- % or,how about simply [ind,sERG]=select_genes_cor(x,C_ER,N); j=1:n; j=[j(C_ER==1) j(C_ER==0)]; imagesc(x(ind(1:100),j)); colormap('jet') X=std_rows(x); imagesc(X(ind(1:100),j)); colormap('jet') colormap hot % cover functions .... imagecf(X(ind(1:50),j),'hot') cmap jet cmap winter cmap gr b=sum(C_ER==1); line(0.5+[b b],[0 51]) c=0.5+sum(sERG(ind(1:50))>0); line([0 50],[c+.5 c+.5]) text(13,-2,'ER+ Cases','horizontalalignment','center') text(37,-2,'ER- Cases','horizontalalignment','center') %% % let's do some more images ... now ER status related % ind=select_genes_cor(x,C_ER,100); size(ind) imagesc(x(ind,:)); cmap gr; colorbar imagesc(std_rows(x(ind,:))); cmap gr; colorbar imagecf(x(ind,:),'gr') [a,i]=sort(C_ER); ShowGene(x(:,i),C_ER(i),5524) imagecf(x(ind,i),'gr') imagecf(std_rows(x(ind,i)),'gr'); X=std_rows(x); imagecf(X(ind,i),'jet') imagecf(X(ind,i),'gr',1) imagecf(X(ind,i),'gr',2) imagecf(X(ind,i),'gr',[1 2]); colorbar imagecf(X(ind,i),'gr',1:5) % look at the imagecf function ... % % some more with additional genes ... Des(2542,:) ind=[ind select_genes_cor(x,x(2542,:),25)]; size(ind) size(unique(ind)) imagecf(X(ind,i),'gr',1:2); colorbar % ------------------------------------------------------- % let's select just a few genes for regression ... p=40; ind=select_genes_cor(x,C_ER,p); sERG(ind) sERG(ind)' Des(ind,:) scatter(x(ind(1),:),y) scatter(x(ind(20),:),y) subplot(2,2,1); scatter(x(ind(1),:),y) subplot(2,2,2); scatter(x(ind(2),:),y) subplot(2,2,3); scatter(x(ind(3),:),y) subplot(2,2,4); scatter(x(ind(4),:),y) % ------------------------------------------------------- % fit linear regression models using least squares -- using % only the first 4 predictors -- to predict/explain the ER gene clf size(y) y=reshape(y,n,1); size(y) size(x) [beta,v,prob,haty,e]=mreg(y,x(ind(2:5),:)); % dont uses x(1,:) as that's y [beta(2:5) sERG(ind(2:5)) v(2:5) prob(2:5)] % compare coeffs with correlations scatter(haty,y); xlabel('fitted values'); ylabel('response') % compare data and fitted values scatter(haty,e); xlabel('fitted values'); ylabel('response') % residuals from fit should look random % symmetric about zero, no structure % draw a plot and "hold" it "on" for use later ... fitted values again figure(5); scatter(haty,y); xlabel('fitted values'); ylabel('response'); hold on % -------------------------------------------------------------------------- % matlab toolbox functions for linear regression: help regress help stepwise [Beta,bint,r,rint,stats]=regress(y,[ones(n,1) x(ind(2:5),:)']); [beta Beta] bint [ bint(:,1) Beta bint(:,2)] stats % now stepwise help stepwise stepwise(x(ind(2:5),:)',y) Beta % click on graphs in gui ... see significance, multicollineaity stepwise(x(ind(2:10),:)',y) % --------------------------------------------------------------------- % Shrinkage prior analyses .. % look at function mregbayes % [Beta,v,prob,haty,e]=mreg(y,x(ind(2:5),:)); [beta,v,prob,haty,e]=mregbayes(y,x(ind(2:5),:),1e6); [beta Beta] [beta,v,prob,haty,e]=mregbayes(y,x(ind(2:5),:),10); [beta Beta] [beta,v,prob,haty,e]=mregbayes(y,x(ind(2:5),:),1); [beta Beta] [beta,v,prob,haty,e]=mregbayes(y,x(ind(2:5),:),.01); [beta Beta] %%% -- may also use a prior inverse gamma prior on residual variance %% see function help mregbayes [beta,v,prob,haty,e]=mregbayes(y,x(ind(2:5),:),0.01,[ 1 1] ); %%% Now, may use this to output marginal model (=tau) likelihood function to % also assess relevant values of tau. For example, [beta,v,prob,haty,e,ml]=mregbayes(y,x(ind(2:5),:),0.01); ml [beta,v,prob,haty,e,ml]=mregbayes(y,x(ind(2:5),:),0.5); ml [beta,v,prob,haty,e,ml]=mregbayes(y,x(ind(2:5),:),1); ml % so why not do it properly ... Tau=(1:20)/140; mlik=[]; for tau=Tau [beta,v,prob,haty,e,ml]=mregbayes(y,x(ind(2:5),:),tau); mlik=[mlik ml]; end scatter(Tau, mlik) % Hmm, need larger values ... Tau=(1:50)/100; mlik=[]; for tau=Tau [beta,v,prob,haty,e,ml]=mregbayes(y,x(ind(2:5),:),tau); mlik=[mlik ml]; end scatter(Tau, mlik) scatter(Tau, exp(mlik-max(mlik))) % looks like MLE is around 0.15 - check Tau( find(mlik==max(mlik)) ) [beta,v,prob,haty,e,ml]=mregbayes(y,x(ind(2:5),:),0.12); beta %% Binary outcome? [beta,v,prob,haty,e,ml]=mregbayes(C_ER,x(ind(2:5),:),0.15); beta scatter(haty,y) % need binary regression model .. % -------------------------------------------------------------------------- % explore some selection of predictors ... forward selection using "optimal" % choice at each step in terms of correlation with residuals at that step % look at mregbic function % [beta,v,varind,prob,haty,e,R2,BIC]=mregbic(y,x,5); % -------------------------------------------------------------------------- % another way to explore some selection of predictors ... drops coeffs at 2.5% significance [beta,v,prob,haty,e,varind]=mregauto(y,x(ind,:),0.025); % creates figure 4 with estimates and intervals of betas varind % which predictors are selected from 1-20 selind=ind(varind); % index numbers of selected genes in full set Des(selind,:) % names of these selected genes figure(5); scatter(haty,y,'r+'); hold off % adds fitted values, in red, to initial plot % similar to that with 8 top genes: parsimony, meaning prob % significance levels for selected predictors scatter(haty,e) % residuals from this refined model % ------------------------------------------------------- % now let's do a predictive analysis -- use these genes but refit the model to % only the first 43 arrays and predict the rest ... [beta,v,prob,haty,e,predy,predv]=mregpredict(y,x(selind,:), 44:49); % creates fig 2 with betas % and fig 3 with point and 95% interval estimates of predicted outcomes on cases 44-49