% Simple example: Compute raw correlations of ER gene with others,
% and look in various ways at the expression patterns
%
% Includes simple image plots, and multiple regression via least squares fit
%
% Use variants of this code to look at how regression coeffs vary as different
%  subsets of genes are fitted: compare coeffs with raw correlations
%
% Can also use variants of this code to select genes most highly correlated 
%  with an outcome, even a binary outcome 
%

%% --- read in data first, then create expression matrix x
x=AD7129; Des=Descriptions; x(x<=1)=1; x=log2(x);
[N,n]=size(x)

y=x(5524,:); 
	% selects estrogen receptor gene as a response

Des(5524,:)

clf
scatter(1:n,y)

sERG=[]; 	 		% vector sERG holds correlations
v=std(x'); 			% sample stds of all genes
plot(v)				% note that some are zero 
	
    for i=1:N
        if (v(i)>0) 
 	    c=corrcoef(y,x(i,:));  
	    sERG=[sERG; c(1,2) ];
        else
	    sERG=[sERG;0]; 
        end;
    end;
 
plot(sERG)
plot(sort(sERG))
   
% explore most highly correlated genes, say 50 of them
%  via image plot of expression levels 

ngene=50; 
[b,i]=sort(abs(sERG)); i=flipud(i); ind=i(1:(1+ngene)); 
  % that sorts genes by absolute correlation with ER gene
plot(sERG(ind))

[b,i]=sort(sERG(ind)); i=flipud(i); 
  % that orders from pos to neg correlation
  % can look at names with Des(ind(i),:)
  % now image plot ...

imagesc(x(ind(i),:)); colormap hot

% let's reorder columns (arrays) to have ER+ first, ER- next
 j=1:n; j=[j(C_ER==1) j(C_ER==0)]; 
 imagesc(x(ind(i),j)); colormap hot
 hold on 
 b=sum(C_ER==1);         plot([b b],[0 ngene+1]);  % separates +/- arrays
 b=0.5+sum(sERG(ind(i))>0); plot([0 n+1],[b b]);      % separates pos/neg genes
 hold off


% -------------------------------------------------------
% let's select just a few genes for regression ... 	

p=40; [b,i]=sort(abs(sERG)); i=flipud(i); ind=i(2:(1+p)); 
sERG(ind)
sERG(ind)'
Des(ind,:)

scatter(x(ind(1),:),y)
scatter(x(ind(20),:),y)

subplot(2,2,1); scatter(x(ind(1),:),y)
subplot(2,2,2); scatter(x(ind(2),:),y)
subplot(2,2,3); scatter(x(ind(3),:),y)
subplot(2,2,4); scatter(x(ind(4),:),y)


% -------------------------------------------------------
% fit linear regression models using least squares -- using
%  only the first 8 predictors -- to predict/explain the ER gene
clf
size(y)
y=reshape(y,n,1);
size(y)
size(x)
[beta,v,prob,haty,e]=mreg(y,x(ind(1:8),:));

[beta(2:9),sERG(ind(1:8))]	     % compare coeffs with correlations

scatter(haty,y); xlabel('fitted values'); ylabel('response')
				     % compare data and fitted values	
scatter(haty,e); xlabel('fitted values'); ylabel('response')
				     % residuals from fit should look random
				     % symmetric about zero, no structure

% draw a plot and "hold" it "on" for use later ... fitted values again
figure(5); scatter(haty,y); xlabel('fitted values'); ylabel('response'); hold on

% --------------------------------------------------------------------------
% explore some selection of predictors ... drops coeffs at 2.5% significance

[beta,v,prob,haty,e,varind]=mregauto(y,x(ind,:),0.025);
	% creates figure 4 with estimates and intervals of betas

varind
	% which predictors are selected from 1-20
selind=ind(varind);    
	% index numbers of selected genes in full set
Des(selind,:)
	% names of these selected genes
figure(5); scatter(haty,y,'r+'); hold off
	% adds fitted values, in red, to initial plot
	% similar to that with 8 top genes: parsimony, meaning
prob
	% significance levels for selected predictors
scatter(haty,e)
	% residuals from this refined model


% -------------------------------------------------------
% now let's do a predictive analysis -- use these genes but refit the model to
% only the first 43 arrays and predict the rest ...

[beta,v,prob,haty,e,predy,predv]=mregpredict(y,x(selind,:), 44:49);
	% creates fig 2 with betas 
	%    and  fig 3 with point and 95% interval estimates of 
	       predicted outcomes on cases 44-49