load BC49DATA x=AD7129; Des=Descriptions; x(x<=1)=1; x=log2(x);[N,n]=size(x); % ---------------------------------------------------------------------------- % find the 100 genes most correlated with the ER gene in expression levels ngene=100; [ind,s_ER]=select_genes_cor(x,C_ER,ngene); X=x(ind,:); des=Des(ind,:) % perform SVD, and look at some of results [A,D,F]=svd_mw(X); 100*D.*D/sum(D.*D) % or, ... [A,D,F]=svd_mw(X-repmat(mean(X,2),1,n)); D 100*D.*D/sum(D.*D) figure(1) scatter(1:n,F(1,:)) % first factor on each of 49 arrays ShowGene(F,C_ER,1) % factor is a 'metagene' F(1,:)=-F(1,:); A(:,1)=-A(:,1); % sign is arbitrary ShowGene(F,C_ER,1) scatter(1:n,F(2,:)) % second factor ShowGene(F,C_ER,2) scatter(1:n,F(3,:)) % third factor ShowGene(F,C_ER,3) % now scatter plot pairs of factors pairs(F,[1 2]) pairs(F,[1 2 3]) % now a fancy scatter plot of pairs with color coding for two groups bcpairs(F,C_ER,[1 2]) bcpairs(F,C_ER,[1 2 3]) bcpairs(F,C_ER,[3 4 5]) % look at loadings of genes on factors A(:,[1 2]) scatter(1:ngene,A(:,1)) scatter(1:ngene,A(:,3)) Des( find(abs(A(:,3))>.2) , :) [a,j]=sort(A(:,1)); size(j) scatter(1:ngene,A(j,1)) j=flipud(j); scatter(1:ngene,A(j,1)) des(j(1:10),:) des(j(end:-1:(end-5)),:) clf sc3(F,C_ER,[1 2 3]) % interact, rotate etc ; look at sc3 function names=[ 'ps2 ', 'er1 ', 'tff1 ', 'cp450 ', 'igfr ', 'c23948']; for i=1:6 subplot(3,2,i) h=ind(j(i)); Des(h,:) bar( find(C_ER==0), x(h,C_ER==0),'b') hold on bar( find(C_ER==1), x(h,C_ER==1),'r') hold off title(names(i,:)) xlabel('Tumor sample') ylabel('Expn') end y=x; c_er=C_ER; [a,i]=sort(C_ER); y=y(:,i); c_er=C_ER(i); for i=1:6 subplot(3,2,i) h=ind(j(i)); Des(h,:) bar( find(c_er==0), 2.^y(h,c_er==0),'b') hold on bar( find(c_er==1), 2.^y(h,c_er==1),'r') hold off title(names(i,:)) xlabel('Tumor sample') ylabel('Expn') end % % -------------------------------------------------------------------- % Start on factor regressions % load data in again % order data by ER status -- just for graphics [a,j]=sort(C_ER); x=x(:,j); C_ER=C_ER(j); Des(5524,:) ShowGene(x,C_ER,5524) X=std_rows(x); y=X(5524,:)'; X(5524,:)=0; [N,n]=size(X); p=N; [ind,s]=select_genes_cor(X,y,N); hist(s,40); s(ind(1:5)) Des(ind(1:5),:) bcpairs(x,C_ER,ind(1:2)) bcpairs(x,C_ER,ind(1:3)) ig=find(abs(s)>.5); length(ig) % create factors on these 89 genes [A,D,F]=svd_mw(X(ig,:)); ShowGene(F,C_ER,1) [inf,r]=select_genes_cor(F,y,n); hist(r,20); r(inf(1:5)) %%%% NOTE WELL: Factor one is better than any one gene alone ... bcpairs(F,C_ER,inf(1:3)) clf figure(1); subplot(2,1,1); bar(1:n,x(5524,:)) subplot(2,1,2); ShowGene(x,C_ER,5524) figure(2); imagesc(std_rows(X(ig,:))); cmap rg; colorbar imagecf(std_rows(X(ig,:)),'rg'); colorbar imagecf(std_rows(X(ig,:)),'rg',[1 2]); colorbar cmap hot cmap jet %% regression on factors? % [beta,v,prob,haty,e]=mreg(y,F(1:5,:)); [beta v prob] stepwise(F(1:5,:)',y) [beta,v,prob,haty,e]=mreg(y,F(1:10,:)); [beta v prob] [beta,v,prob,haty,e]=mreg(y,F([1 6],:)); [beta v prob] %% -- refit [beta,v,prob,haty,e]=mreg(y,F(1,:)); [beta v prob] %% find resids and see what correlates with them from the remaining genes... [ing,t]=select_genes_cor(X,e,N); hist(t,40); t(ing(1:5)) ig=find(abs(t)>.35); length(ig) [a,d,f]=svd_mw(X(ig,:)); bcpairs([e'; f(1,:)],C_ER,[1 2]) H=[F(1,:); f(1:2,:)]; [beta,v,prob,haty,e]=mreg(y,H); stepwise(H',y) % --------------------------------------------------------------------- % Now let's go back and find SVD of all genes ... [A,D,F]=svd_mw(x-repmat(mean(x,2),1,n)); select_genes_cor(A,C_ER,10) % top 200 genes on first factor, relate to ER status d=genefactor(A,s_ER,Des,200,15); d(1:20,:) % other factors, ER status ... d=genefactor(A,s_ER,Des,200,1); d(1:20,:) % factor 2 looks interesting % correlation relates to loading d=genefactor(A,s_ER,Des,200,3); d(1:20,:) d=genefactor(A,s_ER,Des,200,6); d(1:20,:) % factor 6 looks interesting, more so % now let's look at Nodal status ... [ind,s_NO]=select_genes_cor(x,LNpos>0,ngene); X=x(ind,:); des=Des(ind,:) ShowGene(X,LNpos>0,1) scatter(X(1,:),log2(1+LNpos)) [ind,s_NO]=select_genes_cor(x,log2(1+LNpos),ngene); X=x(ind,:); des=Des(ind,:) ShowGene(X,LNpos>0,1) scatter(X(1,:),log2(1+LNpos)) [A,D,F]=svd_mw(X d=genefactor(A,s_NO,Des,200,1); d(1:20,:) d=genefactor(A,s_NO,Des,200,2); d(1:20,:) d=genefactor(A,s_NO,Des,200,4); d(1:20,:) % etc % of interest to repeat this exploration using subsets of genes % ---------------------------------------------------------------------- %% Now more on factor regressions using full Bayesian MCMC analysis %% % read in data, remove Affy controls and genes that do not vary across arrays % set up validation cases ivalid=sort(unique([ivalidER 7 8])) % fit Bayesian linear model, predicting expression of a single gene using % SVD regression on the remaining genes; output approx posterior means and SDs % of regression parameters igene=5524; Des(igene,:) % select this gene as response, and create data and description for the rest ... X=x; y=X(igene,:)'; des=Des; des(igene,:)=[]; X(igene,:)=[]; [beta,li,alli]=Mregsvd(X,y,ivalid,20,4,[2000 500 1],0); [beta,li,alli]=Mregsvd(X,y,ivalid,20,4,[500 200 1],1); [b,i]=sort(abs(beta)); i=flipud(i); des(li(i(1:5)),:) % again ... q=20; igene=i(1:q); [int2str((1:q)'),repmat(' ',q,1),num2str(beta(igene),2),repmat(' ',q,1),des(li(igene),1:80)]