rm(list=ls()) #***************************************************************************** # SAMGS.LC Testing # based on 'Best Combination' #***************************************************************************** library(corpcor) library(qvalue) # GS.format.dataframe.to.list was called below in the function SAMGS GS.format.dataframe.to.list <- function(GS){ # GS : gene sets # -> a dataframe with rows=genes, # columns= gene sets, # GS[i,j]=1 if gene i in gene set j # GS[i,j]=0 otherwise # OR # a list with each element corresponding to a gene set = a vector of strings (genes identifiers) # if(is.data.frame(GS)){ genes <- rownames(GS) L <- NULL for(ags in names(GS)){ w <- which(GS[,ags]==1) if(length(w)>0) { L <- c(L,list(genes[w])) names(L)[length(L)] <- ags } } L }else{ GS } } T2.like.SAMGS <- function(DATA, cl){ # DATA : expression data # -> dataframe with rows=genes, # columns=samples, # weight: weights to T2 statistics of genes. # cl : factor defining a bipartition of the samples # IN THE SAME ORDER AS IN DATA if(!is.null(cl)){ cl <- as.factor(cl) C1 <- which(cl==levels(cl)[1]) C2 <- which(cl==levels(cl)[2]) } if(is.null(C1) | is.null(C2))stop("Error - SAMGS-PC.TlikeStat : classes 1 and 2 are undefined.") GS.size<-dim(DATA)[1]; if(GS.size>=2){ means.C1 <- rowMeans(DATA[,C1]) means.C2 <- rowMeans(DATA[,C2]) diffmean.C1C2 <- means.C1 - means.C2 } else{ means.C1 <- mean(DATA[C1]) means.C2 <- mean(DATA[C2]) diffmean.C1C2 <- means.C1 - means.C2 } sum(diffmean.C1C2^2) } LC<-function(DATA, cl, nbPermutations=1000, s0=0){ # DATA : single data set with dim pxn # -> a dataframe or matrix with rows=genes, # columns=samples # -> make sure that DATA is a dataframe or matric even p=1 # # cl : a factor defining a bipartition of the samples # ->IN THE SAME ORDER AS IN DATA # # nbPermutations: number of permutation. # # s0 : a positive constant # -> used to avoid 'small variation problem', as used in SAM and SAMGS GS.size<-dim(DATA)[1]; nb.Samples<-ncol(DATA); # nb of samples if(GS.size>=2){ # (1) shrinking the pooled covariance matrix. DATA<-t(DATA) # with dim nxp Cov.Pooled<-cov.shrink(DATA, verbose=FALSE, lambda.var=0) # (2) eigen decomposition of pooled covariance EIGEN.decom<-eigen(Cov.Pooled); D<-EIGEN.decom$values+s0; # further shrinkag by adding a possitive constant s0 U<-EIGEN.decom$vectors; # (3) adjust DATA DATA<-t(DATA%*%U)/sqrt(D) # with dim pxn } if(GS.size==1){ cov.pooled<-c(cov(t(DATA))) DATA<-DATA/sqrt(cov.pooled+s0) DATA<-as.matrix(DATA) } # (4) T2-like stats obtained on 'true' data sam.sumsquareT.obs<-T2.like.SAMGS(DATA,cl) # (5) T2-like stats obtained on 'permutation' data nb.Samples<-length(cl); sam.sumsquareT.permut <- rep(NA,nbPermutations) for(i in 1:nbPermutations) { ind <- sample(nb.Samples) data<-matrix(DATA[,ind],ncol=nb.Samples) sam.sumsquareT.permut[i] <- T2.like.SAMGS(data,cl); } # (6) p-value p.value <- sum(sam.sumsquareT.permut>=sam.sumsquareT.obs)/nbPermutations return(p.value) } SAMGS.LC <- function(GS, DATA, cl, nbPermutations=1000, if.Qvalue=FALSE){ # GS : gene sets # -> a dataframe with rows=genes, # columns= gene sets, # GS[i,j]=1 if gene i in gene set j # GS[i,j]=0 otherwise # OR # a list with each element corresponding to a gene set = a vector of strings (genes identifiers) # # # DATA : expression data # -> a dataframe with rows=genes, # columns=samples # # cl : a factor defining a bipartition of the samples IN THE SAME ORDER AS IN DATA # # nbPermutations: number of permutation. # # (1) pre-treatment of the gene sets genes <- rownames(DATA) # gene names of the microarray data nb.Samples <- ncol(DATA) # nb of samples nb.GeneSets <- length(GS) # nb of gene sets GS <- GS.format.dataframe.to.list(GS); # change format of GS from dataframe to list GS <- lapply(GS,function(z) as.numeric(which(genes %in% z))); # numericalized index of each GS GS.sizes <- sapply(GS,length) # size of each gene set GS.data <- lapply(GS, function(z) as.matrix(DATA[z, ],ncol=nb.Samples)); # creat data of each GS (rows=genes, columns=samples) # (2) testing data of each GS GeneSets.pval<-sapply(GS.data, function(z) LC(z, cl, nbPermutations=1000, s0=0)) # (3) q-value if(nb.GeneSets>=2 & if.Qvalue){ GeneSets.qval <-qvalue(GeneSets.pval)$qvalues res <- as.data.frame(cbind("GS size" = GS.sizes, "GS p-value" = GeneSets.pval, "GS q-value" = GeneSets.qval )) res <- cbind(res,"GS name"= names(GS))[c(4,1:3)] } else{ #if there is only one set, no need to calculate q-value. res <- as.data.frame(cbind("GS size" = GS.sizes, ##GeneSets.sizes, "GS p-value" = GeneSets.pval)) res <- cbind(res,"GS name"= names(GS))[c(3,1:2)] } rownames(res)<-NULL list("GS stats"=res) } #***************************************************************************** # Hierarchical Cluster Testing # based on (a) hierarchical cluster and (b) testing method #***************************************************************************** library(cluster) library(corpcor) library(qvalue) mergetosubset<-function(merge){ #merge: a component of hirarchical cluster object. # -> a matrix with two collumns descript the process of agglomeration. # 'merge' in a object created by 'agnes' # maxsteps<-dim(merge)[1] subset.list<-list() for(i in 1:maxsteps){ a<-merge[i,1] b<-merge[i,2] if(a<0) a<- -a else a<-subset.list[[a]] if(b<0) b<- -b else b<-subset.list[[b]] subset.list[[i]]<-c(a,b) } return(subset.list) } branch<-function(merge, node){ #merge : a component of hirarchical cluster object. # # -> a matrix with two collumns descript the process of agglomeration. # 'merge' in a object created by 'agnes' # #node : a number in 1:M with M the maximun steps used in cluster algorithm. # M=dim(merge)[1] # family<-node children<-merge[node,] family<-c(family, children) trace<-any(children>0) while(trace){ parents<-children[children>0] n<-length(parents) children<-NULL for(i in 1:n){ children<-c(children, merge[parents[i],]) } family<-c(family, children) trace<-any(children>0) } return(family) } hc.test<-function(DATA, cl, nbPermutations=1000, s0=0, critic=0.05){ # DATA : single data set with dim pxn # -> a dataframe or matrix with rows=genes, # columns=samples, # cl : a factor defining a bipartition of the samples # ->IN THE SAME ORDER AS IN DATA # # nbPermutations: number of permutation. # # s0 : a positive constant # -> used to avoid 'small variation problem', as used in SAM and SAMGS # # critic: critic value for testing. # # default testing method is LC, # we can adjust the codes to allow using alternative methods including # (1) SAMGS # (2) MANOVA # (3) LC+MANOVA # (4) SAMGS+MANOVA # 1. hierarchical cluster using 'agnes(cluster)' # ->the function agnes use a 'Agglomerative Nesting' strategy which is bottom-up GS.size<-dim(DATA)[1] object<-agnes(DATA) # 2. create subset list. merge<-object$merge subset.list<-mergetosubset(merge) # 3.hierarchical cluster testing: # -> we use a top-down testing strategy maxsteps<-dim(merge)[1]; sig.branch<-pvalue.branch<-adj.pvalue.branch<-rep(NA,maxsteps) sig.leave<-pvalue.leave<-adj.pvalue.leave<-rep(NA,GS.size) # (a) branch testing for (i in 1:maxsteps){ j<-maxsteps-i+1 sig<-sig.branch[j] if(is.na(sig)){ Id<-subset.list[[j]] Data<-DATA[Id,] pvalue.branch[j]<-LC(Data, cl, nbPermutations, s0=0) adj.pvalue.branch[j]<-pvalue.branch[j]*GS.size/length(Id) if(adj.pvalue.branch[j]<=critic) sig.branch[j]<-T else { branch.rejected<-branch(merge,j) branch.set<-branch.rejected[branch.rejected>0] sig.branch[branch.set]<- F leave.set<--branch.rejected[branch.rejected<0] sig.leave[leave.set]<- F } } } object[[10]]<-cbind(sig=sig.branch, pvalue=pvalue.branch, adj.pv=adj.pvalue.branch) names(object)[10]<-"branch" # (b) leave testing leave.na<-(1:GS.size)[is.na(sig.leave)] m<-length(leave.na) if(m>=1){ for(i in 1:m){ Id<-leave.na[i] Data<-matrix(DATA[Id,],1) pvalue.leave[Id]<-LC(Data, cl, nbPermutations, s0) adj.pvalue.leave[Id]<-pvalue.leave[Id]*GS.size if (adj.pvalue.leave[Id]<=critic) sig.leave[Id]<-T else sig.leave[Id]<-F } } object[[11]]<-cbind(sig=sig.leave, pvalue=pvalue.leave, adj.pv=adj.pvalue.leave) names(object)[11]<-"leave" return(object) } SAMGS.hct<-function(GS, DATA, cl, nbPermutations=1000, s0=0, critic=0.05){ # GS : gene sets # -> a dataframe with rows=genes, # columns= gene sets, # GS[i,j]=1 if gene i in gene set j # GS[i,j]=0 otherwise # OR # a list with each element corresponding to a gene set = a vector of strings (genes identifiers) # # # DATA : expression data # -> a dataframe with rows=genes, # columns=samples # # cl : a factor defining a bipartition of the samples IN THE SAME ORDER AS IN DATA # # nbPermutations: number of permutation. # # s0 : a positive constant # -> used to avoid 'small variation problem', as used in SAM and SAMGS # # critic: critic value for testing. # # (1) pre-treatment of the gene sets genes <- rownames(DATA) # gene names of the microarray data nb.Samples <- ncol(DATA) # nb of samples nb.GeneSets <- length(GS) # nb of gene sets C1.size <- table(cl)[1] # nb of samples in class 1 GS <- GS.format.dataframe.to.list(GS); # change format of GS from dataframe to list GS <- lapply(GS,function(z) as.numeric(which(genes %in% z))); # numericalized index of each GS GS.sizes <- sapply(GS,length); # size of each gene set GS.data <- lapply(GS, function(z) as.matrix(DATA[z, ],ncol=nb.Samples)); # creat data of each GS (rows=genes, columns=samples) # (2) Hierarchical Clustering and Hierarchical Testing object.list<-lapply(GS.data, function(z) hc.test(z, cl, nbPermutations, s0, critic)) return(object.list) } plot.hgst<-function(Object, SN.GS, genenames){ # Object : object created by using SAMGS.hct # genenames : a vector of gene names # SN.GS : series number of the GS which you want to plot par(mfrow=c(1,length(SN.GS))) GSnames<-names(Object) for(i in SN.GS){ object<-Object[[i]] if(missing(genenames)) genenames<-NULL else object$order.lab<-genenames[as.numeric(object$order.lab)] plot(object, ask = FALSE, which.plots = 2, main = paste("HLC test of", GSnames[i])) r.order<-object$order k<-length(r.order) r.merge<-object$merge l.leave<-1:k l.nots<-rep(NA,k-1) for (j in 1:(k-1)){ a<-r.merge[j,1] b<-r.merge[j,2] if(a<0) l.a<-l.leave[r.order==-a] if(a>0) l.a<-l.nots[a] if(b<0) l.b<-l.leave[r.order==-b] if(b>0) l.b<-l.nots[b] l.nots[j]<-(l.a+l.b)/2 } ht<-sort(object$height) for(ii in (k-1):1){ if(object$branch[ii,1]==1){ points(x=l.nots[ii],y=ht[ii], col=2, pch=19) } } leave.sig<-(object$leave[,1]==1) if (any(leave.sig==T)){ id<-(1:k)[leave.sig] order.id<-order(r.order)[id] # x position of sig.leave for(ii in 1:length(id)){ ht.id<-(1:(k-1))[apply(object$merge==-id[ii], 1, any)] points(x=order.id[ii],y=ht[ht.id]-max(ht)/10, col=2, pch=19) } } } }