Chapter 6 R code modules to be used for stand-alone analysis.

Base on the South Dakota State University analysis protocol
http://bioinformatics.sdstate.edu/

Integrated Differential Expression and Pathway analysis
1.ShinyGO: Gene Ontology Enrichment Analysis

6.1 Local installation

Local installation of this software is possible through steps below. But it is not supported or updated freqently. Local install is for non-profit organizations only. For-profit businesses please contact us to license the database.

To run iDEP on your local machine (Windows, MacOS, Linux):
Requirements:

More than 250GB available storage
More than 4GB memory
Most recent version of R and RStudio installed.

Upgrade to the most recent version of R and Rstudio.
Start RStudio and install all the R packages. As we need so many R packages, this may take several hours. You can let it run and get started on steps 3 and 4 below to save time. From RStudio console window:
source https://raw.githubusercontent.com/iDEP-SDSU/idep/master/classes/librarySetup.R
Download iDEP source code and example data files from GitHub. The best is to click the green “Code” button and select “Download ZIP” on this page. Unzip to a folder such as C:/IDEP, so that it contains all the subfolders such as config, classes, shinyapps, and so on.
Download the most recent database file from here. Unzip it to the same folder (C:/IDEP), so that your database can be found at C:/IDEP/data/data104.
Start Rstudio and load the ui.R and server.R scripts in the folder C:/IDEP/shinyapps/idep94. And then click on Run app. Similarily, the ShinyGO app could be started at the folder, C:/IDEP/shinyapps/go74/.

6.2 Install packages

source https://raw.githubusercontent.com/iDEP-SDSU/idep/master/classes/librarySetup.R

# dplyr complains this required libraries: libudunits2-dev, libmariadb-client-lgpl-dev
#demo
# install.packages("plotly", repos="http://cran.rstudio.com/", dependencies=TRUE)
# sometimes need to remove all installed packages: https://www.r-bloggers.com/how-to-remove-all-user-installed-packages-in-r/ 
list.of.packages <- c(
  "shiny", "shinyAce", "shinyBS", "plotly",
  "RSQLite", "gplots", 
  "ggplot2", "dplyr", #"tidyverse",
  "plotly",
  "e1071", "reshape2", "DT",
  "data.table", "Rcpp","WGCNA","flashClust","statmod","biclust","igraph","Rtsne",
  "visNetwork", "BiocManager","feather","shinyjs","reactable"
)

list.of.bio.packages  <- c(
  "getDEE2", "limma", "DESeq2", "edgeR", "gage", "fgsea", "ReactomePA", "pathview", "PREDA",
  "impute", "runibic","QUBIC","rhdf5", "STRINGdb",
  "PREDAsampledata", "sfsmisc", "lokern", "multtest", "hgu133plus2.db", 
   "org.Ag.eg.db","org.At.tair.db","org.Bt.eg.db","org.Ce.eg.db","org.Cf.eg.db",
   "org.Dm.eg.db","org.EcK12.eg.db","org.EcSakai.eg.db","org.Gg.eg.db",
   "org.Hs.eg.db","org.Mm.eg.db","org.Mmu.eg.db","org.Pf.plasmo.db",
   "org.Pt.eg.db","org.Rn.eg.db","org.Sc.sgd.db","org.Ss.eg.db","org.Xl.eg.db"
)

 if(1) { # remove all old packages, to solve problem caused by Bioconductor upgrade
    # create a list of all installed packages
     ip <- as.data.frame(installed.packages())
    # head(ip)
    # if you use MRO, make sure that no packages in this library will be removed
     ip <- subset(ip, !grepl("MRO", ip$LibPath))
    # we don't want to remove base or recommended packages either\
     ip <- ip[!(ip[,"Priority"] %in% c("base", "recommended")),]
    # determine the library where the packages are installed
     path.lib <- unique(ip$LibPath)
    # create a vector with all the names of the packages you want to remove
     pkgs.to.remove <- ip[,1]
    # head(pkgs.to.remove)
    # remove the packages
     sapply(pkgs.to.remove, remove.packages, lib = path.lib)
}

new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
new.bio.packages <- list.of.bio.packages[!(list.of.bio.packages %in% installed.packages()[,"Package"])]
notInstalledPackageCount = length(new.packages) + length(new.bio.packages)

#Install Require packages
while(notInstalledPackageCount != 0){

    if(length(new.packages)) install.packages(new.packages, repos="http://cran.rstudio.com/", dependencies=TRUE, quiet=TRUE)
    if(length(new.bio.packages)){
        BiocManager::install(new.bio.packages, ask = FALSE, dependencies=TRUE, quiet=TRUE)
    }

    new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
    new.bio.packages <- list.of.bio.packages[!(list.of.bio.packages %in% installed.packages()[,"Package"])]
    if( notInstalledPackageCount == length(new.packages) + length(new.bio.packages) )
    {
        #no new package installed.
        break
    }
    else {
       notInstalledPackageCount = length(new.packages) + length(new.bio.packages)
    }
}

#PGSEA is deprecated since Bioconductor 3.12. So we have to install manually from source.
BiocManager::install(c("GO.db","KEGG.db", "annaffy")) # required y PGSEA
install.packages("https://bioconductor.org/packages/3.10/bioc/src/contrib/PGSEA_1.60.0.tar.gz", 
                 repos=NULL, type="source")
list.of.bio.packages = c(list.of.bio.packages, "PGSEA") # add package for testing

#Load Packages
suc = unlist ( lapply(list.of.packages, require, character.only = TRUE) )
if(sum(suc) < length(list.of.packages) )
    cat ("\n\nWarnning!!!!!! These R packages cannot be loaded:", list.of.packages[!suc] )

suc = unlist ( lapply(list.of.bio.packages, require, character.only = TRUE) )
if(sum(suc) < length(list.of.bio.packages) )
    cat ("\n\nWarnning!!!!!! These Bioconductor packages cannot be loaded:", list.of.bio.packages[!suc] )

################################################################
# Install packages
################################################################

# dplyr complains this required libraries: libudunits2-dev, libmariadb-client-lgpl-dev
# install.packages("plotly", repos="http://cran.rstudio.com/", dependencies=TRUE)
# sometimes need to remove all installed packages: https://www.r-bloggers.com/how-to-remove-all-user-installed-packages-in-r/ 
#cat("\n Installing lots of R packages, this may take several hours ... \n
#   Each of these packages took years to develop.\n So be a patient thief.\n
#   \n Note 1: Sometimes dependencies needs to be installed manually. 
#   \n Note 2: If you are using an older version of R, and having trouble with package installation,
#   \n         sometimes, it is easier to install the new version of R and delete all old packages, and start fresh. 
#  ")

list.of.packages <- c("dendextend", "htmlwidgets","RSQLite", 
    "gplots", "ggplot2", "dplyr", "tidyverse","plotly","e1071", 
    "reshape2",  "Rcpp","flashClust","statmod",
    "biclust","igraph","Rtsne")

list.of.bio.packages  <- c("STRINGdb", "limma", "DESeq2", "edgeR", "gage", 
    "PGSEA", "fgsea", "ReactomePA", "pathview", "PREDA", "impute", "runibic",
    "QUBIC","rhdf5", "PREDAsampledata", "sfsmisc", "lokern", "multtest", 
    "GSEABase","hgu133plus2.db")

#Install Require packages
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages, repos="http://cran.rstudio.com/", dependencies=TRUE)

new.bio.packages <- list.of.bio.packages[!(list.of.bio.packages %in% installed.packages()[,"Package"])]
if(length(new.bio.packages)){
    if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
    BiocManager::install(new.bio.packages, suppressUpdates = T)
}   

# WGCNA must be installed after AnnotationDbi, impute, GO.db, preprocessCore
# install.packages(c("WGCNA"))

#Test Load Packages    This cause an error when too many packages are loaded.
# suc = unlist ( lapply(list.of.packages, require, character.only = TRUE) )
# if(sum(suc) < length(list.of.packages) )
#   cat ("\n\nWarnning!!!!!! These R packages cannot be loaded:", list.of.packages[!suc] )

# suc = unlist ( lapply(list.of.bio.packages, require, character.only = TRUE) )
# if(sum(suc) < length(list.of.bio.packages) )
#   cat ("\n\nWarnning!!!!!! These Bioconductor packages cannot be loaded:", list.of.bio.packages[!suc] )

6.3 Global variables

################################################################
# Global variables
################################################################

library(RSQLite,verbose=FALSE)  # for database connection
library(gplots,verbose=FALSE)       # for hierarchical clustering
library(ggplot2,verbose=FALSE)  # graphics
library(e1071,verbose=FALSE)        # computing kurtosis
library(DT,verbose=FALSE)       # for renderDataTable
library(plotly,verbose=FALSE)   # for interactive heatmap
library(reshape2,verbose=FALSE)     # for melt correlation matrix in heatmap
input_goButton = 0    # if 1, use demo data file
Min_overlap <- 2
minSetSize = 3; 
mappingCoverage = 0.60 # 60% percent genes has to be mapped for confident mapping
mappingEdge = 0.5  # Top species has 50% more genes mapped
PvalGeneInfo = 0.05; minGenes = 10 # min number of genes for ploting
kurtosis.log = 50  # log transform is enforced when kurtosis is big
kurtosis.warning = 10 # log transformation recommnded 
minGenesEnrichment = 2 # perform GO or promoter analysis only if more than this many genes
PREDA_Permutations =1000
maxGeneClustering = 12000  # max genes for hierarchical clustering and k-Means clustering. Slow if larger
maxGeneWGCNA = 2000 # max genes for co-expression network
maxFactors =6  # max number of factors in DESeq2 models
redudantGeneSetsRatio = 0.9  # remove redundant genesets in enrichment analysis
set.seed(2) # seed for random number generator
mycolors = sort(rainbow(20))[c(1,20,10,11,2,19,3,12,4,13,5,14,6,15,7,16,8,17,9,18)] # 20 colors for kNN clusters
#Each row of this matrix represents a color scheme;

hmcols <- colorRampPalette(rev(c("#D73027", "#FC8D59", "#FEE090", "#FFFFBF",
"#E0F3F8", "#91BFDB", "#4575B4")))(75)
heatColors = rbind(      greenred(75),     bluered(75),     colorpanel(75,"green","black","magenta"),colorpanel(75,"blue","yellow","red"),hmcols )
rownames(heatColors) = c("Green-Black-Red","Blue-White-Red","Green-Black-Magenta","Blue-Yellow-Red","Blue-white-brown")
colorChoices = setNames(1:dim(heatColors)[1],rownames(heatColors)) # for pull down menu

# Create a list for Select Input options
speciesChoice = list()
speciesChoice <- append( setNames( "NEW","**NEW SPECIES**"), speciesChoice  )
speciesChoice <- append( setNames( "BestMatch","Best matching species"), speciesChoice  )

6.4 Read data and pre-process

################################################################
# Read data and pre-process
################################################################

readData <- function(inFile ) {

                # these packages moved here to reduce loading time
                library(edgeR,verbose=FALSE) # count data D.E.
                library(DESeq2,verbose=FALSE) # count data analysis

                dataTypeWarning =0
                dataType =c(TRUE)

                #---------------Read file
                x <- read.csv(inFile)   # try CSV
                if(dim(x)[2] <= 2 )   # if less than 3 columns, try tab-deliminated
                    x <- read.table(inFile, sep="\t",header=TRUE)   
                #-------Remove non-numeric columns, except the first column
                
                for(i in 2:dim(x)[2])
                    dataType = c( dataType, is.numeric(x[,i]) )
                if(sum(dataType) <=2) return (NULL)  # only less than 2 columns are numbers
                x <- x[,dataType]  # only keep numeric columns
                
                # rows with all missing values
                ix = which( apply(x[,-1],1, function(y) sum( is.na(y) ) ) != dim(x)[2]-1 )
                x <- x[ix,]
                
                dataSizeOriginal = dim(x); dataSizeOriginal[2] = dataSizeOriginal[2] -1
                
                x[,1] <- toupper(x[,1])
                x[,1] <- gsub(" ","",x[,1]) # remove spaces in gene ids
                x = x[order(- apply(x[,2:dim(x)[2]],1,sd) ),]  # sort by SD
                x <- x[!duplicated(x[,1]) ,]  # remove duplicated genes
                rownames(x) <- x[,1]
                x <- as.matrix(x[,c(-1)])
                
                # remove "-" or "." from sample names
                colnames(x) = gsub("-","",colnames(x))
                colnames(x) = gsub("\\.","",colnames(x))

                
                #cat("\nhere",dim(x))
                # missng value for median value
                if(sum(is.na(x))>0) {# if there is missing values
                    if(input_missingValue =="geneMedian") { 
                        rowMedians <- apply(x,1, function (y)  median(y,na.rm=T))
                        for( i in 1:dim(x)[2] ) {
                            ix = which(is.na(x[,i]) )
                            x[ix,i] <- rowMedians[ix]                       
                        }
                            
                    } else if(input_missingValue =="treatAsZero") {
                        x[is.na(x) ] <- 0                   
                    } else if (input_missingValue =="geneMedianInGroup") {
                        sampleGroups = detectGroups( colnames(x))
                        for (group in unique( sampleGroups) ){      
                            samples = which( sampleGroups == group )
                            rowMedians <- apply(x[,samples, drop=F],1, function (y)  median(y,na.rm=T))
                            for( i in  samples ) { 
                                ix = which(is.na(x[ ,i] ) ) 
                                if(length(ix) >0 )
                                    x[ix, i  ]  <- rowMedians[ix]
                            }                                       
                        }
                        
                        # missing for entire sample group, use median for all samples
                        if(sum(is.na(x) )>0 ) { 
                            rowMedians <- apply(x,1, function (y)  median(y,na.rm=T))
                            for( i in 1:dim(x)[2] ) {
                                ix = which(is.na(x[,i]) )
                                x[ix,i] <- rowMedians[ix]                       
                            }                       
                        }
                    }
                }

                # Compute kurtosis
                mean.kurtosis = mean(apply(x,2, kurtosis),na.rm=T)
                rawCounts = NULL
                pvals= NULL
                if (input_dataFileFormat == 2 ) {  # if FPKM, microarray

                    if ( is.integer(x) ) dataTypeWarning = 1;  # Data appears to be read counts

                    #-------------filtering
                    #tem <- apply(x,1,max)
                    #x <- x[which(tem > input_lowFilter),]  # max by row is at least 
                    x <- x[ which( apply( x, 1,  function(y) sum(y >= input_lowFilter)) >= input_NminSamples2 ) , ] 

                    
                    x <- x[which(apply(x,1, function(y) max(y)- min(y) ) > 0  ),]  # remove rows with all the same levels

                    #--------------Log transform
                    # Takes log if log is selected OR kurtosis is big than 100
                    if ( (input_transform == TRUE) | (mean.kurtosis > kurtosis.log ) ) 
                        x = log(x+abs( input_logStart),2)

                    tem <- apply(x,1,sd) 
                    x <- x[order(-tem),]  # sort by SD

                } else 
                    if( input_dataFileFormat == 1) {  # counts data


                    # data not seems to be read counts
                    if(!is.integer(x) & mean.kurtosis < kurtosis.log ) {
                        dataTypeWarning = -1
                    }
                    # not used as some counts data like those from CRISPR screen
                    #validate(   # if Kurtosis is less than a threshold, it is not read-count
                    #   need(mean.kurtosis > kurtosis.log, "Data does not seem to be read count based on distribution. Please double check.")
                    # )
                    x <- round(x,0) # enforce the read counts to be integers. Sailfish outputs has decimal points.
                    #x <- x[ which( apply(x,1,max) >= input_minCounts ) , ] # remove all zero counts
                    

                    # remove genes if it does not at least have minCounts in at least NminSamples
                    #x <- x[ which( apply(x,1,function(y) sum(y>=input_minCounts)) >= input_NminSamples ) , ]  # filtering on raw counts
                    # using counts per million (CPM) for filtering out genes.
                                             # CPM matrix                  #N samples > minCounts
                    x <- x[ which( apply( cpm(DGEList(counts = x)), 1,  
                            function(y) sum(y>=input_minCounts)) >= input_NminSamples ) , ] 
                    


                    rawCounts = x; # ??? 
                    # construct DESeqExpression Object
                    # colData = cbind(colnames(x), as.character(detectGroups( colnames(x) )) )
                    tem = rep("A",dim(x)[2]); tem[1] <- "B"   # making a fake design matrix to allow process, even when there is no replicates
                    colData = cbind(colnames(x), tem )
                    colnames(colData)  = c("sample", "groups")
                    dds <- DESeqDataSetFromMatrix(countData = x, colData = colData, design = ~ groups)
                    dds <- estimateSizeFactors(dds) # estimate size factor for use in normalization later for started log method


                    # regularized log  or VST transformation
                    if( input_CountsTransform == 3 ) { # rlog is slow, only do it with 10 samples
                        if(dim(x)[2]<=20 ) { 
                         x <- rlog(dds, blind=TRUE); x <- assay(x) } else 
                         x <- log2( counts(dds, normalized=TRUE) + input_countsLogStart ) 
                         }  

                        else {
                        if ( input_CountsTransform == 2 ) {    # vst is fast but aggressive
                            x <- vst(dds, blind=TRUE)
                            x <- assay(x)  
                        } else{  # normalized by library sizes and add a constant.
                            x <- log2( counts(dds, normalized=TRUE) + input_countsLogStart )   # log(x+c) 
                            # This is equivalent to below. But the prior counts is more important
                            #x <- cpm(DGEList(counts = x),log=TRUE, prior.count=input_countsLogStart )  #log CPM from edgeR
                            #x <- x-min(x)  # shift values to avoid negative numbers
                        }
                    }
                } else 
                    if( input_dataFileFormat == 3)  {  # other data type

                        #neg_lfc neg_fdr pos_lfc pos_fdr 
                        #11       1      11       1 
                        
                        n2 = ( dim(x)[2] %/% 2) # 5 --> 2
                        # It looks like it contains P values
                        # ranges of columns add 0.2 and round to whole. For P value columns this should be 1
                        tem = round( apply(x, 2, function( y) max(y)- min(y))  + .2)     
                        if( sum(tem[(1:n2)*2  ] ==  1 ) == n2 | 
                            sum(tem[(1:n2)*2-1  ] ==  1 ) == n2 ) {         
                            x = x[,1:(2*n2) ,drop=FALSE ] # if 5, change it to 4            
                            if(tem[2] == 1) { # FDR follows Fold-change
                                pvals = x [,2*(1:n2 ),drop=FALSE ]  # 2, 4, 6
                                x = x[, 2*(1:n2 )-1,drop=FALSE]   # 1, 3, 5

                            } else {    # FDR follows Fold-change
                                pvals = x [,2*(1:n2 )-1,drop=FALSE ]  # 2, 4, 6     
                                x = x[, 2*(1: n2 ),drop=FALSE]   # 1, 3, 5
                            }
                        }
                    ix =  which(apply(x,1, function(y) max(y)- min(y) ) > 0  )
                    x <- x[ix,]  # remove rows with all the same levels
                    if(!is.null(pvals) )
                        pvals = pvals[ix,]
                        
                    }
                    
                    
                dataSize = dim(x);
                if(!(dim(x)[1]>5 & dim(x)[2]>1)) 
                stop ( "Data file not recognized. Please double check.")


                
                sampleInfoDemo=NULL
                if( input_goButton >0)
                    sampleInfoDemo <- t( read.csv(demoDataFile2,row.names=1,header=T,colClasses="character") )

                    finalResult <- list(data = as.matrix(x), mean.kurtosis = mean.kurtosis, rawCounts = rawCounts, dataTypeWarning=dataTypeWarning, dataSize=c(dataSizeOriginal,dataSize),sampleInfoDemo=sampleInfoDemo, pvals =pvals )
                return(finalResult)

    }

readSampleInfo <- function(inFile){

                dataTypeWarning =0
                dataType =c(TRUE)

                #---------------Read file
                x <- read.csv(inFile,row.names=1,header=T,colClasses="character")   # try CSV
                if(dim(x)[2] <= 2 )   # if less than 3 columns, try tab-deliminated
                    x <- read.table(inFile, row.names=1,sep="\t",header=TRUE,colClasses="character")
                # remove "-" or "." from sample names
                colnames(x) = gsub("-","",colnames(x))
                colnames(x) = gsub("\\.","",colnames(x))    

                #----------------Matching with column names of expression file
                ix = match(toupper(colnames(readData.out$data)), toupper(colnames(x)) ) 
                ix = ix[which(!is.na(ix))] # remove NA

                if(! ( length(unique(ix) ) == dim(readData.out$data)[2] 
                       & dim(x)[1]>=1 & dim(x)[1] <500 ) ) # at least one row, it can not be more than 500 rows
                     stop( "Error!!! Sample information file not recognized. Sample names must be exactly the same. Each row is a factor. Each column represent a sample.  Please see documentation on format.")
                
                
                #-----------Double check factor levels, change if needed
                # remove "-" or "." from factor levels
                for( i in 1:dim(x)[1]) {
                   x[i,] = gsub("-","",x[i,])
                   x[i,] = gsub("\\.","",x[i,])             
                }
                # if levels from different factors match
                if( length(unique(ix) ) == dim(readData.out$data)[2]) { # matches exactly
                    x = x[,ix]
                    # if the levels of different factors are the same, it may cause problems
                    if( sum( apply(x, 1, function(y) length(unique(y)))) > length(unique(unlist(x) ) ) ) {
                        tem2 =apply(x,2, function(y) paste0( names(y),y)) # factor names are added to levels
                        rownames(tem2) = rownames(x)
                        x <- tem2               
                    }
                    return(t( x ) )         
                } else retrun(NULL)
                            
                
}

textTransform <- function () { 
        k.value =  readData.out$mean.kurtosis     
        tem = paste( "Mean Kurtosis =  ", round(k.value,2), ".\n",sep = "")
        if( k.value > kurtosis.log) tem = paste(tem, " Detected extremely large values. \n  When kurtosis >", kurtosis.log,
        ", log transformation is automatically applied.\n", sep="") else if (k.value>kurtosis.warning)
        {tem = paste(tem, " Detected  large numbers with kurtosis >",
        kurtosis.warning,". \n Log transformation is recommended.", sep="") }
        if(readData.out$dataTypeWarning == 1 ) {
            tem = paste (tem, " ------Warning!!! \nData matrix contains all integers. \n It seems to be read counts!!! \n Please select appropriate data type in the previous page and reload file.")}
        if(readData.out$dataTypeWarning == -1 ) {
            tem = paste (tem, "  ------Warning!!! \nData does not look like read counts data. \n Please select appropriate data type in the previous page and reload file.")}
        tem
}


# Clean up gene sets. Remove spaces and other control characters from gene names  
cleanGeneSet <- function (x){
  # remove duplicate; upper case; remove special characters
  x <- unique( toupper( gsub("\n| ","",x) ) )
  x <- x[which( nchar(x)>1) ]  # genes should have at least two characters
  return(x)
}
findSpeciesById <- function (speciesID){ # find species name use id
  return( orgInfo[which(orgInfo$id == speciesID),]  )
}

# just return name
findSpeciesByIdName <- function (speciesID){ # find species name use id
  return( orgInfo[which(orgInfo$id == speciesID),3]  )
}

# convert gene IDs to ensembl gene ids and find species
convertID <- function (query,selectOrg, selectGO) {
    querySet <- cleanGeneSet( unlist( strsplit( toupper(query),'\t| |\n|\\,')))
    # querySet is ensgene data for example, ENSG00000198888, ENSG00000198763, ENSG00000198804
    querySTMT <- paste( "select distinct id,ens,species from mapping where id IN ('", paste(querySet,collapse="', '"),"')",sep="")
    result <- dbGetQuery(convert, querySTMT)
    if( dim(result)[1] == 0  ) return(NULL)
    if(selectOrg == speciesChoice[[1]]) {
        comb = paste( result$species,result$idType)
        sortedCounts = sort(table(comb),decreasing=T)
        recognized =names(sortedCounts[1])
        result <- result[which(comb == recognized),]
        speciesMatched=sortedCounts
        names(speciesMatched )= sapply(as.numeric(gsub(" .*","",names(sortedCounts) ) ), findSpeciesByIdName  ) 
        speciesMatched <- as.data.frame( speciesMatched )
        if(length(sortedCounts) == 1) { # if only  one species matched
        speciesMatched[1,1] <-paste( rownames(speciesMatched), "(",speciesMatched[1,1],")",sep="")
        } else {# if more than one species matched
            speciesMatched[,1] <- as.character(speciesMatched[,1])
            speciesMatched[,1] <- paste( speciesMatched[,1]," (",speciesMatched[,2], ")", sep="") 
            speciesMatched[1,1] <- paste( speciesMatched[1,1],"   ***Used in mapping***  To change, select from above and resubmit query.")     
            speciesMatched <- as.data.frame(speciesMatched[,1])
        }
    } else { # if species is selected
        result <- result[which(result$species == selectOrg ) ,]
        if( dim(result)[1] == 0  ) return(NULL) #stop("ID not recognized!")
        speciesMatched <- as.data.frame(paste("Using selected species ", findSpeciesByIdName(selectOrg) )  )
    }
    result <- result[which(!duplicated(result[,2]) ),] # remove duplicates in ensembl_gene_id
    result <- result[which(!duplicated(result[,1]) ),] # remove duplicates in user ID
    colnames(speciesMatched) = c("Matched Species (genes)" ) 
    conversionTable <- result[,1:2]; colnames(conversionTable) = c("User_input","ensembl_gene_id")
    conversionTable$Species = sapply(result[,3], findSpeciesByIdName )

    return(list(originalIDs = querySet,IDs=unique( result[,2]), 
                species = findSpeciesById(result$species[1]), 
                #idType = findIDtypeById(result$idType[1] ),
                speciesMatched = speciesMatched,
                conversionTable = conversionTable
                ) )
}


# retrieve detailed info on genes
geneInfo <- function (fileName = NULL){

    if(!is.null(fileName))  # if only one file           #WBGene0000001 some ensembl gene ids in lower case
    { x = read.csv(as.character(fileName) ); x[,1]= toupper(x[,1]) } else # read in the chosen file 
    { return(as.data.frame("No file.") )   }
    Set=rep("List",nrow(x))
    return( cbind(x,Set) )
 }

convertedData <- function() {
        if( is.null(converted.out ) ) return( readData.out$data) # if id or species is not recognized use original data.
            
                if(input_noIDConversion) return( readData.out$data )
                
                mapping <- converted.out$conversionTable
                # cat (paste( "\nData:",input_selectOrg) )
                x =readData.out$data

                rownames(x) = toupper(rownames(x))
                # any gene not recognized by the database is disregarded
                # x1 = merge(mapping[,1:2],x,  by.y = 'row.names', by.x = 'User_input')
                # the 3 lines keeps the unrecogized genes using original IDs
                x1 = merge(mapping[,1:2],x,  by.y = 'row.names', by.x = 'User_input', all.y=TRUE)

                # original IDs used if ID is not matched in database
                ix = which(is.na(x1[,2]) )
                x1[ix,2] = x1[ix,1] 
                
                #multiple matched IDs, use the one with highest SD
                tem = apply(x1[,3:(dim(x1)[2])],1,sd)
                x1 = x1[order(x1[,2],-tem),]
                x1 = x1[!duplicated(x1[,2]) ,]
                rownames(x1) = x1[,2]
                x1 = as.matrix(x1[,c(-1,-2)])
                tem = apply(x1,1,sd)
                x1 = x1[order(-tem),]  # sort again by SD
                return(x1)

    }

convertedCounts <- function() {
              if( is.null(converted.out ) ) return( readData.out$rawCounts)
                mapping <- converted.out$conversionTable

                x =readData.out$rawCounts

                    rownames(x) = toupper(rownames(x))
                    # any gene not recognized by the database is disregarded
                    # x1 = merge(mapping[,1:2],x,  by.y = 'row.names', by.x = 'User_input')
                    # the 3 lines keeps the unrecogized genes using original IDs
                    x1 = merge(mapping[,1:2],x,  by.y = 'row.names', by.x = 'User_input', all.y=TRUE)
                    ix = which(is.na(x1[,2]) )
                    x1[ix,2] = x1[ix,1] # original IDs used
                    
                    #multiple matched IDs, use the one with highest SD
                    tem = apply(x1[,3:(dim(x1)[2])],1,sd)
                    x1 = x1[order(x1[,2],-tem),]
                    x1 = x1[!duplicated(x1[,2]) ,]
                    rownames(x1) = x1[,2]
                    x1 = as.matrix(x1[,c(-1,-2)])
                    tem = apply(x1,1,sd)
                    x1 = x1[order(-tem),]  # sort again by SD

                    return(x1)
} # here

nGenesFilter <- function() { 
        tem = readData.out$dataSize
        ix = match( toupper( rownames(convertedData.out)), toupper(converted.out$conversionTable$ensembl_gene_id  ) )
        nMatched = sum( !is.na(ix) )
        if( input_noIDConversion) 
        return( paste(tem[1], "genes in", tem[4], "samples.", tem[3], " genes passed filter.\n Original gene IDs used." ) )  else
        return( paste(tem[1], "genes in", tem[4], "samples. Of the", tem[3], " genes passed filter, \n", 
            nMatched, " were converted to Ensembl gene IDs in our database. \n
              The remaining ", tem[3]-nMatched, "genes were kept in the data using original IDs." ) )     
}       

# Define sample groups based on column names
 detectGroups <- function (x){  # x are col names
    tem <- gsub("[0-9]*$","",x) # Remove all numbers from end
    #tem = gsub("_Rep|_rep|_REP","",tem)
    tem <- gsub("_$","",tem); # remove "_" from end
    tem <- gsub("_Rep$","",tem); # remove "_Rep" from end
    tem <- gsub("_rep$","",tem); # remove "_rep" from end
    tem <- gsub("_REP$","",tem)  # remove "_REP" from end
    return( tem )
 }

  # EDA plots -------------------------------------
 densityPlot <- function() {
set.seed(2) # seed for random number generator
mycolors = sort(rainbow(20))[c(1,20,10,11,2,19,3,12,4,13,5,14,6,15,7,16,8,17,9,18)] # 20 colors for kNN clusters
#Each row of this matrix represents a color scheme;
    
    x <- readData.out$data

    myColors = mycolors
    plot(density(x[,1]),col = myColors[1], lwd=2,
        xlab="Expresson values", ylab="Density", main= "Distribution of transformed data",
        ylim=c(0, max(density(x[,1])$y)+.02 ) )
      
    for( i in 2:dim(x)[2] )
    lines(density(x[,i]),col=myColors[i], lwd=2 )
    if(dim(x)[2]< 31 ) # if too many samples do not show legends
        legend("topright", cex=1,colnames(x), lty=rep(1,dim(x)[2]), col=myColors )  
   
}

 genePlot <- function() {
    x <- convertedData.out
    
    Symbols <- rownames(x)
    if( input_selectOrg != "NEW") {
        ix = match( rownames(x), allGeneInfo.out[,1])
        if( sum( is.na(allGeneInfo.out$symbol )) != dim(allGeneInfo.out )[1] ) {  # symbol really exists? 
            Symbols = as.character( allGeneInfo.out$symbol[ix] )
            Symbols[which( nchar(Symbols) <= 2 ) ] <- rownames(x) [which( nchar(Symbols) <= 2 ) ] 
            }
       }
    x = as.data.frame(x)
    x$Genes = Symbols
    # matching from the beginning of symbol
    ix = which(regexpr(  paste("^" , toupper(input_geneSearch),sep="")   ,toupper(x$Genes)) > 0)
    
    if(grepl(" ", input_geneSearch)  )  # if there is space character, do exact match
        ix = match(gsub(" ","", toupper(input_geneSearch)),x$Genes)
    
    # too few or too many genes found
    if(length(ix) == 0 | length(ix) > 50 ) return(NULL)
      # no genes found
         
    mdf = melt(x[ix,],id.vars="Genes", value.name="value", variable.name="samples")

    p1 <- ggplot(data=mdf, aes(x=samples, y=value, group = Genes, shape=Genes, colour = Genes)) +
        geom_line() +
        geom_point( size=5,  fill="white")+ #shape=21  circle
        #theme(axis.text.x = element_text(size=16,angle = 45, hjust = 1)) +
        labs(y="Transformed expression level") +
        coord_cartesian(ylim = c(0, max(mdf$value)))
    p1 <- p1 + theme(plot.title = element_text(size = 16,hjust = 0.5)) + # theme(aspect.ratio=1) +
     theme(axis.text.x = element_text(angle=45, size = 16, hjust=1),
           axis.text.y = element_text( size = 16),
           axis.title.x = element_blank(),
           axis.title.y = element_text( size = 16) ) +
    theme(legend.text=element_text(size=12))    
    
    p1  
    }   

    
geneBarPlotError <- function() {
   x <- convertedData.out
    
    Symbols <- rownames(x)
    if( input_selectOrg != "NEW") {
        ix = match( rownames(x), allGeneInfo.out[,1])
        if( sum( is.na(allGeneInfo.out$symbol )) != dim(allGeneInfo.out )[1] ) {  # symbol really exists? 
            Symbols = as.character( allGeneInfo.out$symbol[ix] )
            Symbols[which( nchar(Symbols) <= 2 ) ] <- rownames(x) [which( nchar(Symbols) <= 2 ) ] 
            }
       }
    x = as.data.frame(x)
    x$Genes = Symbols
    #write.csv(x,"tem.csv")
    # Search for genes
    #ix = grep("HOXA",toupper(x$Genes) )
    # ix = grep(toupper(input_geneSearch),toupper(x$Genes))  # sox --> Tsox  
    # matching from the beginning of symbol
    ix = which(regexpr(  paste("^" , toupper(input_geneSearch),sep="")   ,toupper(x$Genes)) > 0)
    
    if(grepl(" ", input_geneSearch)  )  # if there is space character, do exact match
        ix = match(gsub(" ","", toupper(input_geneSearch)),x$Genes)
    
    # too few or too many genes found
    if(length(ix) == 0 | length(ix) > 50 ) return(NULL)
      # no genes found
         
    mdf = melt(x[ix,],id.vars="Genes", value.name="value", variable.name="samples")
    mdf$count = 1
    g = detectGroups(mdf$samples)
    Means = aggregate(mdf$value,by=list( g, mdf$Genes ), FUN = mean, na.rm=TRUE  )
    SDs = aggregate(mdf$value,by=list( g, mdf$Genes ), FUN = sd, na.rm=TRUE  )
    Ns = aggregate(mdf$count, by= list(g, mdf$Genes) , FUN = sum  )
    summarized = cbind(Means,SDs[,3],Ns[,3])
    colnames(summarized)= c("Samples","Genes","Mean","SD","N")
    summarized$SE = summarized$SD / sqrt(summarized$N)  
        
    #http://www.sthda.com/english/wiki/ggplot2-barplots-quick-start-guide-r-software-and-data-visualization
    p2 <- ggplot(summarized, aes(x=Genes, y=Mean,fill=Samples) ) + # data & aesthetic mapping
        geom_bar(stat="identity", position=position_dodge()) + # bars represent average
        geom_errorbar(aes(ymin=Mean-SE, ymax=Mean+SE), width=0.2,position=position_dodge(.9)) +
        labs(y="Expression Level") 
    if(input_useSD == 1) { 
    p2 <- ggplot(summarized, aes(x=Genes, y=Mean,fill=Samples) ) + # data & aesthetic mapping
        geom_bar(stat="identity", position=position_dodge()) + # bars represent average
        geom_errorbar(aes(ymin=Mean-SD, ymax=Mean+SD), width=0.2,position=position_dodge(.9)) +
        labs(y="Expression Level") 
    }
    
    p2 <- p2 +  theme(plot.title = element_text(size = 16,hjust = 0.5)) + # theme(aspect.ratio=1) +
     theme(axis.text.x = element_text(angle=45, size = 16, hjust=1),
           axis.text.y = element_text( size = 16),
           axis.title.x = element_blank(),
           axis.title.y = element_text( size = 16) ) +
    theme(legend.text=element_text(size=16))
    
    p2
}

6.5 Hierarchical clustering

################################################################
# Hierarchical clustering
################################################################

# Functions for hierarchical clustering
hclust2 <- function(x, method="average", ...)  # average linkage
  hclust(x, method=method, ...)
hclust.ward.D <- function(x, method="ward.D", ...)  # average linkage
  hclust(x, method=method, ...)
hclust.ward.D2 <- function(x, method="ward.D2", ...)  # average linkage
  hclust(x, method=method, ...)
hclust.single <- function(x, method="single", ...)  # average linkage
  hclust(x, method=method, ...)
hclust.mcquitty <- function(x, method="mcquitty", ...)  # average linkage
  hclust(x, method=method, ...)
hclust.median <- function(x, method="median", ...)  # average linkage
  hclust(x, method=method, ...)
hclust.centroid <- function(x, method="centroid", ...)  # average linkage
  hclust(x, method=method, ...)
  
hclustFuns <- list( averge = hclust2, complete=hclust, single=hclust.single,
                    median=hclust.median, centroid=hclust.centroid, mcquitty=hclust.mcquitty)
hclustChoices = setNames(1:length(hclustFuns),names(hclustFuns)) # for pull down menu

dist2 <- function(x, ...)   # distance function = 1-PCC (Pearson's correlation coefficient)
  as.dist(1-cor(t(x), method="pearson"))
  
dist3 <- function(x, ...)   # distance function = 1-abs(PCC) (Pearson's correlation coefficient)
  as.dist(1-abs(cor(t(x), method="pearson")))  
  
 distFuns <- list(Correlation=dist2, Euclidean=dist,AbsolutePCC=dist3) 
  # hierarchical clustering tree ----------------------------------------

hmcols <- colorRampPalette(rev(c("#D73027", "#FC8D59", "#FEE090", "#FFFFBF",
"#E0F3F8", "#91BFDB", "#4575B4")))(75)
heatColors = rbind(      greenred(75),     bluered(75),     colorpanel(75,"green","black","magenta"),colorpanel(75,"blue","yellow","red"),hmcols )
rownames(heatColors) = c("Green-Black-Red","Blue-White-Red","Green-Black-Magenta","Blue-Yellow-Red","Blue-white-brown")
colorChoices = setNames(1:dim(heatColors)[1],rownames(heatColors)) # for pull down menu
# adding sample legends to heatmap; this is for the main heatmap
# https://stackoverflow.com/questions/3932038/plot-a-legend-outside-of-the-plotting-area-in-base-graphics
add_legend <- function(...) {
  opar <- par(fig=c(0, 1, 0, 1), oma=c(0, 0, 0, 0), 
    mar=c(0, 0, 0, 6), new=TRUE)
  on.exit(par(opar))
  plot(0, 0, type='n', bty='n', xaxt='n', yaxt='n')
  legend(...)
}

correlationMatrix <- function( ) {
    # heatmap of correlation matrix
    x <- readData.out$data
    maxGene <- apply(x,1,max)
    x <- x[which(maxGene > quantile(maxGene)[1] ) ,] # remove bottom 25% lowly expressed genes, which inflate the PPC
    
       melted_cormat <- melt(round(cor(x),2), na.rm = TRUE)
    # melted_cormat <- melted_cormat[which(melted_cormat[,1] != melted_cormat[,2] ) , ]
        # Create a ggheatmap
        ggheatmap <- ggplot(melted_cormat, aes(Var2, Var1, fill = value))+
         geom_tile(color = "white")+
         scale_fill_gradient2(low = "green", high = "red",  mid = "white", 
            space = "Lab",  limit = c(min(melted_cormat[,3]) ,max(melted_cormat[,3])), midpoint = median(melted_cormat[,3]),
            name="Pearson\nCorrelation"
          ) +
         theme_minimal()+ # minimal theme
         theme(axis.text.x = element_text(angle = 45, vjust = 1, 
            size = 15, hjust = 1))+
         theme(axis.text.y = element_text( 
            size = 15))+
         coord_fixed()+
        theme(
          axis.title.x = element_blank(),
          axis.title.y = element_blank(),
          panel.grid.major = element_blank(),
          panel.border = element_blank(),
          panel.background = element_blank(),
          axis.ticks = element_blank(),
         legend.justification = c(1, 0),
          legend.position = c(0.6, 0.7),
         legend.direction = "horizontal")+
         guides(fill = FALSE) # + ggtitle("Pearson's Correlation Coefficient (all genes)")
         
         if(input_labelPCC && ncol(x)<30)
                ggheatmap <- ggheatmap +  geom_text(aes(Var2, Var1, label = value), color = "black", size = 4)

        ggheatmap
         
}        

staticHeatmap <- function () { 
     x <- readData.out$data   # x = read.csv("expression1.csv")

    n=input_nGenes
    #if(n>6000) n = 6000 # max
    if(n>dim(x)[1]) n = dim(x)[1] # max as data
    # this will cutoff very large values, which could skew the color 
    if(input_geneCentering)
        x=as.matrix(x[1:n,])-apply(x[1:n,],1,mean)
    
    # standardize by gene
    if(input_geneNormalize) 
        x <- x / apply(x,1,sd)
        
    # row centering and normalize
    x <- scale(x, center = input_sampleCentering, scale = input_sampleNormalize) 

    
    
    cutoff = median(unlist(x)) + input_heatmapCutoff * sd (unlist(x)) 
    x[x>cutoff] <- cutoff
    cutoff = median(unlist(x)) - input_heatmapCutoff *sd (unlist(x)) 
    x[x< cutoff] <- cutoff
    
    groups = detectGroups(colnames(x) )
    # if sample info file is uploaded us that info:

    if(!is.null(readSampleInfo.out) &&  !is.null(input_selectFactorsHeatmap) ) { 
        if(input_selectFactorsHeatmap == "Sample_Name" )
            groups = detectGroups(colnames(x) ) else 
            {   ix = match(input_selectFactorsHeatmap, colnames(readSampleInfo.out ) ) 
                groups = readSampleInfo.out[,ix]
            }
    }   
    
    groups.colors = rainbow(length(unique(groups) ) )

    #http://stackoverflow.com/questions/15351575/moving-color-key-in-r-heatmap-2-function-of-gplots-package
    lmat = rbind(c(0,4),c(0,1),c(3,2),c(5,0))
    lwid = c(2,6) # width of gene tree; width of heatmap
    lhei = c(1.5,.2,8,1.1)
    #layout matrix
    #        [,1] [,2]
    #   [1,]    0    4
    #   [2,]    0    1
    #   [3,]    3    2
    #   [4,]    5    0
    # 4--> column tree; 1--> column color bar; 2--> heatmap; 3-> row tree; 5--> color key.
    # height of 4 rows is specified by lhei; width of columns is given by lwid


    par(mar = c(12, 4, 1.4, 0.2))


    if( n>110) 
    heatmap.2(x, distfun = distFuns[[as.integer(input_distFunctions)]]
        ,hclustfun=hclustFuns[[as.integer(input_hclustFunctions)]]
        ,Colv=!input_noSampleClustering
        #col=colorpanel(75,"green","black","magenta")  ,
        #col=bluered(75),
        #col=greenred(75), 
        ,col= heatColors[as.integer(input_heatColors1),]
        ,density.info="none", trace="none", scale="none", keysize=.5
        ,key=T, symkey=F
        ,ColSideColors=groups.colors[ as.factor(groups)]
        ,labRow=""
        ,margins=c(10,2)
        ,srtCol=45
        ,cexCol=2  # size of font for sample names
        ,lmat = lmat, lwid = lwid, lhei = lhei
        )

    if( n<=110) 
    heatmap.2(x, distfun =  distFuns[[as.integer(input_distFunctions)]]
        ,hclustfun=hclustFuns[[as.integer(input_hclustFunctions)]]
        ,Colv=!input_noSampleClustering     
        ,col= heatColors[as.integer(input_heatColors1),], density.info="none", trace="none", scale="none", keysize=.5
        ,key=T, symkey=F
        #,labRow=labRow
        ,ColSideColors=groups.colors[ as.factor(groups)]
        ,margins=c(18,12)
        ,cexRow=1
        ,srtCol=45
        ,cexCol=1.8  # size of font for sample names
        ,lmat = lmat, lwid = lwid, lhei = lhei
    )
    
    par(lend = 1)           # square line ends for the color legend
    add_legend("topleft",
        legend = unique(groups), # category labels
        col = groups.colors[ unique(as.factor(groups))],  # color key
        lty= 1,             # line style
        lwd = 10            # line width
    )
    
}


    #Interactive heatmap via Plotly plot ----------------------------------------------
    # interactive heatmap with plotly
heatmapPlotly <- function () {  
    input_nGenesPlotly= 50
    
    

       x <- convertedData.out

    n=input_nGenesPlotly
    #if(n>6000) n = 6000 # max
    if(n>dim(x)[1]) n = dim(x)[1] # max as data

    if(input_geneCentering)
        x=as.matrix(x[1:n,])-apply(x[1:n,],1,mean)
    # standardize by gene
    if(input_geneNormalize) 
        x <- x / apply(x,1,sd)
    # row centering and normalize
    x <- scale(x, center = input_sampleCentering, scale = input_sampleNormalize) 

    cutoff = median(unlist(x)) + 3*sd (unlist(x)) 
    x[x>cutoff] <- cutoff
    cutoff = median(unlist(x)) - 3*sd (unlist(x)) 
    x[x< cutoff] <- cutoff

    # adding gene symbol
    ix <- match( rownames(x), allGeneInfo.out[,1])
    geneSymbols <- as.character( allGeneInfo.out$symbol)[ix]
    # if missing or duplicated, use Ensembl ID
    ix <- which(nchar( geneSymbols) <=1 | duplicated(geneSymbols ) | is.null(geneSymbols)  | is.na(geneSymbols)    );   geneSymbols[ ix ] <- rownames(x)[ix]
    rownames( x) = geneSymbols;


    clust <- x %>% 
      dist2() %>% 
      hclust2()

    # Get order
    ord <- clust$order

    # Re-arrange based on order
    df <- t( x[ord,] )%>%
       melt()
       
    colnames(df)[1:2] <- c("X","Y")
    colorNames = unlist(strsplit(tolower(rownames(heatColors)[ as.integer(input_heatColors1)   ]),"-" ) )
    p <- df %>%
      ggplot(aes(X, Y, fill = value)) + 
           geom_tile()+ scale_fill_gradient2(low = colorNames[1], mid = colorNames[2],high = colorNames[3]) +
           theme(axis.title.y=element_blank(),   # remove y labels
           # axis.text.y=element_blank.out,  # keep gene names for zooming
            axis.ticks.y=element_blank(),
            axis.title.x=element_blank()) +
            theme(axis.text.x = element_text(size=10,angle = 45, hjust = 1))


    ggplotly(p) %>% 
        layout(margin = list(b = 150,l=200))  # prevent cutoff of sample names
}

6.6 PCA

################################################################
# PCA
################################################################

PCAplot <- function() {   #PCA
     x <- convertedData.out;
     pca.object <- prcomp(t(x))
        
    pcaData = as.data.frame(pca.object$x[,1:2]); pcaData = cbind(pcaData,detectGroups(colnames(x)) )
    colnames(pcaData) = c("PC1", "PC2", "Sample_Name")
    percentVar=round(100*summary(pca.object)$importance[2,1:2],0)
    if(is.null(readSampleInfo.out)) { 
        p=ggplot(pcaData, aes(PC1, PC2, color=Sample_Name, shape = Sample_Name)) + geom_point(size=5) 
        } else {
        pcaData = cbind(pcaData,readSampleInfo.out )
        p=ggplot(pcaData, aes_string("PC1", "PC2", color=input_selectFactors,shape=input_selectFactors2)) + geom_point(size=5) 
        
        }
    p=p+xlab(paste0("PC1: ",percentVar[1],"% variance")) 
    p=p+ylab(paste0("PC2: ",percentVar[2],"% variance")) 
    p=p+ggtitle("Principal component analysis (PCA)")+coord_fixed(ratio=1.0)+ 
     theme(plot.title = element_text(size = 16,hjust = 0.5)) + theme(aspect.ratio=1) +
     theme(axis.text.x = element_text( size = 16),
           axis.text.y = element_text( size = 16),
           axis.title.x = element_text( size = 16),
           axis.title.y = element_text( size = 16) ) +
    theme(legend.text=element_text(size=16))
       print(p)
       }
    # variance chart
    # plot(pca.object,type="bar", xlab="Principal Components", main ="Variances explained")
    
     
MDSplot <- function() {  # MDS
        x <- convertedData.out;
     fit = cmdscale( dist2(t(x) ), eig=T, k=2)
     
    # par(pin=c(5,5))
    pcaData = as.data.frame(fit$points[,1:2]); pcaData = cbind(pcaData,detectGroups(colnames(x)) )
    colnames(pcaData) = c("x1", "x2", "Sample_Name")
    

    if(is.null(readSampleInfo.out)) { 
    p=ggplot(pcaData, aes(x1, x2, color=Sample_Name, shape = Sample_Name)) + geom_point(size=5) 
    } else {
        pcaData = cbind(pcaData,readSampleInfo.out )
        p=ggplot(pcaData, aes_string("x1", "x2", color=input_selectFactors,shape=input_selectFactors2)) + geom_point(size=5) 
        }
    p=p+xlab("Dimension 1") 
    p=p+ylab("Dimension 2") 
    p=p+ggtitle("Multidimensional scaling (MDS)")+ coord_fixed(ratio=1.)+ 
     theme(plot.title = element_text(hjust = 0.5)) + theme(aspect.ratio=1) +
         theme(axis.text.x = element_text( size = 16),
           axis.text.y = element_text( size = 16),
           axis.title.x = element_text( size = 16),
           axis.title.y = element_text( size = 16) ) +
    theme(legend.text=element_text(size=16))
       print(p)
    
     }

tSNEplot <- function() {  # t-SNE
     x <- convertedData.out;
     library(Rtsne,verbose=FALSE)
     set.seed(input_tsneSeed2)
     tsne <- Rtsne(t(x), dims = 2, perplexity=1, verbose=FALSE, max_iter = 400)

    pcaData = as.data.frame(tsne$Y); pcaData = cbind(pcaData,detectGroups(colnames(x)) )
    colnames(pcaData) = c("x1", "x2", "Sample_Name")
    

    if(is.null(readSampleInfo.out)) { 
    p=ggplot(pcaData, aes(x1, x2, color=Sample_Name, shape = Sample_Name)) + geom_point(size=5) 
    } else {
        pcaData = cbind(pcaData,readSampleInfo.out )
        p=ggplot(pcaData, aes_string("x1", "x2", color=input_selectFactors,shape=input_selectFactors2)) + geom_point(size=5) 
        }
    p=p+xlab("Dimension 1") 
    p=p+ylab("Dimension 2") 
    p=p+ggtitle("t-SNE plot")+ coord_fixed(ratio=1.)+ 
     theme(plot.title = element_text(hjust = 0.5)) + theme(aspect.ratio=1) +
         theme(axis.text.x = element_text( size = 16),
           axis.text.y = element_text( size = 16),
           axis.title.x = element_text( size = 16),
           axis.title.y = element_text( size = 16) ) +
    theme(legend.text=element_text(size=16))
       print(p)
    
     }

 # read pathway data from local file, not used if using database
 # Read gene sets GMT file
# This functions cleans and converts to upper case

readGMTRobust <- function (file1) {   # size restriction
        # Read in the first file 
        x <- scan(file1, what="", sep="\n")
        # x <- gsub("\t\t.","",x)     # GMT files saved by Excel has a lot of empty cells "\t\t\t\t"   "\t." means one or more tab
        x <- gsub(" ","",x)  # remove white space
        x <- toupper(x)    # convert to upper case

        #----Process the first file
        # Separate elements by one or more whitespace
        y <- strsplit(x, "\t")
        # Extract the first vector element and set it as the list element name
        names(y) <- sapply(y, `[[`, 1)
        #names(y) <- sapply(y, function(x) x[[1]]) # same as above
        # Remove the first vector element from each list element
        y <- lapply(y, `[`, -c(1,2))
        #y <- lapply(y, function(x) x[-1]) # same as above
        # remove duplicated elements
        for ( i in 1:length(y) )  y[[i]] <- cleanGeneSet(y[[i]])
        # check the distribution of the size of gene lists sapply(y, length) hold a vector of sizes
        if( max( sapply(y,length) ) <5) cat("Warning! Gene sets have very small number of genes!\n Please double check format.")
        y <- y[which(sapply(y,length) > 1)]  # gene sets smaller than 1 is ignored!!!

        return(y)
    }


readGeneSets <- function (fileName, convertedData, GO,selectOrg, myrange) {

    pathway <- dbConnect(sqlite,fileName)
    
    if(is.null(GO)  ) GO <- "GOBP"   # initial value not properly set; enforcing  

    # get Gene sets
    querySet = rownames(convertedData)
    sqlQuery = paste( " select distinct gene,pathwayID from pathway where gene IN ('", paste(querySet,collapse="', '"),"')" ,sep="")
    # cat(paste0("\n\nhere:",GO,"There"))

    if( GO != "All") sqlQuery = paste0(sqlQuery, " AND category ='",GO,"'")
    result <- dbGetQuery( pathway, sqlQuery  )
    if( dim(result)[1] ==0) {return(list( x=as.data.frame("No matching species or gene ID file!" )) )}
    # list pathways and frequency of genes
    pathwayIDs = aggregate( result$pathwayID, by   = list(unique.values = result$pathwayID), FUN = length)
    pathwayIDs = pathwayIDs[which(pathwayIDs[,2]>= myrange[1] ),]
    pathwayIDs = pathwayIDs[which( pathwayIDs[,2] <= myrange[2] ),]
    if(dim(pathwayIDs)[1] ==0 ) geneSets = NULL;
    
    # convert pathways into lists like those generated by readGMT
    geneSets = lapply(pathwayIDs[,1], function(x)  result[which(result$pathwayID == x ),1]     )
    pathwayInfo <- dbGetQuery( pathway, paste( " select distinct id,Description from pathwayInfo where id IN ('", 
                            paste(pathwayIDs[,1],collapse="', '"),   "') ",sep="") )
    ix = match( pathwayIDs[,1], pathwayInfo[,1])
    names(geneSets) <- pathwayInfo[ix,2]  
    #geneSets <- geneSets[ -which(duplicated(names(geneSets) ))] # remove geneSets with the same name
    dbDisconnect(pathway)
    return( geneSets )
} 


# Runs pathway analysis using PGSEA; this is copied and revised from PGSEA package
myPGSEA  <- function (exprs, cl, range = c(25, 500), ref = NULL, center = TRUE, 
    p.value = 0.005, weighted = TRUE, nPermutation=100, enforceRange = TRUE, ...) {
    if (is(exprs, "ExpressionSet")) 
        exprs <- exprs(exprs)
    if (!is.list(cl)) 
        stop("cl need to be a list")
    if (!is.null(ref)) {
        if (!is.numeric(ref)) 
            stop("column index's required")
    }
    if (!is.null(ref)) {
        if (options.out$verbose) 
            cat("Creating ratios...", "\n")
        ref_mean <- apply(exprs[, ref], 1, mean, na.rm = TRUE)
        exprs <- sweep(exprs, 1, ref_mean, "-")
    }
    if (center) 
        exprs <- scale(exprs, scale = FALSE)         # column centering is done
    results <- matrix(NA, length(cl), ncol(exprs))
    rownames(results) <- names(cl)
    colnames(results) <- colnames(exprs)
    mode(results) <- "numeric"
    Setsize = c(rep(0,length(cl)))     # gene set size vector
    mean2 = c(rep(0,length(cl)))     # mean of the range of means 
    meanSD = c(rep(0,length(cl)))     # SD of the range of means    
    if (is.logical(p.value)) 
        { p.results <- results; mean.results <- results;}
    for (i in 1:length(cl)) {              # for each gene list
        #cat("\nProcessing gene set",i);
        if (class(cl[[i]]) == "smc") {
            clids <- cl[[i]]@ids
        }
        else if (class(cl[[i]]) %in% c("GeneColorSet", "GeneSet")) {
            clids <- cl[[i]]@geneIds
        }
        else {
            clids <- cl[[i]]
        }
        if (options()$verbose) 
            cat("Testing region ", i, "\n")
        ix <- match(clids, rownames(exprs))
        ix <- unique(ix[!is.na(ix)])
        present <- sum(!is.na(ix))
        Setsize[i] <- present 
        if (present < range[1]) {
            if (options()$verbose) 
                cat("Skipping region ", i, " because too small-", 
                  present, ",\n")
            next
        }
        if (present > range[2]) {
            if (options()$verbose) 
                cat("Skipping region ", i, " because too large-", 
                  present, "\n")
            next
        }
        texprs <- exprs[ix, ]           # expression matrix for genes in gene set
        if (any(is.na(texprs))) 
            cat("Warning - 'NA' values within expression data, enrichment scores are estimates only.\n")
        if (!is.matrix(texprs)) 
            texprs <- as.matrix(texprs)
                            
        stat <- try(apply(texprs, 2, t.test, ...))
        means <- try(apply(texprs, 2, mean,trim=0.1))   # trim mean
        ps <- unlist(lapply(stat, function(x) x$p.value))
        stat <- unlist(lapply(stat, function(x) x$statistic))
        p.results[i, ] <- ps
        mean.results[i,] <- means
        results[i, ] <- as.numeric(stat)
        
        # permutation of gene sets of the same size
        if(nPermutation > 2 )  { # no permutation if <=2
            meansRanges = c(0, rep(nPermutation))
            for( k in 1:nPermutation ) {
                ix <- sample.int( dim(exprs)[1], length(ix) )
                texprs <- exprs[ix, ] 
                means <- try(apply(texprs, 2, mean,trim=0.1))   # trim mean
                meansRanges[k] = dynamicRange(means)
            }
            mean2[i] = mean(meansRanges)
            meanSD[i]= sd(meansRanges,na.rm=TRUE)   # NA are removed before calculating standard deviation
        }
    }
    return(list(results = results, p.results = p.results, means = mean.results, size=Setsize, mean2=mean2, meanSD=meanSD))
}

PCApathway<- function () {  # pathway
    x <- convertedData.out;
    library(PGSEA,verbose=FALSE)
    pca.object <- prcomp(t(x))
    pca = 100*pca.object$rotation 
    Npca = 5
    if (Npca > dim(pca)[2]) { Npca = dim(pca)[2] } else pca <-  pca[,1:Npca]
    #pca = pca[,1:5]
    if(is.null(GeneSets.out ) ) return(NULL)  # no species recognized
    if(length(GeneSets.out ) <= 1 ) return(NULL)
    #cat("\n\nGene Sets:",length( GeneSets.out))
    pg = myPGSEA (pca,cl=GeneSets.out,range=c(15,2000),p.value=TRUE, weighted=FALSE,nPermutation=1)

    # correcting for multiple testing
    p.matrix = pg$p.result
    tem = p.adjust(as.numeric(p.matrix),"fdr")
    p.matrix = matrix(tem, nrow=dim(p.matrix)[1], ncol = dim(p.matrix)[2] )
    rownames(p.matrix) = rownames(pg$p.result); colnames(p.matrix) = colnames(pg$p.result)


    selected =c()
    for( i in 1:dim(p.matrix)[2]) {
      tem = which( rank(p.matrix[,i],ties.method='first') <= 5)  # rank by P value
     #tem = which( rank(pg$result[,i],ties.method='first') >= dim(p.matrix)[1]-3.1) # rank by mean
     names(tem) = paste("PC",i," ", rownames(p.matrix)[tem], sep="" )
     selected = c(selected, tem)
    }
    rowids = gsub(" .*","",names(selected))
    rowids = as.numeric( gsub("PC","",rowids) )
    pvals = p.matrix[ cbind(selected,rowids) ]
    a=sprintf("%-1.0e",pvals)
    tem = pg$result[selected,]
    rownames(tem) = paste(a,names(selected)); #colnames(tem)= paste("PC",colnames(tem),sep="")
    
    tem = tem[!duplicated(selected),] 

    #tem = t(tem); tem = t( (tem - apply(tem,1,mean)) ) #/apply(tem,1,sd) )

    smcPlot(tem,scale =  c(-max(tem), max(tem)), show.grid = T, margins = c(3,1, 6, 23), col = .rwb,cex.lab=0.5, main="Pathways analysis on PCA")
    
     }

# correlation PCA with factors  
PCA2factor <- function( ){

    if(is.null(readSampleInfo.out)) return(NULL)
    npc = 5 # number of Principal components
    x <- readData.out$data
    y <- readSampleInfo.out

    pca.object <- prcomp(t(x))
    pcaData = as.data.frame(pca.object$x[,1:npc]); 
    pvals = matrix(1,nrow=npc,ncol=ncol(y))
    for (i in 1:npc ){
        for (j in 1:ncol(y) )
            pvals[i,j] =summary( aov(pcaData[,i] ~ as.factor(y[,j])))[[1]][["Pr(>F)"]][1]
    }
    
    pvals = pvals * npc* ncol(y)   # correcting for multiple testing
    pvals[pvals>1] = 1

    colnames(pvals) = colnames(y)
    rownames(pvals) = paste0("PC",1:npc)
    a="\n Correlation between Principal Components (PCs) with factors\n"
    nchar0 = nchar(a)
    for ( i in 1:npc){
        j = which.min(pvals[i,])
        if(pvals[i,j]< 0.05) a=paste0(a,rownames(pvals)[i], 
                    " is correlated with ", colnames(pvals)[j],
                    " (p=",   sprintf("%-3.2e",pvals[i,j]),").\n")
    }
    if(nchar(a) == nchar0 ) return(NULL) else 
      return( a )
          
} 

# detecting sequencing depth bias
readCountsBias <- function( ){

    totalCounts = colSums(readData.out$rawCounts) 
    groups = as.factor( detectGroups(colnames(readData.out$rawCounts ) ) )
    tem = NULL
    # ANOVA of total read counts vs sample groups parsed by sample name
    pval = summary( aov(totalCounts ~ groups ))[[1]][["Pr(>F)"]][1]
    if(pval <0.05)
      tem = paste("Warning! Sequencing depth bias detected. Total read counts are significantly different among sample groups (p=",
                sprintf("%-3.2e",pval),") based on ANOVA.")

    # ANOVA of total read counts vs factors in experiment design
    if(!is.null(readSampleInfo.out   )  ) {
      y <- readSampleInfo.out
        for (j in 1:ncol(y) ) { 
        pval = summary( aov(totalCounts ~ as.factor(y[,j])))[[1]][["Pr(>F)"]][1]

        if(pval <0.05)
        tem = paste(tem, " Total read counts seem to be correlated with factor",colnames(y)[j], 
                    "(p=",  sprintf("%-3.2e",pval),").  ")
      }
     }
    if(is.null(tem)) return( "No bias detected") else
    return( tem )
          
}

6.7 K-means clustering

################################################################
# K-means clustering
################################################################
     
  #Distribution of SDs 
distributionSD <- function() {

        SDs=apply(convertedData.out,1,sd)
        maxSD = mean(SDs)+ 4*sd(SDs)
        SDs[ SDs > maxSD] = maxSD
        
        top = input_nGenesKNN
        if(top > length(SDs)) top = length(SDs)
        Cutoff=sort(SDs,decreasing=TRUE)[top] 

        SDs = as.data.frame(SDs)

        p <- ggplot(SDs, aes(x=SDs)) + 
          geom_density(color="darkblue", fill="lightblue") +
          labs(x = "Standard deviations of all genes", y="Density")+
          geom_vline(aes(xintercept=Cutoff),
                    color="red", linetype="dashed", size=1) +
                    annotate("text", x = Cutoff + 0.4*sd(SDs[,1]), y = 1,colour = "red", label = paste0("Top ", top))

        p
}


# Decide number of clusters
KmeansNclusters <- function() { # Kmeans clustering

    x <- convertedData.out
    #x <- readData.out
    #par(mfrow=c(1,2))
    n=input_nGenesKNN
    #if(n>6000) n = 6000 # max
    if(n>dim(x)[1]) n = dim(x)[1] # max as data
    #x1 <- x;
    #x=as.matrix(x[1:n,])-apply(x[1:n,],1,mean)
    x = 100* x[1:n,] / apply(x[1:n,],1,sum)  # this is causing problem??????
    #x = x - apply(x,1,mean)  # this is causing problem??????
    #colnames(x) = gsub("_.*","",colnames(x))
    set.seed(2)
    
    k = 30
    wss <- (nrow(x)-1)*sum(apply(x,2,var))
      for (i in 2:k) wss[i] <- sum(kmeans(x,centers=i,iter.max = 30)$withinss)
        par(mar=c(4,5,4,4))
    plot(1:k, wss, type="b", xlab="Number of Clusters (k)",
         ylab="Within groups sum of squares",
         cex=2,cex.lab=2, cex.axis=2, cex.main=2, cex.sub=2 ,xaxt="n"    )
    axis(1, at = seq(1, 30, by = 2),cex.axis=1.5,cex=1.5)
    
}
 
Kmeans <- function() { # Kmeans clustering

    x <- convertedData.out
    #x <- readData.out
    #par(mfrow=c(1,2))
    n=input_nGenesKNN
    if(n>maxGeneClustering) n = maxGeneClustering # max
    if(n>dim(x)[1]) n = dim(x)[1] # max as data
    #x1 <- x;
    #x=as.matrix(x[1:n,])-apply(x[1:n,],1,mean)
    #x = 100* x[1:n,] / apply(x[1:n,],1,sum) 
    x = x[1:n,]
    if( input_kmeansNormalization == 'L1Norm')
        x = 100* x / apply(x,1,function(y) sum(abs(y))) else # L1 norm
    if( input_kmeansNormalization == 'geneMean')
        x = x - apply(x,1,mean)  else # this is causing problem??????
    if( input_kmeansNormalization == 'geneStandardization') 
        x = (x - apply(x,1,mean) ) / apply(x,1,sd)
    #colnames(x) = gsub("_.*","",colnames(x))
    set.seed(input_KmeansReRun)
    k=input_nClusters

    cl = kmeans(x,k,iter.max = 50)
    #myheatmap(cl$centers)  
 

    hc <- hclust2(dist2(cl$centers-apply(cl$centers,1,mean) )  )# perform cluster for the reordering of samples
    tem = match(cl$cluster,hc$order) #  new order 
    x = x[order(tem),] ;    bar = sort(tem)

    #myheatmap2(x-apply(x,1,mean), bar,1000)
    return(list( x = x, bar = bar)) 

  } 


myheatmap2 <- function (x,bar=NULL,n=-1,mycolor=1,clusterNames=NULL,sideColors=NULL ) {
    # number of genes to show
    ngenes = as.character( table(bar))
    if(length(bar) >n && n != -1) {ix = sort( sample(1:length(bar),n) ); bar = bar[ix]; x = x[ix,]  }
    if(! is.null(bar) )
        if(is.null(sideColors) ) 
            sideColors = mycolors

    # this will cutoff very large values, which could skew the color 
    x=as.matrix(x)-apply(x,1,mean)
    cutoff = median(unlist(x)) + 3*sd (unlist(x)) 
    x[x>cutoff] <- cutoff
    cutoff = median(unlist(x)) - 3*sd (unlist(x)) 
    x[x< cutoff] <- cutoff
    #colnames(x)= detectGroups(colnames(x))
    if(is.null(bar)) # no side colors
        heatmap.2(x,  Rowv =F,Colv=F, dendrogram ="none",
            col=heatColors[as.integer(mycolor),], density.info="none", trace="none", scale="none", keysize=.3
            ,key=F, labRow = F
            #,RowSideColors = mycolors[bar]
            ,margins = c(8, 24)
            ,srtCol=45
        ) else
        heatmap.2(x,  Rowv =F,Colv=F, dendrogram ="none",
            col=heatColors[as.integer(mycolor),], density.info="none", trace="none", scale="none", keysize=.3
            ,key=F, labRow = F
            ,RowSideColors = sideColors[bar]
            ,margins = c(8, 24)
            ,srtCol=45
        )
        
    if(!is.null(bar)) { 

        legend.text = paste("Cluster ", toupper(letters)[unique(bar)], " (N=", ngenes,")", sep="") 
        if( !is.null( clusterNames ) && length(clusterNames)>= length( unique(bar) ) )  
            legend.text = paste(clusterNames[ 1:length( unique(bar) )  ], " (N=", ngenes,")", sep="") 
        
        par(lend = 1)           # square line ends for the color legend
        legend("topright",      # location of the legend on the heatmap plot
        legend = legend.text, # category labels
        col = sideColors,  # color key
        lty= 1,             # line style
        lwd = 10 )           # line width
        }
}

KmeansHeatmap <- function() { # Kmeans clustering

    myheatmap2(Kmeans.out$x-apply(Kmeans.out$x,1,mean), Kmeans.out$bar,1000,mycolor=input_heatColors1)
}
 

tSNEgenePlot <- function() {
            Cluster <- Kmeans.out$bar
            train <- as.data.frame( cbind(Cluster,Kmeans.out$x) )

            library(Rtsne,verbose=FALSE)

            train = unique(train)
            Cluster = train$Cluster 
            set.seed(input_seedTSNE)
            
            ## Executing the algorithm on curated data
            tsne <- Rtsne(train[,-1], dims = 2, perplexity=30, verbose=FALSE, max_iter = 400)


            nClusters = length(unique(Cluster) )
            if(input_colorGenes) {          
                plot(tsne$Y[,1], tsne$Y[,2], pch = (0:(nClusters-1))[Cluster], cex = 1.,col = mycolors[Cluster], xlab="X",ylab="Y")
                legend("topright",toupper(letters)[1:nClusters], pch = 0:(nClusters-1), col=mycolors, title="Cluster"  )
            } else
                plot(tsne$Y[,1], tsne$Y[,2],  cex = 1., xlab="X",ylab="Y")

}


# Main function. Find a query set of genes enriched with functional category
FindOverlap <- function (converted,gInfo, GO,selectOrg,minFDR, reduced = FALSE) {
    maxTerms =15 # max number of enriched terms
    idNotRecognized = as.data.frame("ID not recognized!")
    
    if(is.null(converted) ) return(idNotRecognized) # no ID 
    
    # only coding
    gInfo <- gInfo[which( gInfo$gene_biotype == "protein_coding"),]  
    querySet <- intersect( converted$IDs, gInfo[,1]);
    
    if(length(querySet) == 0) return(idNotRecognized )
    
    ix = grep(converted$species[1,1],gmtFiles)
    totalGenes <- converted$species[1,7]
    
    if (length(ix) == 0 ) {return(idNotRecognized )}
    
    # If selected species is not the default "bestMatch", use that species directly
    if(selectOrg != speciesChoice[[1]]) {  
        ix = grep(findSpeciesById(selectOrg)[1,1], gmtFiles )
        if (length(ix) == 0 ) {return(idNotRecognized )}
        totalGenes <- orgInfo[which(orgInfo$id == as.numeric(selectOrg)),7]
    }
    pathway <- dbConnect(sqlite,gmtFiles[ix],flags=SQLITE_RO)
    
        
    sqlQuery = paste( " select distinct gene,pathwayID from pathway where gene IN ('", paste(querySet,collapse="', '"),"')" ,sep="")
    
    #cat(paste0("HH",GO,"HH") )
    
    if( GO != "All") sqlQuery = paste0(sqlQuery, " AND category ='",GO,"'")
    result <- dbGetQuery( pathway, sqlQuery  )
    if( dim(result)[1] ==0) {return(as.data.frame("No matching species or gene ID file!" )) }

    # given a pathway id, it finds the overlapped genes, symbol preferred
    sharedGenesPrefered <- function(pathwayID) {
        tem <- result[which(result[,2]== pathwayID ),1]
        ix = match(tem, converted$conversionTable$ensembl_gene_id) # convert back to original
        tem2 <- unique( converted$conversionTable$User_input[ix] )
        if(length(unique(gInfo$symbol) )/dim(gInfo)[1] >.7  ) # if 70% genes has symbol in geneInfo
        { ix = match(tem, gInfo$ensembl_gene_id); 
        tem2 <- unique( gInfo$symbol[ix] )      }
    return( paste( tem2 ,collapse=" ",sep="") )}
    
    x0 = table(result$pathwayID)                    
    x0 = as.data.frame( x0[which(x0>=Min_overlap)] )# remove low overlaps
    if(dim(x0)[1] <= 5 ) return(idNotRecognized) # no data
    colnames(x0)=c("pathwayID","overlap")
    pathwayInfo <- dbGetQuery( pathway, paste( " select distinct id,n,Description from pathwayInfo where id IN ('", 
                            paste(x0$pathwayID,collapse="', '"),   "') ",sep="") )
    
    x = merge(x0,pathwayInfo, by.x='pathwayID', by.y='id')
    
    x$Pval=phyper(x$overlap-1,length(querySet),totalGenes - length(querySet),as.numeric(x$n), lower.tail=FALSE );
    x$FDR = p.adjust(x$Pval,method="fdr")
    x <- x[ order( x$FDR)  ,]  # sort according to FDR
    
    if(dim(x)[1] > maxTerms ) x = x[1:maxTerms,]    
    
    if(min(x$FDR) > minFDR) x=as.data.frame("No significant enrichment found!") else {
        x <- x[which(x$FDR < minFDR),] 

        x= cbind(x,sapply( x$pathwayID, sharedGenesPrefered ) )
        colnames(x)[7]= "Genes"
        x <- subset(x,select = c(FDR,overlap,n,description,Genes) )
        colnames(x) = c("Corrected P value (FDR)", "Genes in list", "Total genes in category","Functional Category","Genes"  )
        
        # remove redudant gene sets
        if(reduced != FALSE ){  # reduced=FALSE no filtering,  reduced = 0.9 filter sets overlap with 90%
            n=  nrow(x)
            tem=rep(TRUE,n )
            geneLists = lapply(x$Genes, function(y) unlist( strsplit(as.character(y)," " )   ) )
            for( i in 2:n)
                for( j in 1:(i-1) ) { 
                  if(tem[j]) { # skip if this one is already removed
                      commonGenes = length(intersect(geneLists[i] ,geneLists[j] ) )
                      if( commonGenes/ length(geneLists[j] ) > reduced )
                        tem[i] = FALSE  
                  }         
                }                               
            x <- x[which(tem),]     
        }
        

    }
            
    dbDisconnect(pathway)
    return(x )
} 
 
# Given a gene set, finds significant overlaps with a gene set database  object 
findOverlapGMT <- function ( query, geneSet, minFDR=.2 ,minSize=2,maxSize=10000 ){ 
    #geneSets <- readGMT("exampleData/MousePath_TF_gmt.gmt")
    #query <-  geneSets[['TF_MM_FRIARD_C-REL']] 
    #query <- query[1:60]
    total_elements = 30000
    Min_overlap <- 1
    maxTerms =10 # max number of enriched terms
    noSig <- as.data.frame("No significant enrichment found!")
    query <- cleanGeneSet(query)   # convert to upper case, unique()

    if(length(query) <=2) return(noSig)
    if(length(geneSet) <1) return(noSig)
      geneSet <- geneSet[which(sapply(geneSet,length) > minSize)]  # gene sets smaller than 1 is ignored!!!
      geneSet <- geneSet[which(sapply(geneSet,length) < maxSize)]  # gene sets smaller than 1 is ignored!!!
    result <- unlist( lapply(geneSet, function(x) length( intersect (query, x) ) ) )
    result <- cbind(unlist( lapply(geneSet, length) ), result )
    result <- result[ which(result[,2]>Min_overlap), ,drop=F]
    if(dim(result)[1] == 0) return( noSig)
    xx <- result[,2]
    mm <- length(query)
    nn <- total_elements - mm
    kk <- result[,1]
    Pval_enrich=phyper(xx-1,mm,nn,kk, lower.tail=FALSE );
    FDR <- p.adjust(Pval_enrich,method="fdr",n=length(geneSet) )
    result <- as.data.frame(cbind(FDR,result))
    result <- result[,c(1,3,2)]
    result$pathway = rownames(result)
    result$Genes = ""  # place holder just 
    colnames(result)= c("Corrected P value (FDR)", "Genes in list", "Total genes in category","Functional Category","Genes"  )
    result <- result[ which( result[,1] < minFDR),,drop=F]
    if( dim( result)[1] == 0) return(noSig) 
    if(min(FDR) > minFDR) return(noSig) 
    result <- result[order(result[,1] ),]
    if(dim(result)[1] > maxTerms ) result <- result[1:maxTerms,]

    return( result)
}

KmeansGO <- function() {
        pp=0
        minFDR = 0.01

        for( i in 1:input_nClusters) {

            query = rownames(Kmeans.out$x)[which(Kmeans.out$bar == i)]

                result <- findOverlapGMT( query, GeneSets.out,1) 

            if( dim(result)[2] ==1) next;   # result could be NULL
            result$Genes = toupper(letters)[i] 
            if (pp==0 ) { results <- result; pp <- 1;
            } else {
                results <- rbind(results,result)
            }
        }

        if(pp == 0) return( as.data.frame("No enrichment found."))
        results= results[,c(5,1,2,4)]
        colnames(results)= c("Cluster","FDR","Genes","Pathways")
        if(min(results$FDR) > minFDR ) results = as.data.frame("No signficant enrichment found.") else
        results = results[which(results$FDR < minFDR),]
    
    if( is.null(results) )  return ( as.matrix("No significant enrichment.") )  
    if( class(results) != "data.frame")  return ( as.matrix("No significant enrichment.") )
    if( dim(results)[2] ==1)  return ( as.matrix("No significant enrichment.") )
    colnames(results)[2] = "adj.Pval"
    results$Genes <- as.character(results$Genes)
    results$Cluster[which( duplicated(results$Cluster) ) ] <- ""
    results
  }

6.8 Differential expression

################################################################
# Differential expression
################################################################
 
# Differential expression using LIMMA 
DEG.limma <- function (x, maxP_limma=.1, minFC_limma=2, rawCounts,countsDEGMethods,priorCounts, dataFormat, selectedComparisons=NULL, sampleInfo = NULL,modelFactors=NULL, blockFactor = NULL){
    library(limma,verbose=FALSE) # Differential expression
    library(statmod,verbose=FALSE)
    
    # many different situations: 1. just use sample names 2. just one factor  3. two factors no interaction
    # 4. two factors with interaction   5. block factor 
    
    
    topGenes = list();  limmaTrend = FALSE
    if( dataFormat == 2) {   # if normalized data
        eset = new("ExpressionSet", exprs=as.matrix(x)) } else { # counts data
            if (countsDEGMethods == 1 ) { # limma-trend method selected for counts data
                #dge <- DGEList(counts=rawCounts);
                #dge <- calcNormFactors(dge, method = "TMM")
                #eset <- cpm(dge, log=TRUE, prior.count=priorCounts)
                eset = new("ExpressionSet", exprs=as.matrix(x))  # use transformed data for limma 
                limmaTrend = TRUE
            }
    }

    groups = colnames(x)
    groups = detectGroups( groups)
    g =  unique(groups)  
    
    # check for replicates, removes samples without replicates
    reps = as.matrix(table(groups)) # number of replicates per biological sample
    if ( sum( reps[,1] >= 2) <2    ) # if less than 2 samples with replicates
    return( list(results= NULL, comparisons = NULL, Exp.type="Failed to parse sample names to define groups. 
        Cannot perform DEGs and pathway analysis. Please double check column names! Use WT_Rep1, WT_Rep2 etc. ", topGenes=NULL)) 
    # remove samples without replicates
    g <- rownames(reps)[which(reps[,1] >1)]
    ix <- which( groups %in% g)  
    groups <- groups[ix]   
    x<- x[,ix]; rawCounts <- rawCounts[,ix] 
    
    if(length(g) ==2 ) { 
        g= unique(groups)
        comparisons <-  paste(g[2],"-",g[1],sep="")  # "Mutant-WT"
        
        # no sample file, but user selected comparisons using column names
        if( is.null(modelFactors) & length( selectedComparisons) >0  )  
            comparisons = selectedComparisons
        
        design <- model.matrix(~0+groups)
        colnames(design) <- g
        
        if( !is.null(rawCounts) && countsDEGMethods == 2) {  # voom
            dge <- DGEList(counts=rawCounts);
            dge <- calcNormFactors(dge, method = "TMM")  # normalization
            v <- voom(dge, design); fit <- lmFit(v, design) } else 
            fit <- lmFit(eset, design)      # regular limma
            
        cont.matrix <- makeContrasts(contrasts=comparisons, levels=design)
        fit2 <- contrasts.fit(fit, cont.matrix)
        fit2 <- eBayes(fit2, trend=limmaTrend)

        # calls differential gene expression 1 for up, -1 for down
        results <- decideTests(fit2, p.value=maxP_limma, lfc=log2(minFC_limma) )
        #vennDiagram(results,circle.col=rainbow(5))
        topGenes1 =topTable(fit2, number = 1e12,sort.by="M" )
        if (dim(topGenes1)[1] != 0) {
        topGenes1 = topGenes1[,c('logFC','adj.P.Val')] 
        # topGenes1[,1] <-  -1* topGenes1[,1] # reverse direction
        topGenes[[1]] <- topGenes1 }
        # log fold change is actually substract of means. So if the data is natral log transformed, it shoudl be natral log.
        Exp.type = "2 sample groups."
    }
    
    if(length(g) > 2 ) { # more than two sample groups
    
        design <- model.matrix(~ 0+factor(groups))
        colnames(design) <- gsub(".*)","",colnames(design))
        
        if( !is.null(rawCounts) && countsDEGMethods == 2) {  # voom
            v <- voom(rawCounts, design); 
            fit <- lmFit(v, design) 
        } else 
            fit <- lmFit(eset, design)
        
        fit <- eBayes(fit, trend=limmaTrend)
        
        comparisons = ""
        for( i in 1:(length(g)-1) )
            for (j in (i+1):length(g)) 
            comparisons = c(comparisons,paste(g[j],"-",g[i],sep="" ) )
        comparisons <- comparisons[-1]

        # no sample file, but user selected comparisons using column names
        if( is.null(modelFactors) & length( selectedComparisons) >0  )  
            comparisons = selectedComparisons
        
        contrast1 <- makeContrasts(contrasts=comparisons[1], levels=design)
        if(length(comparisons)>1 )
            for( kk in 2:length(comparisons) )
                contrast1<-  cbind(contrast1,makeContrasts(contrasts=comparisons[kk], levels=design)   )
        Exp.type = paste(length(g)," sample groups detected.")   
        
        # if factorial design 2x2, 2x3, 3x5 etc.
            # all samples must be something like WT_control_rep1
        if ( sum(sapply(strsplit(g,"_"),length) == 2 ) == length(g) ) {
        
            #comparisons
            comparisons = ""
            for( i in 1:(length(g)-1) )
                for (j in (i+1):length(g)) 
                if( strsplit(g[i],"_")[[1]][1] == strsplit(g[j],"_")[[1]][1]| strsplit(g[i],"_")[[1]][2] == strsplit(g[j],"_")[[1]][2]) # only compare WT_control vs. WT_treatment
                    comparisons = c(comparisons,paste(g[j],"-",g[i],sep="" ) )
            comparisons <- comparisons[-1]

            #factors genotype treatment levels
            extract_treatment <- function (x) paste( gsub( ".*_","",unlist( strsplit(x,"-")) ), collapse="-")
            extract_genotype <- function (x) gsub( "_.*","",unlist( strsplit(x,"-")) )[1]
            extract_treatment_counting <- unique( gsub( ".*_","",unlist( strsplit(g,"-")) ))
            treatments = sapply(comparisons, extract_treatment)
            genotypes = sapply(comparisons, extract_genotype)
            Exp.type = paste( Exp.type, "\nFactorial design:",length(unique(genotypes)),"X", length( extract_treatment_counting ), sep="" )

            # pairwise contrasts
            contrast1 <- makeContrasts(contrasts=comparisons[1], levels=design)
            for( kk in 2:length(comparisons) )
                contrast1<-  cbind(contrast1,makeContrasts(contrasts=comparisons[kk], levels=design)   )
            contrast.names = colnames(contrast1)

            # interaction contrasts
            for ( kk in 1:(length(comparisons)-1) ) {
            for( kp in (kk+1):length(comparisons)) 
                if( treatments[kp]== treatments[kk] ) 
                {  
                    contrast1 = cbind(contrast1, contrast1[,kp]- contrast1[,kk] )
                    contrast.names = c(contrast.names, paste("I:",  genotypes[kp], "-", genotypes[kk],"(",gsub("-",".vs.",treatments[kp]),")",sep="" ) )
                }   
            }
            colnames(contrast1)=contrast.names
            comparisons = contrast.names

        } # if interaction terms    
        

        # if sample information is uploaded and user selected factors and comparisons
        if( !is.null(modelFactors) & length( selectedComparisons) >0  ) {
            Exp.type = paste("Model: ~", paste(modelFactors,collapse=" + ") )
            interactionTerm = FALSE # default value to be re-write if needed
            
            # model factors that does not contain interaction terms
            # modelFactors "genotype", "condition", "genotype:condition"
            keyModelFactors = modelFactors[ !grepl(":",modelFactors) ]
            
            # "genotype: control vs. mutant"
            factorsVector= gsub(":.*","",selectedComparisons) # corresponding factors for each comparison
            # remove factors not used in comparison, these are batch effects/pairs/blocks, 
            # keyModelFactors = keyModelFactors[ !is.na(match(keyModelFactors, factorsVector)) ]    
            
            # if a factor is selected both in block and main factors, then use it as block factor
            keyModelFactors = keyModelFactors[ is.na(match(keyModelFactors, blockFactor)) ] 
        
            #------Design matrix            
            sampleInfo2 = sampleInfo[,keyModelFactors,drop=F] # remove factors not used.
            groups = apply(sampleInfo2,1, function(x) paste(x,collapse="_")  )
            g =  unique(groups)  
            
            design <- model.matrix(~ 0+factor(groups))
            colnames(design) <- gsub(".*)","",colnames(design))
        
            if( !is.null(rawCounts) && countsDEGMethods == 2) {  # voom
                v <- voom(rawCounts, design); 
                fit <- lmFit(v, design) 
            } else 
                fit <- lmFit(eset, design)
        
            fit <- eBayes(fit, trend=limmaTrend)    
            
            #-----------Making comaprisons
            if( length(keyModelFactors) != 2 | length(blockFactor) >1 )  { # if only one factor, or more than two then use all pairwise comparisons
                comparisons = gsub(".*: ","",selectedComparisons)
                comparisons = gsub(" vs\\. ","-",comparisons)
            } else if( length(keyModelFactors) == 2 ){ # two key factors

                if( sum(grepl(":",modelFactors) >0) ) {  # interaction? 
                    interactionTerm =TRUE
                    # all pairwise comparisons
                    comparisons = ""
                    for( i in 1:(length(g)-1) )
                        for (j in (i+1):length(g)) 
                        if( strsplit(g[i],"_")[[1]][1] == strsplit(g[j],"_")[[1]][1]| strsplit(g[i],"_")[[1]][2] == strsplit(g[j],"_")[[1]][2]) # only compare WT_control vs. WT_treatment
                            comparisons = c(comparisons,paste(g[j],"-",g[i],sep="" ) )
                    comparisons <- comparisons[-1]
                    
                    # pairwise contrasts
                    contrast1 <- makeContrasts(contrasts=comparisons[1], levels=design)
                    if(length(comparisons)>1 )
                    for( kk in 2:length(comparisons) )
                        contrast1<-  cbind(contrast1,makeContrasts(contrasts=comparisons[kk], levels=design)   )
                    contrast.names = colnames(contrast1)        
                
                    # all possible interactions
                    # interaction contrasts
                    
                    contrast2 = NULL
                    contrast.names =""
                    for ( kk in 1:(dim(contrast1)[2]-1) ) {
                    for( kp in (kk+1):dim(contrast1)[2]) 
                        #if( treatments[kp]== treatments[kk] ) 
                        {  if(is.null(contrast2))  
                            contrast2 = contrast1[,kp]- contrast1[,kk]  else                    
                            contrast2 = cbind(contrast2, contrast1[,kp]- contrast1[,kk] )
                            
                            contrast.names = c(contrast.names, paste0("I:",  colnames(contrast1)[kp], ".vs.", colnames(contrast1)[kk] ) )
                        }   
                    }
                    colnames(contrast2)=contrast.names[-1]
                    
                    # remove nonsense contrasts from interactions
                     contrast2 = contrast2[,which(apply(abs(contrast2),2,max)==1),drop=F]
                     contrast2 = contrast2[,which(apply(abs(contrast2),2,sum)==4),drop=F]   
                     contrast2 = t( unique(t(contrast2)) ) # remove duplicate columns       
                    
                    # remove unwanted contrasts involving more than three levels in either factor
                    keep= c()
                    for( i in 1:dim(contrast2)[2]) {
                        tem = rownames(contrast2)[ contrast2[ ,i ] != 0   ]
                        tem1 = unique (  unlist(gsub("_.*","", tem) ) )
                        tem2 = unique (  unlist(gsub(".*_","", tem) ) )
                        if( length(tem1) == 2 & length(tem2) ==2 )
                        keep = c(keep, colnames(contrast2) [i] )
                    }
                    contrast2 = contrast2[,keep,drop=F]
                    comparisons2 = colnames(contrast2)               
                }

                # "stage: MN vs. EN"  -->  c("MN_AB-EN_AB", "EN_Nodule-EN_AB") 
                #  comparisons in all levels of the other factor 
                transformComparisons <- function (comparison1){
                    tem = gsub(".*: ","",comparison1)
                    tem = unlist(strsplit(tem, " vs\\. ") ) # control  mutant                           
                    factor1= gsub(":.*","",comparison1)

                    ix = match(factor1, keyModelFactors) # 1: first factor, 2: 2nd factor
                    otherFactor = keyModelFactors[3-ix]   # 3-1 = 2; 3-1=1
                    otherFactorLevels = unique( sampleInfo2[,otherFactor] )             
                    comparisons = c( )
                    
                    for (factorLevels in otherFactorLevels) {
                        if( ix == 1){
                            comparisons = c( comparisons, paste(paste0( tem, "_",factorLevels),collapse="-") )
                        } else {
                            comparisons = c( comparisons,  paste(paste0(factorLevels, "_", tem),collapse="-") )
                        }
                    }
                    return(comparisons)     
                }   
                
                comparisons = unlist( sapply(selectedComparisons, transformComparisons ))
                comparisons = as.vector(comparisons)
            
            } # two factors
            
            # make contrasts
            contrast1 <- makeContrasts(contrasts=comparisons[1], levels=design)
            if(length(comparisons) >1 )
            for( kk in 2:length(comparisons) )
                contrast1<-  cbind(contrast1,makeContrasts(contrasts=comparisons[kk], levels=design)   )

            if( interactionTerm ) {  # if interaction terms
                contrast1 = cbind(contrast1,contrast2)
                contrast.names = c(colnames(contrast1), colnames(contrast2) )
                comparisons = c(comparisons,comparisons2)
            }
            
            # block design to remove batch effect or paired samples
            # corfit <- duplicateCorrelation(eset,design,block=targets$Subject)
            # corfit$consensus
            #Then this inter-subject correlation is input into the linear model fit:
            # fit <- lmFit(eset,design,block=targets$Subject,correlation=corfit$consensus)

            if(length(blockFactor) >= 1 ) { # if a factor is selected as block
                if(length(blockFactor) >= 1 ) 
                    blockFactor = blockFactor[1] # if multiple use the first one

                block = sampleInfo[, blockFactor]  # the column not used
            
                if( !is.null(rawCounts) && countsDEGMethods == 2) {  # voom
                    v <- voom(rawCounts, design);
                    corfit <- duplicateCorrelation(v,design,block= block)           
                    fit <- lmFit(v, design,block=block,correlation=corfit$consensus) 
                } else {
                    corfit <- duplicateCorrelation(eset,design,block= block)
                    fit <- lmFit(eset, design,block=block,correlation=corfit$consensus)
                }
                fit <- eBayes(fit, trend=limmaTrend)            
                
            } # block factors


        } # use selected factors

        
        fit2 <- contrasts.fit(fit, contrast1)
        fit2 <- eBayes(fit2, trend=limmaTrend)
        #topTable(fit2, coef=1, adjust="BH")
        results <- decideTests(fit2, p.value=maxP_limma, lfc= log2(minFC_limma ))
        #vennDiagram(results[,1:5],circle.col=rainbow(5))

        #colnames(results) = unlist(sapply( colnames(results), changeNames  ) )
        #comparisons3 <-  unlist(sapply( comparisons, changeNames  ) ) 

        # extract fold change for each comparison
        # there is issues with direction of foldchange. Sometimes opposite
        top <- function (comp) {
            tem <- topTable(fit2, number = 1e12,coef=comp,sort.by="M" ) 
            if(dim(tem)[1] == 0) { return (1) 
            } else  {           
                # compute fold change for the first gene (ranked by absolute value)
                tem2 = as.numeric( x[ which(rownames(x)== rownames(tem)[1]) , ] )
                names(tem2) = colnames(x) 
                    
                return( tem[,c(1,5)]) 
            }  
                                                    
        }  # no significant gene returns 1, otherwise a data frame
        
        
        topGenes <- lapply(comparisons, top)
        topGenes <- setNames(topGenes, comparisons )

        ix <- which( unlist( lapply(topGenes, class) ) == "numeric")
        if( length(ix)>0) topGenes <- topGenes[ - ix ]
        # if (length(topGenes) == 0) topGenes = NULL;
    }
    
    #cat("\n", names(topGenes) )
    #cat("\n", colnames(results))
    #cat("\n", comparisons3)
    #comparisons <- comparisons3
    # it does not make any sense! comparisons can not be changed!
    #comparisons =comparisons3  
    #cat("\n", comparisons)     
        
    return( list(results= results, comparisons=comparisons, Exp.type=Exp.type, topGenes=topGenes)) 
}

# Differential expression using DESeq2
DEG.DESeq2 <- function (  rawCounts,maxP_limma=.05, minFC_limma=2, selectedComparisons=NULL, sampleInfo = NULL,modelFactors=NULL, blockFactor = NULL, referenceLevels=NULL){
    library(DESeq2,verbose=FALSE) # count data analysis
    groups = as.character ( detectGroups( colnames( rawCounts ) ) )
    g = unique(groups)# order is reversed
    
    
    # check for replicates, removes samples without replicates
    reps = as.matrix(table(groups)) # number of replicates per biological sample
    if ( sum( reps[,1] >= 2) <2 ) # if less than 2 samples with replicates
    return( list(results= NULL, comparisons = NULL, Exp.type="Failed to parse sample names to define groups. 
        Cannot perform DEGs and pathway analysis. Please double check column names! Use WT_Rep1, WT_Rep2 etc. ", topGenes=NULL)) 
    # remove samples without replicates
    g <- rownames(reps)[which(reps[,1] >1)]
    ix <- which( groups %in% g)  
    groups <- groups[ix]   
    rawCounts <- rawCounts[,ix] 
    
    
    Exp.type = paste(length(g)," sample groups detected.")
    comparisons = ""
    for( i in 1:(length(g)-1) )
        for (j in (i+1):length(g)) 
        comparisons = c(comparisons,paste(g[j],"-",g[i],sep="" ) )
    comparisons <- comparisons[-1]

    colData = cbind(colnames(rawCounts), groups )

    # no sample file, but user selected comparisons using column names
    if( is.null(modelFactors) & length( selectedComparisons) >0  )  
        comparisons = selectedComparisons
        
    comparisons2 = comparisons   # this is for showing comparison names, which might be different from internally   
    # Set up the DESeqDataSet Object and run the DESeq pipeline
    dds = DESeqDataSetFromMatrix(countData=rawCounts,
                                colData=colData,
                                design=~groups)                             
    
    if( is.null(modelFactors)  ) 
        dds = DESeq(dds)    else  
    {    # using selected factors and comparisons
        # build model
        modelFactors = c(modelFactors,blockFactor) # block factor is just added in. 

        factors = modelFactors   # selected factors and interactions: c( "strain", "treatment",  "strain:treatment")
        factors = factors[ !grepl(":",factors )]   # non-interaction terms
        # interaction terms like strain:treatment
        Interactions = modelFactors[ grepl(":",modelFactors )]
        
        colData = sampleInfo  
        factorsCoded = toupper(letters )[1: dim(colData)[2] ]   # Factors are encoded as "A", "B", "C"; this avoid illigal letters
        names(factorsCoded) =  colnames(colData)  # this is for look up; each column of sampleInfo  
        colnames(colData) = factorsCoded # all columns named A B C D 

        colData = as.data.frame(colData)
        
        # set reference levels for factors
        if(! is.null( referenceLevels) ) {   # c("genotype:wt", "treatment:control" )
            # first factor
            for ( refs in referenceLevels)
                if(! is.null( refs) ) {
                    ix = match(gsub(":.*","",refs), colnames(sampleInfo) ) # corresponding column id for factor
                    colData[,ix] = as.factor( colData[,ix] )
                    colData[,ix] = relevel(colData[,ix],gsub(".*:","",refs)  )
                }
        }
                
        # base model
        DESeq2.Object= paste("dds = DESeqDataSetFromMatrix(countData=rawCounts, colData=colData, design=~ ", 
                             paste( factorsCoded[factors],collapse="+")) # only use selected factors        
        Exp.type = paste("Model: ~", paste(modelFactors,collapse=" + ") )

        
        # create model
        if( length(Interactions)>0 ) { # if there is interaction
            for( interactionTerms in Interactions) {
                interactingFactors = unlist(strsplit(interactionTerms,":" ) )  # split strain:treatment as "strain" and "mutant"
                tem = paste(factorsCoded [ interactingFactors ],collapse=":")   # convert "strain:mutant" to "A:B"
                DESeq2.Object = paste(DESeq2.Object, " + ",tem)
            }           
        }
        DESeq2.Object= paste( DESeq2.Object, ")") # ends the model

        eval(parse(text = DESeq2.Object) )
        dds = DESeq(dds)  # main function       
        
        # comparisons 
        # "group: control vs. mutant"
        comparisons = gsub(".*: ","",selectedComparisons)
        comparisons = gsub(" vs\\. ","-",comparisons)
        factorsVector= gsub(":.*","",selectedComparisons) # corresponding factors for each comparison
        
        # comparison2 holds names for display with real factor names
        # comparison  is used in calculation it is A, B, C for factors
        comparisons2 = comparisons    
        #comparisons2 = gsub(" vs\\. ","-",selectedComparisons) 
        #comparisons2 = gsub(":","_",comparisons2)      
        # Note that with interaction terms, not all meaningful comparisons is listed for selection. 
        # this is complex. Only under reference level.
        
        # comparisons due to interaction terms
        if( length(Interactions)>0 ) { # if there is interaction
            interactionComparisons = resultsNames(dds)
            interactionComparisons = interactionComparisons[ grepl("\\.",interactionComparisons )   ]
    
            comparisons = c(comparisons,interactionComparisons )
            
            # translate comparisons generated in interaction terms back to real factor names
            interactionComparisons2 = interactionComparisons
            for ( i in 1:length(interactionComparisons2 ) ) {
                tem = unlist(strsplit(interactionComparisons2[i],"\\." ) )
                tem_factors = substr(tem,1,1) 
                
                tem_factors[1] = names(factorsCoded)[factorsCoded == tem_factors[1]]  # get the first letter and translate into real factor names
                tem_factors[2] = names(factorsCoded)[factorsCoded == tem_factors[2]]  # get the 2nd letters and translate into real factor names

                interactionComparisons2[i] <- paste0( "I:",tem_factors[1], "_",substr(tem[1],2,nchar(tem[1]) ),".",
                                                              tem_factors[2], "_",substr(tem[2],2,nchar(tem[2]) ) 
                                                    )               
            }
            comparisons2 = c(comparisons2,interactionComparisons2 )
        }
    } # if selected factors 
    
    # extract contrasts according to comprisons defined above
    result1 = NULL; allCalls = NULL;
    topGenes = list(); pk = 1 # counter
    pp=0 # first results?
    for( kk in 1:length(comparisons) ) {
        tem = unlist( strsplit(comparisons[kk],"-") )
        
        if(is.null(modelFactors)) # if just group comparison using sample names
            selected = results(dds, contrast=c("groups", tem[1], tem[2]) )   else {
            if(!grepl("\\.", comparisons[kk] ) )    # if not interaction term: they contain .  interaction term
                selected = results(dds, contrast=c( factorsCoded[ factorsVector[kk] ],tem[1], tem[2]) ) else # either A, B, C ...
                selected = results(dds, name=comparisons[kk] ) # interaction term
            }

        selected$calls =0   
        selected$calls [which( selected$log2FoldChange > log2(minFC_limma) & selected$padj < maxP_limma ) ]  <-  1
        selected$calls [ which( selected$log2FoldChange <  -log2(minFC_limma) & selected$padj < maxP_limma ) ] <-  -1
        colnames(selected)= paste( as.character(comparisons2[kk]), "___",colnames(selected),sep="" )
        selected = as.data.frame(selected)
        if (pp==0){  # if first one with significant genes, collect gene list and Pval+ fold
        result1 = selected; pp = 1; 
        # selected[,2] <- -1 * selected[,2] # reverse fold change direction
        topGenes[[1]] = selected[,c(2,6)]; 
        names(topGenes)[1] = comparisons2[kk]; } else 
            { result1 = merge(result1,selected,by="row.names"); 
                rownames(result1) = result1[,1]; 
                result1 <- result1[,-1]
                pk= pk+1; 
                # selected[,2] <- -1 * selected[,2] # reverse fold change direction
                topGenes[[pk]] = selected[,c(2,6)]; 
                names(topGenes)[pk] = comparisons2[kk];  # assign name to comprison
            }
    }

    Interactions = c()
    if( !is.null(modelFactors) )
        Interactions = modelFactors[ grepl(":",modelFactors )]
        
#---  add comprisons for non-reference levels. It adds to the results1 object.  
    if( length(Interactions)>0 ) { # if there is interaction
        factorLookup=c() # a factor whose values are factors and names are factor and level combination conditionTreated, genotypeWT
        levelLookup = c()
        
        for( i in 1:dim(sampleInfo)[2]) {
            sampleInfo2 = unique(sampleInfo)
            tem = rep(toupper(letters)[i],dim(sampleInfo2)[1]  )
            names(tem) = paste0(toupper(letters)[i],sampleInfo2[,i])
            factorLookup = c(factorLookup,tem)  
            
            tem = as.character( sampleInfo2[,i] )
            names(tem) = paste0(toupper(letters)[i],sampleInfo2[,i])
            levelLookup = c(levelLookup, tem)
        }
        
        # split  genotypeI.conditionTrt --> c("genotype","I","conditoin","Trt")
        splitInteractionTerms <- function (term) {
            if(!grepl("\\.",term) ) return(NULL)
            terms2 = unlist(strsplit(term,"\\.") )
                     # factor1, level1, factor2, level2
            return(c(factorLookup[terms2[1]], levelLookup[terms2[1]],factorLookup[terms2[2]], levelLookup[terms2[2]]   ) )
        }
        # none interaction terms 
        NoneInterTerms = resultsNames(dds)[ !grepl( "\\.", resultsNames(dds)) ]
        NoneInterTerms=NoneInterTerms[-1]
        allInteractionTerms = resultsNames(dds)[ grepl( "\\.", resultsNames(dds)) ]


        for( kk in 1:length(NoneInterTerms) ) { # for each none interaction term
            if(!is.null(modelFactors) ) {# if not just group comparison using sample names
                    #current factor
                    cFactor = gsub("_.*","",NoneInterTerms[kk] )
                
                    for(interactionTerm in allInteractionTerms ) {
                    
                        splited = splitInteractionTerms (interactionTerm)  # 4 components
                        if (cFactor != splited[1] & cFactor != splited[3]  ) 
                            next;                       
                        
                        selected = results(dds, list(c( NoneInterTerms[kk],interactionTerm ) ) ) 
                        comparisonName = paste0( NoneInterTerms[kk],"__", gsub("\\.","",interactionTerm) )
                        
                        if( cFactor == splited[1] )
                            otherLevel = splited[4] else otherLevel = splited[2]
                            
                        comparisonName = paste0(#names(factorsCoded)[which(factorsCoded==cFactor)], # real factor name
                                                gsub("_vs_","-", substr(NoneInterTerms[kk], 3, nchar(NoneInterTerms[kk]  )  )), # the comparison
                                                "_for_",otherLevel)
                        comparisons2 = c(comparisons2, comparisonName)
                        selected$calls =0   
                        selected$calls [which( selected$log2FoldChange > log2(minFC_limma) & selected$padj < maxP_limma ) ]  <-  1
                        selected$calls [ which( selected$log2FoldChange <  -log2(minFC_limma) & selected$padj < maxP_limma ) ] <-  -1
                        colnames(selected)= paste( comparisonName, "___",colnames(selected),sep="" )
                        selected = as.data.frame(selected)
                        if (pp==0){  # if first one with significant genes, collect gene list and Pval+ fold
                            result1 = selected; pp = 1; 
                            # selected[,2] <- -1 * selected[,2] # reverse fold change direction
                            topGenes[[1]] = selected[,c(2,6)]; 
                            names(topGenes)[1] = comparisonName; } else 
                            { result1 = merge(result1,selected,by="row.names"); 
                                rownames(result1) = result1[,1]; 
                                result1 <- result1[,-1]
                                pk= pk+1; 
                                # selected[,2] <- -1 * selected[,2] # reverse fold change direction
                                topGenes[[pk]] = selected[,c(2,6)]; 
                                names(topGenes)[pk] = comparisonName;  # assign name to comprison
                            }
                    } #for  
                        
            } #if
        } #for
    
    
    } #if




#---
    #if( length(comparisons) == 1) topGenes <- topGenes[[1]] # if only one comparison, topGenes is not a list, just a data frame itself.
    if(! is.null(result1)) { 
    # note that when you only select 1 column from a data frame it automatically converts to a vector. drop =FALSE prevents that.
    allCalls = as.matrix( result1[,grep("calls",colnames(result1)), drop = FALSE  ] )
    colnames(allCalls)= gsub("___.*","", colnames(allCalls))
    colnames(allCalls)= gsub("\\.","-", colnames(allCalls)) # note that samples names should have no "."
    colnames(allCalls)= gsub("^I-","I:", colnames(allCalls))
    }
    return( list(results= allCalls, comparisons = comparisons2, Exp.type=Exp.type, topGenes=topGenes)) 
}

# main function
limma <- function() {  
    if(input_dataFileFormat == 1 ) {  # if count data
         if(input_CountsDEGMethod == 3 ) {    # if DESeq2 method
                # rawCounts = read.csv("exampleData/airway_GSE52778.csv", row.names=1)
                # res =DEG.DESeq2(rawCounts, .05, 2) 
                # res1 =DEG.limma(rawCounts, .1, 1.5,rawCounts, 2,3) 
                    
            return(   DEG.DESeq2( convertedCounts.out,input_limmaPval, input_limmaFC,
                                    input_selectModelComprions, readSampleInfo.out,
                                    c(input_selectFactorsModel,input_selectInteractions), 
                                    input_selectBlockFactorsModel, factorReferenceLevels.out )  )
            }
            if(input_CountsDEGMethod < 3 )    # voom or limma-trend
                return( DEG.limma(convertedData.out, input_limmaPval, input_limmaFC,
                                    convertedCounts.out, input_CountsDEGMethod,
                                    priorCounts=input_countsLogStart,input_dataFileFormat,
                                    input_selectModelComprions, readSampleInfo.out,
                                    c(input_selectFactorsModel,input_selectInteractions),
                                    input_selectBlockFactorsModel) )
    } else if (input_dataFileFormat == 2 ){ # normalized data
     return( DEG.limma(convertedData.out, input_limmaPval, input_limmaFC,
                        convertedCounts.out, input_CountsDEGMethod,
                        priorCounts=input_countsLogStart,input_dataFileFormat,
                        input_selectModelComprions, readSampleInfo.out,
                        c(input_selectFactorsModel,input_selectInteractions),
                        input_selectBlockFactorsModel) )
    } else {   # dataFileFormat == 3 user just uploaded fold change matrix
    
        x = convertedData.out
        
        pvals = convertedPvals.out
        if(!is.null(pvals) ) {
          ix = match(rownames(x), rownames(pvals))
          pvals = pvals[ix,]
        }


        # looks like ratio data, take log2
        if( sum(round(apply(x,2, median) + .2) == 1 ) == dim(x)[2] & min(x) > 0) 
            x = log2(x)
        
        Exp.type = "None standard data without replicates."
        all.Calls = x # fake calls
        for( i in 1: dim(all.Calls)[2]) { 
            tem <- all.Calls[,i]
            all.Calls[which( tem <= log2(input_limmaFC) & tem >=  -log2(input_limmaFC) ) ,i] = 0            
            all.Calls[which( tem >  log2(input_limmaFC) ) ,i] = 1
            all.Calls[which( tem < -log2(input_limmaFC) ) ,i] = -1      
            if(!is.null(pvals) ) 
                all.Calls[ which( pvals[,i] > input_limmaPval),i] = 0
        }
        comparisons = colnames(all.Calls)
        extractColumn <- function (i) {
            topGenes = as.data.frame( convertedData.out[,i,drop=FALSE])
            if(is.null(pvals) ) topGenes$FDR = 0 else 
                topGenes$FDR = pvals[,i]# fake fdr
                
            colnames(topGenes) = c("Fold","FDR")
            return(topGenes)    
        } 
        topGenes = lapply( 1:dim( x )[2], extractColumn )
        topGenes <- setNames(topGenes, colnames(x ) )
        
        return( list(results= all.Calls, comparisons = colnames(x ), Exp.type=Exp.type, topGenes=topGenes) )
    }
        
}

    
DEG.data <- function() {

          genes = limma.out$results
          genes = as.data.frame( genes[which( rowSums( abs (genes) ) != 0 ),] )
          colnames(genes) = colnames( limma.out$results )
          genes = merge(genes,convertedData.out, by='row.names')
          colnames(genes)[1] = "1: upregulation, -1: downregulation"
            # add gene symbol
        ix = match( genes[,1], allGeneInfo.out[,1])
        genes <- cbind(as.character( allGeneInfo.out$symbol)[ix],genes) 
        colnames(genes)[1] = "Symbol"
        genes <- genes[,c(2,1,3:dim(genes)[2]) ]
        return(genes)
        }
  

vennPlot <- function() {
    
            results = limma.out$results

            # split by up or down regulation
            if(input_UpDownRegulated) {     
                resultUp = results; 
                resultUp[resultUp < 0 ] <- 0;
                colnames(resultUp) = paste0("Up_", colnames(resultUp))
                resultDown = results; 
                resultDown[resultDown > 0] <- 0;
                colnames(resultDown) = paste0("Down_", colnames(resultDown))                
                results <- cbind(resultUp, resultDown)
            }           
            
            ixa = c()
            
            
            for (comps in  input_selectComparisonsVenn) { 
                 if(!grepl("^I:|^I-|^Up_I:|^Up_I-|^Down_I:|^Down_I-", comps) ) {  # if not interaction term
                    ix = match(comps, colnames(results)) 
                  } else {
                        #mismatch in comparison names for interaction terms for DESeq2
                        #I:water_Wet.genetic_Hy      in the selected Contrast
                        #Diff-water_Wet-genetic_Hy   in column names
                        tem = gsub("^I-","I:" ,colnames(results))
                        tem = gsub("-","\\.",tem)
                        ix = match(comps, tem) 

                        if(is.na(ix) ) # this is for limma package
                            ix = match(comps, colnames(results))                        
                      }
                    ixa = c(ixa,ix)
                  }
                    
            results = results[,ixa,drop=FALSE] # only use selected comparisons
            if(dim(results)[2] >5) results <- results[,1:5]
            colnames(results) = gsub("^I-","I:" ,colnames(results)) 
            
            vennDiagram(results,circle.col=rainbow(5), cex=c(1.,1, 0.7) ) # part of limma package


  }


sigGeneStats <- function( ){
        
        results = limma.out$results

         library(reshape2)
         Up =  apply(results, 2, function(x) sum(x == 1) )
         Down = apply(results, 2, function(x) sum(x == -1) ) 
         stats = rbind(Up, Down)
                 
         gg <- melt(stats)

         colnames(gg) = c("Regulation","Comparisons","Genes")
         
 
         p= ggplot(gg, aes(x=Comparisons, y=Genes, fill=  Regulation )  )+
             geom_bar(position="dodge", stat="identity") + coord_flip() +
             theme(legend.position = "top") + 
             scale_fill_manual(values=c("red", "blue")) +
             ylab("Number of differntially expressed genes") +
             theme(axis.title.y=element_blank(),
                axis.text=element_text(size=14)) 
            
        p
            

}


sigGeneStatsTable <- function( ) {

        results = limma.out$results

         Up =  apply(results, 2, function(x) sum(x == 1) )
         Down = apply(results, 2, function(x) sum(x == -1) ) 
         stats = rbind(Up, Down)
         stats = t(stats)
         stats=cbind(rownames(stats), stats)
         colnames(stats)[1]="Comparisons"

         return(as.data.frame(stats))

}


selectedHeatmap.data <- function() {
          genes <- limma.out$results
          if( is.null(genes) ) return(NULL)
          if(!grepl("I:", input_selectContrast) ) {  # if not interaction term
            ix = match(input_selectContrast, colnames(genes)) 
          } else {
            #mismatch in comparison names for interaction terms for DESeq2
            #I:water_Wet.genetic_Hy      in the selected Contrast
            #Diff-water_Wet-genetic_Hy   in column names
            tem = gsub("I-","I:" ,colnames(genes))
            tem = gsub("-","\\.",tem)
            ix = match(input_selectContrast, tem) 
            
            if(is.na(ix) ) # this is for limma package
                ix = match(input_selectContrast, colnames(genes))           
            
          }
      
          if(is.null(ix)) return(NULL)
          if(is.na(ix)) return(NULL)
          if( sum(abs(genes[,ix] )  ) <= 1 ) return(NULL) # no significant genes for this comparison
          if(dim(genes)[2] < ix ) return(NULL)
          query = rownames(genes)[which(genes[,ix] != 0)]
          if(length(query) == 0) return(NULL)
          iy = match(query, rownames(convertedData.out  ) )
          

         iz = findContrastSamples(  input_selectContrast, 
                                    colnames(convertedData.out),
                                    readSampleInfo.out,
                                    input_selectFactorsModel,
                                    input_selectModelComprions, 
                                    factorReferenceLevels.out,
                                    input_CountsDEGMethod,
                                    input_dataFileFormat
                                )
    
        # color bar
         bar = genes[,ix]
         bar = bar[bar!=0]

         # retreive related data         
         genes = convertedData.out[iy,iz,drop=FALSE]
         
         genes = genes[order(bar),,drop=FALSE] # needs to be sorted because myheatmap2 does not reorder genes
         bar = sort(bar)


         return(list(genes=genes, bar=bar ))

    
}
# find sample index for selected comparisons
findContrastSamples <- function(selectContrast, allSampleNames,sampleInfo=NULL, selectFactorsModel=NULL,selectModelComprions =NULL , referenceLevels=NULL, countsDEGMethod=NULL, dataFileFormat=NULL ){
    iz= match( detectGroups(allSampleNames), unlist(strsplit( selectContrast, "-"))   )
    iz = which(!is.na(iz))       
    if ( !is.null(sampleInfo) & !is.null(selectFactorsModel) & length(selectModelComprions)>0 ) {

        comparisons = gsub(".*: ","",selectModelComprions)   # strings like: "groups: mutant vs. control"
        comparisons = gsub(" vs\\. ","-",comparisons)       
        factorsVector= gsub(":.*","",selectModelComprions) # corresponding factors

          # if read counts data and DESeq2
        if(dataFileFormat==1 & countsDEGMethod == 3) { # if DESeq2
            contrast = gsub("_for_.*","",selectContrast) # could be "wt-mu"   or "wt-mu_for_conditionB"
            ik = match( contrast, comparisons )   # selected contrast lookes like: "mutant-control"

            otherFactorLevel = gsub(".*_for_","",selectContrast)
            # find the corresponding factor for the other factor
            otherFactor=" "
            if(nchar( otherFactorLevel ) >0){
                for( eachFactor in colnames(sampleInfo) )
                    if ( otherFactorLevel %in%  sampleInfo[,eachFactor ] )
                        otherFactor = eachFactor        
            }
            
            if (is.na(ik)) iz=1:(length(allSampleNames))  else {  # interaction term, use all samples       
                selectedfactor= factorsVector[ ik ] # corresponding factors

                iz = which(sampleInfo[,selectedfactor] %in%  unlist(strsplit( contrast, "-")) )

                #filter by other factors: reference level
                if(! is.null( referenceLevels) ) {   # c("genotype:wt", "treatment:control" )
                    for ( refs in referenceLevels)
                        if(! is.null( refs) & gsub(":.*","",refs) != selectedfactor ) {
                            currentFactor = gsub(":.*","",refs)
                            if(nchar( otherFactorLevel ) >0 & currentFactor == otherFactor ) { # if not reference level
                                iz = intersect( iz, which(sampleInfo[,currentFactor] == otherFactorLevel  ) )
                            } else
                                iz = intersect( iz, which(sampleInfo[,currentFactor] == gsub(".*:","",refs)  ) )
                            
                        }
                }           
                iz = iz[which(!is.na(iz))]
                
            # switching from limma to DESeq2 causes problem, as reference level is not defined.

            } 
            
        } else {  # not DESeq2
            
                    # given level find corresponding sample ids
                    findIDsFromLevel <- function (aLevel){
                        # find factor
                        currentFactor=""
                        for( eachFactor in colnames(sampleInfo) )
                            if ( aLevel %in%  sampleInfo[,eachFactor ] )
                                currentFactor = eachFactor          
                        if(nchar(currentFactor) >0 ) 
                            return( which(sampleInfo[,currentFactor ] %in% aLevel   )  ) else return(NULL)
                    }       
                    
                    if( !grepl(".*_.*-.*_.*",selectContrast )) iz = c.out
                    levels4 = unlist( strsplit( unlist( strsplit(selectContrast,"-") ), "_") ) #double split!
                    if(length(levels4)!=4) { 
                        iz = c.out 
                    } else {
                        iz = intersect( findIDsFromLevel(levels4[1]),  findIDsFromLevel(levels4[2])  ) # first sample
                        iz = c(iz, intersect( findIDsFromLevel(levels4[3]),  findIDsFromLevel(levels4[4])  )   ) # 2nd sample
                        }
                } #else
            
            
            
        }   
     
     if (grepl("I:",selectContrast)) iz=1:length(allSampleNames) # if it is factor design use all samples
     if( is.na(iz)[1] | length(iz)<=1 )    iz=1:length(allSampleNames)

    return(iz)
}



selectedHeatmap <- function() {

         bar = selectedHeatmap.data.out$bar +2;
         bar[bar==3] =2

         myheatmap2( selectedHeatmap.data.out$genes,bar,200,mycolor=input_heatColors1,c("Down","Up") )
     
}



selectedHeatmap.data <- function(){

          genes <- limma.out$results
          if( is.null(genes) ) return(NULL)
          if(!grepl("I:", input_selectContrast) ) {  # if not interaction term
            ix = match(input_selectContrast, colnames(genes)) 
          } else {
            #mismatch in comparison names for interaction terms for DESeq2
            #I:water_Wet.genetic_Hy      in the selected Contrast
            #Diff-water_Wet-genetic_Hy   in column names
            tem = gsub("I-","I:" ,colnames(genes))
            tem = gsub("-","\\.",tem)
            ix = match(input_selectContrast, tem) 
            
            if(is.na(ix) ) # this is for limma package
                ix = match(input_selectContrast, colnames(genes))           
            
          }
      
          if(is.null(ix)) return(NULL)
          if(is.na(ix)) return(NULL)
          if( sum(abs(genes[,ix] )  ) <= 1 ) return(NULL) # no significant genes for this comparison
          if(dim(genes)[2] < ix ) return(NULL)
          query = rownames(genes)[which(genes[,ix] != 0)]
          if(length(query) == 0) return(NULL)
          iy = match(query, rownames(convertedData.out  ) )
          

         iz = findContrastSamples(  input_selectContrast, 
                                    colnames(convertedData.out),
                                    readSampleInfo.out,
                                    input_selectFactorsModel,
                                    input_selectModelComprions, 
                                    factorReferenceLevels.out,
                                    input_CountsDEGMethod,
                                    input_dataFileFormat
                                )
    
        # color bar
         bar = genes[,ix]
         bar = bar[bar!=0]

         # retreive related data         
         genes = convertedData()[iy,iz,drop=FALSE]
         
         genes = genes[order(bar),,drop=FALSE] # needs to be sorted because myheatmap2 does not reorder genes
         bar = sort(bar)


         return(list(genes=genes, bar=bar ))

}



DEG.data <- function(){

    genes = limma.out$results
    genes = as.data.frame( genes[which( rowSums(genes) != 0 ),] )
    colnames(genes) = colnames( limma.out$results )
    genes = merge(genes,convertedData.out, by='row.names')
    colnames(genes)[1] = "1: upregulation, -1: downregulation"
    # add gene symbol
    ix = match( genes[,1], allGeneInfo.out[,1])
    genes <- cbind(as.character( allGeneInfo.out$symbol)[ix],genes) 
    colnames(genes)[1] = "Symbol"
    genes <- genes[,c(2,1,3:dim(genes)[2]) ]
    return(genes)
}


AllGeneListsGMT <- function() {

        
            results = limma.out$results

            results2 = cbind(results, results)
            colnames(results2)= c( paste0("UP_", colnames(results)),paste0("Down_", colnames(results)) )
            for(i in 1:dim(results)[2] ) {
                results2[,i*2-1] = results[,i]
                results2[ which(results2[,i*2-1] < 0 ) , i*2-1] = 0
                results2[,i*2] = results[,i]    
                results2[ which(results2[,i*2] > 0 ) , i*2] = 0 
            }

            geneList1 <- function (i) {
                ix = which(results2[,i] !=0 )
                return(paste0(colnames(results2)[i],"\t", length(ix),"\t",
                        paste(rownames(results2 )[ix], collapse="\t" ) ) )          
            }
            tem = sapply(1:dim(results2)[2],geneList1 )
                
            return( paste(tem, collapse="\n") )
}



geneListData <- function( ) {
        
        noSig = as.data.frame("No significant genes find!")
        if( is.null(input_selectContrast) ) return(NULL)
        if( is.null( limma.out$comparisons ) ) return(NULL) # if no significant genes found
        if( length(limma.out$topGenes) == 0 ) return(noSig)
        if(length( limma.out$comparisons)  ==1 )  
        { top1=limma.out$topGenes[[1]]  
        } else {
          top = limma.out$topGenes
          ix = match(input_selectContrast, names(top))
          if( is.na(ix)) return (noSig)
          top1 <- top[[ix]]; 
          }
          if(dim(top1)[1] == 0 ) return (noSig)
          colnames(top1)= c("Fold","FDR")
          #top1 = merge(top1,convertedData(), by='row.names')
          #colnames(top1)[1] = "Genes"
          top1 = top1[order(-abs(top1$Fold)) ,]
          if ( length( which( top1$FDR <=  input_limmaPval  &  abs(top1$Fold)  >= log2(input_limmaFC) ) ) == 0 )
            return( noSig)
          top1 <- top1[which(top1$FDR <=  input_limmaPval ) ,]
          top1 <- top1[which(abs(top1$Fold)  >= log2( input_limmaFC)) ,]
          top1$Top_Genes <- rownames(top1)
          top1 <- top1[,c(3,1,2)]
          
          # if new species
          if( input_selectGO2 == "ID not recognized!" | input_selectOrg == "NEW") return (top1); 
          
          #convertedID = convertID(top1[,1],input_selectOrg, "GOBP" );#"gmax_eg_gene"
          # tem <- geneInfo(convertedID,input_selectOrg) #input_selectOrg ) ;
         #  tem <- geneInfo(converted(),input_selectOrg)
          top1 <- merge(top1, allGeneInfo.out, by.x ="Top_Genes", by.y="ensembl_gene_id",all.x=T )
          
          if ( sum( is.na(top1$band)) == dim(top1)[1] ) top1$chr = top1$chromosome_name else
            top1$chr = paste( top1$chromosome_name, top1$band,sep="")
          
          top1 <- top1[,c('Top_Genes','Fold','FDR','symbol','chr','gene_biotype')]
          
         
        #  ix = match(top1[,1], tem$ensembl_gene_id)
         # if( sum(is.na( tem$Symbol[ix]) ) != length(ix) ) 
          # { top1 <- cbind(top1, tem$Symbol[ix]); colnames(top1)[4]= "Symbol" }
          top1 = top1[order(-abs(as.numeric( top1$Fold))) ,]
          top1$FDR <- sprintf("%-3.2e",top1$FDR )
          colnames(top1) <- c("Ensembl ID", "log2 Fold Change", "Adj.Pval", "Symbol","Chr","Type")
          if ( sum( is.na(top1$Symbol)) == dim(top1)[1] ) top1 <- top1[,-4] 

          return(top1)
    
  }
  

  
volcanoPlot <- function( ) {
    if(length( limma.out$comparisons)  ==1 )  
    { top1=limma.out$topGenes[[1]]  
    } else {
      top = limma.out$topGenes
      ix = match(input_selectContrast, names(top))
      if( is.na(ix)) return (NULL)
      top1 <- top[[ix]]; 
      }

      if(dim(top1)[1] == 0 ) return (NULL)
      colnames(top1)= c("Fold","FDR")
     top1 <- as.data.frame(top1) # convert to data frame
     top1 <- top1[which(!(is.na(top1$Fold)|is.na(top1$FDR)    )),] # remove NA's 
     top1$upOrDown <- 1
     #write.csv(top1,"tem.csv")
     top1$upOrDown[ which(top1$FDR <=  input_limmaPval& top1$Fold  >= log2( input_limmaFC)) ]  <- 2
     top1$upOrDown[ which(top1$FDR <=  input_limmaPval & top1$Fold  <= -log2( input_limmaFC)) ]  <- 3
     par(mar=c(5,5,1,1))
     plot(top1$Fold,-log10(top1$FDR),col = c("grey30", "red","blue")[top1$upOrDown],
     pch =16, cex = .3, xlab= "log2 fold change", ylab = "- log10 (FDR)",
     cex.lab=2, cex.axis=2, cex.main=2, cex.sub=2   )    
     legend("bottomright",c("Upregulated","Downregulated"),fill = c("red","blue"),cex=1.1 )
     
}

scatterPlot <- function( ){
 
    if(length( limma.out$comparisons)  ==1 )  
    { top1=limma.out$topGenes[[1]]  
    } else {
      top = limma.out$topGenes
      ix = match(input_selectContrast, names(top))
      if( is.na(ix)) return (NULL)
      top1 <- top[[ix]]; 
      }
      if(dim(top1)[1] == 0 ) return (NULL)
      colnames(top1)= c("Fold","FDR")
     top1 <- as.data.frame(top1) # convert to data frame
     top1 <- top1[which(!(is.na(top1$Fold)|is.na(top1$FDR)    )),] # remove NA's 
     top1$upOrDown <- 1
     #write.csv(top1,"tem.csv")
     top1$upOrDown[ which(top1$FDR <=  input_limmaPval& top1$Fold  >= log2( input_limmaFC)) ]  <- 2
     top1$upOrDown[ which(top1$FDR <=  input_limmaPval & top1$Fold  <= -log2( input_limmaFC)) ]  <- 3


     
         iz = findContrastSamples(  input_selectContrast, 
                                    colnames(convertedData.out),
                                    readSampleInfo.out,
                                    input_selectFactorsModel,
                                    input_selectModelComprions, 
                                    factorReferenceLevels.out,
                                    input_CountsDEGMethod,
                                    input_dataFileFormat
                                )
     
     genes <- convertedData.out[,iz]
     
     g = detectGroups(colnames(genes))
     
     if(length(unique(g))  > 2) { plot.new(); text(0.5,0.5, "Not available.") } else{
        average1 <- apply( genes[, which( g == unique(g)[1] ) ],1,mean)

        average2 <- apply(  genes[, which( g == unique(g)[2] ) ],1,mean)

        genes2 <- cbind(average1,average2)
        rownames(genes2) = rownames(genes)
        genes2 <-  merge(genes2,top1,by="row.names")

        
        par(mar=c(5,5,1,1))
        plot(genes2$average2,genes2$average1,col = c("grey45","red","blue")[genes2$upOrDown],
        pch =16, cex = .3, xlab= paste("Average expression in", unique(g)[2] ), 
        ylab = paste("Average expression in", unique(g)[1] ),
        cex.lab=2, cex.axis=2, cex.main=2, cex.sub=2    )    
        legend("bottomright",c("Upregulated","Downregulated"),fill = c("red","blue"),cex=1.3 )


     
     }
     
}


MAplot <- function ( ) {
 
    if(length( limma.out$comparisons)  ==1 )  
    { top1=limma.out$topGenes[[1]]  
    } else {
      top = limma.out$topGenes
      ix = match(input_selectContrast, names(top))
      if( is.na(ix)) return (NULL)
      top1 <- top[[ix]]; 
      }
      if(dim(top1)[1] == 0 ) return (NULL)
      colnames(top1)= c("Fold","FDR")
     top1 <- as.data.frame(top1) # convert to data frame
     top1 <- top1[which(!(is.na(top1$Fold)|is.na(top1$FDR)    )),] # remove NA's 
     top1$upOrDown <- 1
     #write.csv(top1,"tem.csv")
     top1$upOrDown[ which(top1$FDR <=  input_limmaPval& top1$Fold  >= log2( input_limmaFC)) ]  <- 2
     top1$upOrDown[ which(top1$FDR <=  input_limmaPval & top1$Fold  <= -log2( input_limmaFC)) ]  <- 3
     
         iz = findContrastSamples(  input_selectContrast, 
                                    colnames(convertedData.out),
                                    readSampleInfo.out,
                                    input_selectFactorsModel,
                                    input_selectModelComprions, 
                                    factorReferenceLevels.out,
                                    input_CountsDEGMethod,
                                    input_dataFileFormat
                                )
     


        average1 <- as.data.frame( apply( convertedData()[,iz],1,mean) )
        colnames(average1) = "Average"
        rownames(average1) = rownames(convertedData.out)
        
        genes2 <-  merge(average1,top1,by="row.names")

        par(mar=c(5,5,1,1))
        plot(genes2$Average,genes2$Fold,col = c("grey45","red","blue")[genes2$upOrDown],
        pch =16, cex = .3, xlab= "Average expression", 
        ylab = "Log2 fold change",
        cex.lab=2, cex.axis=2, cex.main=2, cex.sub=2    )   
            abline(h=0)
        legend("bottomright",c("Upregulated","Downregulated"),fill = c("red","blue"),cex=1.3 )
 
}


geneListGOTable <- function() {     
        NoSig=NULL
        # using expression data
        genes <- selectedHeatmap.data.out$genes
        if(is.null(genes) ) return(NULL) 
        if(dim(genes)[1] <= minGenesEnrichment ) return(NoSig) # if has only few genes
        
        fc = selectedHeatmap.data.out$bar       
        # GO
        results1 <- NULL; result <- NULL
        pp <- 0
        for( i in c(1,-1) ) {

            
            if( length(which(fc*i<0)) <= minGenesEnrichment) next; 
            query = rownames(genes)[which(fc*i<0)]
            if( length(query) <= minGenesEnrichment) next;  
            

            result <- findOverlapGMT( query, GeneSets.out,1) 


            if( dim(result)[2] ==1) next;   # result could be NULL
            if(i == -1) result$direction = "Up regulated"  else result$direction = "Down regulated" # changed
            if (pp==0 ) { results1 <- result; pp = 1;} else  results1 = rbind(results1,result)
        }

        if ( pp == 0 ) return (NoSig)
        if ( is.null( results1) ) return (NoSig)
        if( dim(results1)[2] == 1 ) return(NoSig)  # Returns a data frame: "No significant results found!"
        
        results1= results1[,c(6,1,2,4,5)] # changed
        colnames(results1)= c("List","FDR","nGenes","GO terms or pathways","Genes") # changed
        minFDR = 0.01
        if(min(results1$FDR) > minFDR ) results1 = as.data.frame("No signficant enrichment found.") else
        results1 = results1[which(results1$FDR < minFDR),]
        
        if(dim(results1)[2] != 5) return(NoSig)  # changed
        colnames(results1)= c("Direction","adj.Pval","nGenes","Pathways", "Genes") # changed
        
        results1
    
  }


geneListGO <- function() {  
  results1 = geneListGOTable.out
  tem = input_removeRedudantSets
  if(dim(results1)[2] ==1) return(results1) else { 
    results1$adj.Pval <- sprintf("%-2.1e",as.numeric(results1$adj.Pval) )
    results1[,1] <- as.character(results1[,1])
    results1[ duplicated (results1[,1] ),1 ] <- ""  
    
    return( results1[,-5] )
    }   
  }


# a program for ploting enrichment results by highlighting the similarities among terms
# must have columns: Direction, adj.Pval   Pathways Genes
#  Direction    adj.Pval    nGenes  Pathways        Genes
#Down regulated 3.58E-59    131 Ribonucleoprotein complex biogenesis    36  Nsun5 Nhp2 Rrp15 
#Down regulated 2.55E-57    135 NcRNA metabolic process 23  Nsun5 Nhp2 Rrp15 Emg1 Ddx56 Rsl1d1 enrichmentPlot <- function( enrichedTerms){
# Up or down regulation is color-coded
# gene set size if represented by the size of marker
enrichmentPlot <- function( enrichedTerms, rightMargin=33) {
  if(class(enrichedTerms) != "data.frame") return(NULL)
  library(dendextend) # customizing tree
  
  geneLists = lapply(enrichedTerms$Genes, function(x) unlist( strsplit(as.character(x)," " )   ) )
  names(geneLists)= enrichedTerms$Pathways

  # compute overlaps percentage--------------------

  n = length(geneLists)
  w <- matrix(NA, nrow = n, ncol = n)
# compute overlaps among all gene lists
    for (i in 1:n) {
        for (j in i:n) {
            u <- unlist(geneLists[i])
            v <- unlist(geneLists[j])
            w[i, j] = length(intersect(u, v))/length(unique(c(u,v)))
        }
    }
# the lower half of the matrix filled in based on symmetry
    for (i in 1:n) 
        for (j in 1:(i-1)) 
            w[i, j] = w[j,i] 

  Terms = paste( sprintf("%-1.0e",as.numeric(enrichedTerms$adj.Pval)), 
                names(geneLists))
  rownames(w) = Terms
  colnames(w) = Terms
  par(mar=c(0,0,1,rightMargin)) # a large margin for showing 

  dend <- as.dist(1-w) %>%
    hclust (method="average") 
  ix = dend$order # permutated order of leaves

  leafType= as.factor( gsub(" .*","", enrichedTerms$Direction[ix] ) )
  if(length(unique(enrichedTerms$Direction)  ) ==2 )
    leafColors = c("green","red")  else  # mycolors
    leafColors = mycolors 
  #leafSize = unlist( lapply(geneLists,length) ) # leaf size represent number of genes
  #leafSize = sqrt( leafSize[ix] )  
  leafSize = -log10(as.numeric( enrichedTerms$adj.Pval[ix] ) ) # leaf size represent P values
  leafSize = 1.5*leafSize/max( leafSize ) + .2
  
    dend %>% 
    as.dendrogram(hang=-1) %>%
    set("leaves_pch", 19) %>%   # type of marker
    set("leaves_cex", leafSize) %>% #Size
    set("leaves_col", leafColors[leafType]) %>% # up or down genes
    plot(horiz=TRUE)
    
  #legend("top",pch=19, col=leafColors[1:2],legend=levels(leafType),bty = "n",horiz =T  )
  # add legend using a second layer
    par(lend = 1)           # square line ends for the color legend
    add_legend("top",pch=19, col=leafColors[1:2],legend=levels(leafType),bty = "n",horiz =T 

    )
  
}


# numChar=100 maximum number of characters
# n=200  maximum number of nodes
# degree.cutoff = 0    Remove node if less connected
#from PPInfer
enrich.net2 <-  function (x, gene.set, node.id, node.name = node.id, pvalue, 
    n = 50, numChar = NULL, pvalue.cutoff = 0.05, edge.cutoff = 0.05, 
    degree.cutoff = 0, edge.width = function(x) {
        5 * x^2
    }, node.size = function(x) {
        2.5 * log10(x)
    }, group = FALSE, group.color = c("green","red" ), group.shape = c("circle", 
        "square"), legend.parameter = list("topright"), show.legend = TRUE, plotting=TRUE,
    ...) 
{
    library(igraph)
    
    x <- data.frame(x, group)
    colnames(x)[length(colnames(x))] <- "Group"
    x <- x[as.numeric( x[, pvalue]) < pvalue.cutoff, ]
    x <- x[order(x[, pvalue]), ]
    n <- min(nrow(x), n)
    if (n == 0) {
        stop("no enriched term found...")
    }
    x <- x[1:n, ]
    index <- match(x[, node.id], names(gene.set))
    geneSets <- list()
    for (i in 1:n) {
        geneSets[[i]] <- gene.set[[index[i]]]
    }
    names(geneSets) <- x[, node.name]
    if (is.null(numChar)) {
        numChar <- max(nchar(as.character(x[, node.name])))
    }
    else {
        if (length(unique(substr(x[, node.name], 1, numChar))) < 
            nrow(x)) {
            numChar <- max(nchar(as.character(x[, node.name])))
            message("Note : numChar is too small.", "\n")
        }
    }
    x[, node.name] <- paste(substr(x[, node.name], 1, numChar), 
        ifelse(nchar(as.character(x[, node.name])) > numChar, 
            "...", ""), sep = "")
    w <- matrix(NA, nrow = n, ncol = n)

    for (i in 1:n) {
        for (j in i:n) {
            u <- unlist(geneSets[i])
            v <- unlist(geneSets[j])
            w[i, j] = length(intersect(u, v))/length(unique(c(u, 
                v)))
        }
    }
    list.edges <- stack(data.frame(w))
    list.edges <- cbind(list.edges[, 1], rep(x[, node.name], 
        n), rep(x[, node.name], each = n))
    list.edges <- list.edges[list.edges[, 2] != list.edges[,3], ]
    list.edges <- list.edges[!is.na(list.edges[, 1]), ]
    g <- graph.data.frame(list.edges[, -1], directed = FALSE)
    E(g)$width = edge.width(as.numeric(list.edges[, 1]))
    V(g)$size <- node.size(lengths(geneSets))
    g <- delete.edges(g, E(g)[as.numeric(list.edges[, 1]) < edge.cutoff])
    index.deg <- igraph::degree(g) >= degree.cutoff
    g <- delete.vertices(g, V(g)[!index.deg])
    x <- x[index.deg, ]
    index <- index[index.deg]
    if (length(V(g)) == 0) {
        stop("no categories greater than degree.cutoff...")
    }
    n <- min(nrow(x), n)
    x <- x[1:n, ]
    group.level <- sort(unique(group))
    pvalues <- x[, pvalue]
    for (i in 1:length(group.level)) {
        index <- x[, "Group"] == group.level[i]
        V(g)$shape[index] <- group.shape[i]
        group.pvalues <- pvalues[index]
        if (length(group.pvalues) > 0) {
            if (max(group.pvalues) == min(group.pvalues)) {
                V(g)$color[index] <- adjustcolor(group.color[i], 
                  alpha.f = 0.5)
            }
            else {
                V(g)$color[index] <- sapply(1 - (group.pvalues - 
                  min(group.pvalues))/(max(group.pvalues) - min(group.pvalues)), 
                  function(x) {
                    adjustcolor(group.color[i], alpha.f = x)
                  })
            }
        }
    }
    if(plotting) { 
        plot(g,, vertex.label.dist=1, ...)
        if (show.legend) {
            legend.parameter$legend <- group.level
            legend.parameter$text.col <- group.color
            legend.parameter$bty <- "n" 
            do.call(legend, legend.parameter)
        }}
    return(g)
}


enrichmentNetwork <- function(enrichedTerms){
    geneLists = lapply(enrichedTerms$Genes, function(x) unlist( strsplit(as.character(x)," " )   ) )
    names(geneLists)= enrichedTerms$Pathways
    enrichedTerms$Direction = gsub(" .*","",enrichedTerms$Direction )

    g <- enrich.net2(enrichedTerms, geneLists, node.id = "Pathways", numChar = 100, 
       pvalue = "adj.Pval", edge.cutoff = 0.2, pvalue.cutoff = 1, degree.cutoff = 0,
       n = 200, group = enrichedTerms$Direction, vertex.label.cex = 1, vertex.label.color = "black")

}
  
enrichmentNetworkPlotly <- function(enrichedTerms){
    geneLists = lapply(enrichedTerms$Genes, function(x) unlist( strsplit(as.character(x)," " )   ) )
    names(geneLists)= enrichedTerms$Pathways

    g <- enrich.net2(enrichedTerms, geneLists, node.id = "Pathways", numChar = 100, 
       pvalue = "adj.Pval", edge.cutoff = 0.2, pvalue.cutoff = 1, degree.cutoff = 0,
       n = 200, group = enrichedTerms$Direction, vertex.label.cex = 0.8, vertex.label.color = "black"
       ,plotting=TRUE)

    vs <- V(g)
    es <- as.data.frame(get.edgelist(g))
    Nv <- length(vs)
    Ne <- length(es[1]$V1)
    # create nodes
    L <- layout.kamada.kawai(g)
    Xn <- L[,1]
    Yn <- L[,2]
    inc = (max(Yn)-min(Yn))/50  # for shifting
    #group <- ifelse(V(g)$shape == "circle", "GO", "KEGG")
     group <- as.character(enrichedTerms$Direction)
    network <- plot_ly(x = ~Xn, y = ~Yn, type = "scatter", mode = "markers",
       marker = list(color = V(g)$color, size = V(g)$size*2,
          symbol= ~V(g)$shape, line = list(color = "gray", width = 2)),
       hoverinfo = "text", text = ~paste("</br>", group, "</br>", names(vs))) %>%
       add_annotations( x = ~Xn, y = ~Yn+inc, text = names(vs), showarrow = FALSE,
          font = list(color = "#030303", size = 12))
    # create edges
    edge_shapes <- list()
    for(i in 1:Ne)
    {
       v0 <- es[i,]$V1
       v1 <- es[i,]$V2
       index0 <- match(v0, names(V(g)))
       index1 <- match(v1, names(V(g)))
       edge_shape <- list(type = "line", line = list(color = "gray" ,
          width = E(g)$width[i]/2), x0 = Xn[index0], y0 = Yn[index0],
          x1 = Xn[index1], y1 = Yn[index1])
       edge_shapes[[i]] <- edge_shape
    }
    # create network
    axis <- list(title = "", showgrid = FALSE, showticklabels = FALSE, zeroline = FALSE)
    h <- layout(network, title = "Enrichment Network", shapes = edge_shapes,
       xaxis = axis, yaxis = axis)
    config(h, showLink = TRUE)
       
       
}
  
 #Homo sapies --> hsapiens
shortSpeciesNames <- function(tem){
     tem2 = strsplit(as.character(tem)," " )       
     return( tolower( paste0(substr(tem2[[1]][1],1,1), tem2[[1]][2]  ) ) )
}

findTaxonomyID <- function( speciesName = "mmusculus") {
    
    if(!is.null(input_speciesName) ) { # if species name is entered
       ix = match(input_speciesName, STRING10_species$official_name)
       } else if( input_selectGO2 != "ID not recognized!" )
       { # if no species is entered, try to resolve species using existing info     
            codedNames = sapply(STRING10_species$compact_name,shortSpeciesNames )
            ix = match(speciesName , codedNames)
            if(input_selectOrg != speciesChoice[[1]]) {  # if species is entered
                selectedSpecies = findSpeciesById(input_selectOrg)[1,1]
                ix = match( gsub("_.*","", selectedSpecies ), codedNames)               
            }

        } else return(NULL) 
 
    if(length(ix) == 0 | is.na(ix) ) return(NULL) 
    return(STRING10_species$species_id[ix])
}

STRINGdb_geneList <- function() {
    library(STRINGdb,verbose=FALSE)
    taxonomyID = findTaxonomyID.out

        if(is.null( taxonomyID ) ) return(NULL)
        #Intialization
        string_db <- STRINGdb$new( version="10", species=taxonomyID,
                               score_threshold=0, input_directory="" )
                
        # using expression data
        genes <- geneListData.out
        colnames(genes)[1:2]=c("gene","lfc")
        mapped <- string_db$map(genes,"gene", removeUnmappedRows = TRUE )

        up= subset(mapped, lfc>0, select="STRING_id", drop=TRUE )

        down= subset(mapped, lfc<0, select="STRING_id", drop=TRUE )     
        
        mappingRatio = nrow(mapped)/ nrow(genes)
        if(nrow(mapped) == 0) return(NULL) else
         return( list(up=up, down=down, ratio=mappingRatio, geneTable=mapped ) )
}


STRINGDB_mapping_stat <- function( ) {

        if( is.null(STRINGdb_geneList.out ) ) return("No genes mapped by STRINGdb. Please enter or double-check species name above.")
        if(! is.null(STRINGdb_geneList.out ) ) { 
            tem=paste0( 100*round(STRINGdb_geneList()$ratio,3), "% genes mapped by STRING web server.")
            if(STRINGdb_geneList()$ratio <0.3 ) tem = paste(tem, "Warning!!! Very few gene mapped. Double check if the correct species is selected.")
            return( tem  )
        }
}


stringDB_GO_enrichmentData <- function() {
        library(STRINGdb,verbose=FALSE)
                           
        tem = input_STRINGdbGO
        taxonomyID = findTaxonomyID.out
        if(is.null( taxonomyID ) ) return(NULL)     
        NoSig = as.data.frame("No significant enrichment found.")
        ####################################
        #Intialization
        string_db <- STRINGdb$new( version="10", species=taxonomyID,
                               score_threshold=0, input_directory="" )
                
        # using expression data
        genes <- selectedHeatmap.data.out$genes
        if(is.null(genes) ) return(NULL) 
        if(dim(genes)[1] <= minGenesEnrichment ) return(NoSig) # if has only few genes
        
        fc = selectedHeatmap.data.out$bar       
        # GO
        results1 <- NULL; result <- NULL
        pp <- 0
        for( i in c(1:2) ) {
            
            if( length(which(fc*i<0)) <= minGenesEnrichment) next; 
            query = rownames(genes)[which(fc*i<0)]
            
            if( length(query) <= minGenesEnrichment) next;  
            
            ids = STRINGdb_geneList.out[[i]]
            
            result <- string_db$get_enrichment( ids, category = input_STRINGdbGO, methodMT = "fdr", iea = TRUE )
            if(nrow(result) == 0 ) next; 
            if(nrow(result) > 15)  result <- result[1:15,]

            if( dim(result)[2] ==1) next;   # result could be NULL
            if(i == 1) result$direction = "Up regulated"  else result$direction = "Down regulated"
            if (pp==0 ) { results1 <- result; pp = 1;} else  results1 = rbind(results1,result)
        }

        if ( pp == 0 ) return (NoSig)
        if ( is.null( results1) ) return (NoSig)
        if( dim(results1)[2] == 1 ) return(NoSig)  # Returns a data frame: "No significant results found!"
        
        results1= results1[,c(7,5,3, 6)]
        colnames(results1)= c("List","FDR","nGenes","GO terms or pathways")
        minFDR = 0.01
        if(min(results1$FDR) > minFDR ) results1 = as.data.frame("No signficant enrichment found.") else
        results1 = results1[which(results1$FDR < minFDR),]
        
        if(dim(results1)[2] != 4) return(NoSig)
        colnames(results1)= c("Direction","adj.Pval","nGenes","Pathways")
        results1$adj.Pval <- sprintf("%-2.1e",as.numeric(results1$adj.Pval) )   
        rownames(results1)=1:nrow(results1)
        results1[ duplicated (results1[,1] ),1 ] <- ""  
        
        return( results1 )
}


stringDB_network1 <- function( geneLists = 1 ) {  # geneLists =2 for both up and down
    library(STRINGdb)
                           
        tem = input_nGenesPPI
        taxonomyID = findTaxonomyID.out
        if(is.null( taxonomyID ) ) return(NULL)     
        ####################################
        NoSig = as.data.frame("No significant enrichment found.")
        if(is.null(STRINGdb_geneList.out ) ) return(NULL)
        

        #Intialization
        string_db <- STRINGdb$new( version="10", species=taxonomyID,
                               score_threshold=0, input_directory="" )
        # only up regulated is ploted       
        for( i in c(1:geneLists) ) {
            ids = STRINGdb_geneList.out[[i]]
            if(length(ids)> input_nGenesPPI )  # n of genes cannot be more than 400
                ids <- ids[1:input_nGenesPPI]

            string_db$plot_network( ids,add_link=FALSE)

        }
}



# generates a html file containing results and also links to STRING-db
stringDB_network_link <- function(){
        library(STRINGdb,verbose=FALSE)
        taxonomyID = findTaxonomyID.out
        if(is.null( taxonomyID ) ) return(NULL)     
        
        NoSig = as.data.frame("No significant enrichment found.")
        if(is.null(STRINGdb_geneList.out ) ) return(NULL)

        #Intialization
        string_db <- STRINGdb$new( version="10", species=taxonomyID,
                               score_threshold=0, input_directory="" )
            # upregulated
           ids = STRINGdb_geneList.out[[1]]
            if(length(ids)> input_nGenesPPI )  # n of genes cannot be more than 400
                ids <- ids[1:input_nGenesPPI]

            link1 = string_db$get_link( ids)
            Pval1 = string_db$get_ppi_enrichment( ids)
            tem = "<h5> Interactive and annotated PPI networks among DEGs: <br/>  "
            tem = paste(tem, "<a href=\"", link1, "\" target=\"_blank\"> Up-regulated; </a>"  )

            # downregulated
            ids = STRINGdb_geneList()[[2]]
            if(length(ids)> input_nGenesPPI )
                ids <- ids[1:input_nGenesPPI]

            link2 = string_db$get_link( ids)
            Pval2 = string_db$get_ppi_enrichment( ids)              
            tem = paste(tem, " &nbsp  <a href=\"", link2, "\"target=\"_blank\"> Down-regulated; </a>"  )
            
            # both up and down with color code

            geneTable = STRINGdb_geneList.out$geneTable
            if(nrow(geneTable)> input_nGenesPPI ) 
                geneTable <- geneTable[1:input_nGenesPPI,]
            geneTable =  string_db$add_diff_exp_color( geneTable, logFcColStr="lfc" ) 
            payload_id <- string_db$post_payload( geneTable$STRING_id,colors=geneTable$color )
            link3 = string_db$get_link(geneTable$STRING_id, payload_id = payload_id)            
            tem = paste(tem, " &nbsp  <a href=\"", link3, "\"target=\"_blank\"> Both with fold-changes color coded.</a></h5>"  )

            tem2 = paste("<h5> PPI enrichment P values: ")  
            tem2 = paste0(tem2,"Up-regulated: ", sprintf("%-3.2e",Pval1[1]), " &nbsp Down-regulated: ", sprintf("%-3.2e",Pval2[1]),".")
            tem2 = paste(tem2, " Small P values indicate more PPIs among DEGs than background. </h5>" )
            tem = paste(tem2,tem )
            return(tem) 
}

6.9 Pathway analysis

################################################################
# Pathway analysis
################################################################

gagePathwayData <- function( ){
    library(gage,verbose=FALSE) # pathway analysis  



    ####################################
    
        myrange = c(input_minSetSize, input_maxSetSize)
        noSig = as.data.frame("No significant pathway found.")
        if( length(limma.out$topGenes) == 0 ) return(noSig)
        if(length( limma.out$comparisons)  ==1 )  
        { top1=limma.out$topGenes[[1]]  
        } else {
          top = limma.out$topGenes
          ix = match(input_selectContrast1, names(top))
          if( is.na(ix)) return (noSig)
          top1 <- top[[ix]]; 
          }
          if(dim(top1)[1] == 0 ) return (noSig)
          colnames(top1)= c("Fold","FDR")

          top1 = top1[which(top1$FDR <input_GenePvalCutoff) , ]
 

          gmt = GeneSets.out 
          if(length( GeneSets.out )  == 0)  { return(as.data.frame("No gene set found!"))}
          #converted = convertID(rownames(top1),input_selectOrg)
          #
           #gmt = readGeneSets(converted, top1, input_selectGO, input_selectOrg, myrange )
         # cat("Sets",length(gmt))

         fold = top1[,1]; names(fold) <- rownames(top1)
         
         if(input_absoluteFold) fold <- abs(fold)
         paths <- gage(fold, gsets = gmt, ref = NULL, samp = NULL)

          paths <-  rbind(paths$greater,paths$less)
         # write.csv(paths,"tem.csv")     
         # cat( dim(paths) )
          if(dim(paths)[1] < 1 | dim(paths)[2]< 6 ) return( noSig )
          top1 <- paths[,c('stat.mean','set.size','q.val')]
          colnames(top1)= c("statistic","Genes","adj.Pval")
          top1 <- top1[order(top1[,3]) ,]  
          if ( length( which( top1[,3] <=  input_pathwayPvalCutoff   ) ) == 0 )
            return( noSig)
          top1 <- top1[which(top1[,3] <=  input_pathwayPvalCutoff ) ,,drop=FALSE]
          if(dim(top1)[1] > input_nPathwayShow ) 
             top1 <- top1[1:input_nPathwayShow, ,drop=FALSE]
             
            top1 <- as.data.frame(top1)
            top1 <- cbind(rep( input_selectContrast1, dim(top1)[1]),row.names(top1), top1); 
            top1$statistic <- as.character( round(as.numeric(top1$statistic),4)); 
            top1$adj.Pval <- sprintf("%-2.1e",as.numeric(top1$adj.Pval) )
            top1[,2] <- as.character(top1[,2]);top1[,1] <- as.character(top1[,1])
            colnames(top1)[1] <- "Direction"
            p.m <- "GAGE"

            colnames(top1)[2] <- paste(p.m," analysis:", gsub("-"," vs ",input_selectContrast1 ) )
            top1[ which( top1[,3] >0),1 ] <- "Up" #gsub("-"," > ",input_selectContrast1 )
            top1[ which( top1[,3] <0),1 ] <- "Down" # gsub("-"," < ",input_selectContrast1 )
            top1 <- top1[order( top1[,1], -abs(as.numeric( top1[,3]) ) ) ,]
            top1[ duplicated (top1[,1] ),1 ] <- ""

          return( top1)
}

fgseaPathwayData <- function() {
    library(fgsea,verbose=FALSE) # fast GSEA

    noSig = as.data.frame("No significant pathway found.")
    if( length(limma.out$topGenes) == 0 ) return(noSig)
    if(length( limma.out$comparisons)  ==1 )  
    { top1=limma.out$topGenes[[1]]  
    } else {
      top = limma.out$topGenes
      ix = match(input_selectContrast1, names(top))
      if( is.na(ix)) return (noSig)
      top1 <- top[[ix]]; 
      }
      if(dim(top1)[1] == 0 ) return (noSig)
      colnames(top1)= c("Fold","FDR")
      
      # remove some genes
      top1 = top1[which(top1$FDR <input_GenePvalCutoff) , ]
      

      gmt = GeneSets.out 
     if(length( GeneSets.out )  == 0)  { return(as.data.frame("No gene set found!"))}

      #converted = convertID(rownames(top1),input_selectOrg)
      #
       #gmt = readGeneSets(converted, top1, input_selectGO, input_selectOrg, myrange )
     # cat("Sets",length(gmt))

     fold = top1[,1]; names(fold) <- rownames(top1)
     if(input_absoluteFold) fold <- abs(fold) # use absolute value of fold change, disregard direction
     paths <- fgsea(pathways = gmt, 
                  stats = fold,
                  minSize=input_minSetSize,
                  maxSize=input_maxSetSize,
                  nperm=10000)
     # paths <-  rbind(paths$greater,paths$less)
      if(dim(paths)[1] < 1  ) return( noSig )
           paths <- as.data.frame(paths)
       paths <- paths[order(-abs( paths[,5])) ,]  # sort by NES
      # paths <- paths[order( paths[,3]) ,]  # sort by FDR
      top1 <- paths[,c(1,5,7,3)]
      # rownames(top1) <- paths[,1] #paste(1:dim(paths)[1],": ",paths[,1],sep="" )
      colnames(top1)= c("Pathway", "NES","Genes","adj.Pval")
      
      if ( length( which( top1[,4] <=  input_pathwayPvalCutoff   ) ) == 0 )
        return( noSig)
      top1 <- top1[which(top1[,4] <=  input_pathwayPvalCutoff ) , ,drop=FALSE]
      if(dim(top1)[1] > input_nPathwayShow ) 
         top1 <- top1[1:input_nPathwayShow,,drop=FALSE]
     #top1 <- cbind(row.names(top1), top1); colnames(top1)[1] <-input_selectContrast1   
        top1 <- as.data.frame(top1)
        top1 <- cbind(rep( input_selectContrast1, dim(top1)[1]), top1); 
        top1[,4] = as.character( round(as.numeric(top1[,4]),4)); 
        top1$adj.Pval <- sprintf("%-2.1e",as.numeric(top1$adj.Pval) )
        top1[,1] <- as.character(top1[,1])
        colnames(top1)[1] <- "Direction"

        p.m <- "GSEA"

        colnames(top1)[2] <- paste(p.m," analysis:", gsub("-"," vs ",input_selectContrast1 ) )
        top1[ which( as.numeric( top1[,3]) >0),1 ] <- "Up" #gsub("-"," > ",input_selectContrast1 )
        top1[ which( as.numeric( top1[,3]) <0),1 ] <- "Down" #gsub("-"," < ",input_selectContrast1 )
        top1 <- top1[order( top1[,1], -abs(as.numeric( top1[,3]) ) ) ,]
        top1[ duplicated (top1[,1] ),1 ] <- ""   
        top1[,3] = as.character( round(as.numeric(top1[,3]),4));

     return( top1)
}

extract1 <- function (x) {
  words <- unlist ( strsplit(x,"_"))
  if(length( words )  <=4 ) return(gsub("_"," ",x)) else {
  words <- words[-c(1:4)]
  return( proper(paste(words,collapse = " ") ) )}
}

PGSEApathway <- function (converted,convertedData, selectOrg,GO,gmt, myrange,Pval_pathway,top){
    subtype = detectGroups(colnames(convertedData))
    library(PGSEA,verbose=FALSE)
    Pvalue = 0.01  # cut off to report in PGSEA. Otherwise NA
    #Pval_pathway = 0.2   # cut off for P value of ANOVA test  to writ to file 
    # top = 30   # number of pathways to show
    if(length(gmt) ==0 ) return( list(pg3 = NULL, best = 1 ) )
    # centering by mean
    #pg = myPGSEA (convertedData - rowMeans(convertedData),
     #            cl=gmt,range=myrange,p.value=TRUE, weighted=FALSE,nPermutation=100)

    #if( class(convertedData) != "data.frame" | class(convertedData) != "matarix") return( list(pg3 = NULL, best = 1 ) )
    #if( dim(convertedData)[2] <2 ) return( list(pg3 = NULL, best = 1 ) )
    
     pg = PGSEA (convertedData - rowMeans(convertedData),cl=gmt,range=myrange,p.value=TRUE, weighted=FALSE)
    
    pg2 = pg$results;
    pg2 = pg2[rowSums(is.na(pg2))<ncol(pg2) ,]  # remove se/wrts with all missing(non-signficant)
    if (dim(pg2)[1] < 2 ) return.out
    best = max(abs(pg2))
    
    if(length(subtype) < 4 || length(unique(subtype)) <2 ||length(unique(subtype)) == dim(convertedData)[2] ) { 
     pg2 = pg2[order(-apply(pg2,1,sd)     )   ,]
     return( list(pg3 = pg2[1:top,], best = best ) )
    } 
    
    cat("\nComputing P values using ANOVA\n");
    pathPvalue <- function ( k){
     return( summary(aov(pg2[k,]~subtype) )[[1]][["Pr(>F)"]][1] )
    }
    Pvalues = sapply(1:dim(pg2)[1], pathPvalue)
    Pvalues = p.adjust(Pvalues, "fdr")
    
    #if(min(Pvalues) > Pval_pathway ) return( list(pg3 = NULL, best = best ) )  else {  
    if(sort(Pvalues)[2] > Pval_pathway ) return( list(pg3 = NULL, best = best ) )  else {  

    NsigT = rowSums(pg$p.results<Pvalue)
    
    result=cbind( as.matrix(Pvalues),NsigT,pg2); 
    result = result[ order(result[,1])   ,] 
    
    result = result[which(result[,1] < Pval_pathway),,drop=F]
    #result = result[which(result[,2] >2)    ,]
    pg2 = result[,-2]

    # when there is only 1 left in the matrix pg2 becomes a vector
    if(sum( Pvalues<Pval_pathway) == 1) { pg3 = t( as.matrix(pg2));pg3 = rbind(pg3,pg3);} else
    { if(dim(pg2)[1] > top ) {  pg3 = pg2[1:top,]; } else {  pg3 = pg2;  } }

    rownames(pg3) = sapply(rownames(pg3) , extract1)
    a=sprintf("%-3.2e",pg3[,1])
    rownames(pg3) = paste(a,rownames(pg3),sep=" ")
    pg3 =pg3[,-1]
    
    pg3 <- pg3[order( -apply(pg3,1,sd)    ),] # sort by SD
 
    return( list(pg3 = pg3, best = best ) )
    }
 }

PGSEAplot <- function(){
  
    library(PGSEA,verbose=FALSE)

    if(input_selectGO == "ID not recognized!" ) return( NULL)

    myrange = c(input_minSetSize, input_maxSetSize)
    genes = convertedData.out
if (is.null(input_selectContrast1 ) ) return(NULL)

    gmt = GeneSets.out


    # find related samples
    iz = findContrastSamples(input_selectContrast1, colnames(convertedData.out),readSampleInfo.out,
                                        input_selectFactorsModel,input_selectModelComprions, 
                                        factorReferenceLevels.out,input_CountsDEGMethod,
                                        input_dataFileFormat  )

    
    
    genes = genes[,iz]  
    
    subtype = detectGroups(colnames(genes )) 
    if(length( GeneSets.out )  == 0)  { plot.new(); text(0.5,0.5, "No gene sets!")} else {
    result = PGSEApathway(converted.out,genes, input_selectOrg,input_selectGO,
                 GeneSets.out,  myrange, input_pathwayPvalCutoff, input_nPathwayShow    )
                     
    if( is.null(result$pg3) ) { plot.new(); text(0.5,1, "No significant pathway found!")} else 
    smcPlot(result$pg3,factor(subtype),scale = c(-max(result$pg3), max(result$pg3)), 
    show.grid = T, margins = c(3,1, 13,28), col = .rwb,cex.lab=0.5, main="Pathway Analysis:PGSEA")
    }
    
}

convertEnsembl2Entrez <- function (query,Species) { 
    querySet <- cleanGeneSet( unlist( strsplit( toupper(names( query)),'\t| |\n|\\,' )  ) )
    speciesID <- orgInfo$id[ which(orgInfo$ensembl_dataset == Species)]  # note uses species Identifying
    # idType 6 for entrez gene ID
    result <- dbGetQuery( convert,
                        paste( " select  id,ens,species from mapping where ens IN ('", paste(querySet,collapse="', '"),
                                "') AND  idType ='",idType_Entrez,"'",sep="") ) # slow
                            
    if( dim(result)[1] == 0  ) return(NULL)
    result <- subset(result, species==speciesID, select = -species)

    ix = match(result$ens,names(query)  )

    tem <- query[ix];  names(tem) = result$id
    return(tem)
  
}

ReactomePAPathwayData <- function( ){
    library(ReactomePA,verbose=FALSE) # pathway analysis
 
    ensemblSpecies <- c("hsapiens_gene_ensembl","rnorvegicus_gene_ensembl", "mmusculus_gene_ensembl",
                       "celegans_gene_ensembl","scerevisiae_gene_ensembl", "drerio_gene_ensembl", "dmelanogaster_gene_ensembl")
    ReactomePASpecies= c("human", "rat", "mouse", "celegans", "yeast", "zebrafish", "fly" )
    # cbind(ensemblSpecies,ReactomePASpecies)  # double check mapping
    
    
    myrange = c(input_minSetSize, input_maxSetSize)
    noSig = as.data.frame("No significant pathway found.")
    if( length(limma.out$topGenes) == 0 ) return(noSig)
    if(length( limma.out$comparisons)  ==1 )  
    { top1=limma.out$topGenes[[1]]  
    } else {
      top = limma.out$topGenes
      ix = match(input_selectContrast1, names(top))
      if( is.na(ix)) return (noSig)
      top1 <- top[[ix]]; 
      }
      if(dim(top1)[1] == 0 ) return (noSig)
      colnames(top1)= c("Fold","FDR")

      # remove some genes
      top1 = top1[which(top1$FDR <input_GenePvalCutoff) , ]
      
     
     fold = top1[,1]; names(fold) <- rownames(top1)
     if(input_absoluteFold) fold <- abs(fold) # use absolute value of fold change, disregard direction

       Species <- converted.out$species
      ix <- match( Species,ensemblSpecies ) 
      if(is.na(ix) ) return(as.data.frame("Species not coverted by ReactomePA package!"))
      
      fold <- convertEnsembl2Entrez (fold, Species)  
         fold <- sort(fold,decreasing =T)
      paths <- gsePathway(fold, nPerm=5000, organism = ReactomePASpecies[ix],
                minGSSize= input_minSetSize, 
                maxGSSize= input_maxSetSize,
                pvalueCutoff=0.5,
                pAdjustMethod="BH", verbose=FALSE)
      paths <- as.data.frame(paths)
      
      if(is.null(paths) ) return( noSig)
      if(dim(paths)[1] ==0 ) return( noSig)

     # paths <-  rbind(paths$greater,paths$less)
      if(dim(paths)[1] < 1  ) return( noSig )
           paths <- as.data.frame(paths)
       paths <- paths[order(-abs( paths[,5])) ,]  # sort by NES
      # paths <- paths[order( paths[,3]) ,]  # sort by FDR
      top1 <- paths[,c(2,5,3,7)]
      # rownames(top1) <- paths[,1] #paste(1:dim(paths)[1],": ",paths[,1],sep="" )
      colnames(top1)= c("Pathway", "NES","Genes","adj.Pval")
      
      if ( length( which( top1[,4] <=  input_pathwayPvalCutoff   ) ) == 0 )
        return( noSig)
      top1 <- top1[which(top1[,4] <=  input_pathwayPvalCutoff ) ,,drop=FALSE]
      if(dim(top1)[1] > input_nPathwayShow ) 
         top1 <- top1[1:input_nPathwayShow,,drop=FALSE]
     #top1 <- cbind(row.names(top1), top1); colnames(top1)[1] <-input_selectContrast1   
        top1 <- as.data.frame(top1)
        top1 <- cbind(rep( input_selectContrast1, dim(top1)[1]), top1); 
        top1[,4] = as.character( round(as.numeric(top1[,4]),4)); 
        top1$adj.Pval <- sprintf("%-2.1e",as.numeric(top1$adj.Pval) )
        top1[,1] <- as.character(top1[,1])
        colnames(top1)[1] <- "Direction"
        if(input_pathwayMethod == 1 ) p.m <- "GAGE"
        else if(input_pathwayMethod == 2 ) p.m <- "PGSEA"
        else if(input_pathwayMethod == 3 ) p.m <- "GSEA"
        else if(input_pathwayMethod == 4 ) p.m <- "PGSEA_All"
        else if(input_pathwayMethod == 5 ) p.m <- "ReactomePA"
        colnames(top1)[2] <- paste(p.m," analysis:", gsub("-"," vs ",input_selectContrast1 ) )
        top1[ which( as.numeric( top1[,3]) >0),1 ] <- "Up" #gsub("-"," > ",input_selectContrast1 )
        top1[ which( as.numeric( top1[,3]) <0),1 ] <- "Down" #gsub("-"," < ",input_selectContrast1 )
        top1 <- top1[order( top1[,1], -abs(as.numeric( top1[,3]) ) ) ,]
        top1[ duplicated (top1[,1] ),1 ] <- ""   
      top1[,3] = as.character( round(as.numeric(top1[,3]),4));
     return( top1)
}

selectedPathwayData <- function( ){
 
    if(input_sigPathways == "All") return (NULL) 
    ix <- which(names(GeneSets.out ) == input_sigPathways   ) # find the gene set
    if(length(ix) == 0 ) return(NULL)
    genes <- GeneSets.out[[ix]] # retrieve genes

    
    # find related samples  
    iz = findContrastSamples(input_selectContrast1, colnames(convertedData.out),readSampleInfo.out,
                                        input_selectFactorsModel,input_selectModelComprions, 
                                        factorReferenceLevels.out,input_CountsDEGMethod ,
                                        input_dataFileFormat )
    x <-  convertedData.out[which(rownames(convertedData.out) %in% genes), iz ]
    if( input_selectOrg != "NEW") {
        ix = match( rownames(x), allGeneInfo.out[,1])
        if( sum( is.na(allGeneInfo.out$symbol )) != dim(allGeneInfo.out )[1] )  # symbol really exists? 
           rownames(x) <- paste(rownames(x),":", as.character( allGeneInfo.out$symbol)[ix])
   }
    
    return( x )
}

selectedPathwayHeatmap <- function() {

    x = selectedPathwayData.out
    if(dim(x)[1]<=2 | dim(x)[2]<=2 ) return(NULL)
    groups = detectGroups(colnames(x))

    # this will cutoff very large values, which could skew the color 
    x=as.matrix(x)-apply(as.matrix(x),1,mean)
    cutoff = median(unlist(x)) + 3*sd (unlist(x)) 
    x[x>cutoff] <- cutoff
    cutoff = median(unlist(x)) - 3*sd (unlist(x)) 
    x[x< cutoff] <- cutoff
    
    # sometimes, a gene can be all zero in selected samples.
    x <- x[which(apply(x,1,sd)>0) ,]
    
    lmat = rbind(c(5,4),c(0,1),c(3,2))
    lwid = c(1.5,6)
    lhei = c(.5,.2,8)
    
    
    if( dim(x)[1]>200) 
    heatmap.2(x, distfun = dist2,hclustfun=hclust2,
     col=heatColors[as.integer(input_heatColors1),], density.info="none", trace="none", scale="none", keysize=.5
    ,key=T, symkey=F
    ,dendrogram = "row"
    ,ColSideColors=mycolors[ groups]
    ,labRow=""
    ,margins=c(10,15)
    ,srtCol=45
    ,lmat = lmat, lwid = lwid, lhei = lhei
    #,main ="Title"
    )

    if( dim(x)[1]<=200) 
    heatmap.2(x, distfun = dist2,hclustfun=hclust2,
     col=heatColors[as.integer(input_heatColors1),], density.info="none", trace="none", scale="none", keysize=.5
    ,key=T, symkey=F, 
    dendrogram = "row",
    ,labRow=gsub(".*:","",rownames(x))
    ,ColSideColors=mycolors[ groups]
    ,margins=c(10,15)
    ,cexRow=1.2
    ,srtCol=45
    ,lmat = lmat, lwid = lwid, lhei = lhei
    #,main ="Title"
    )
}

convertEnsembl2KEGG <- function (query,Species) {  # not working
    querySet <- cleanGeneSet( unlist( strsplit( toupper(names( query)),'\t| |\n|\\,' )  ) )
    speciesID <- orgInfo$id[ which(orgInfo$ensembl_dataset == Species)]  # note uses species Identifying
    # idType 6 for entrez gene ID
    result <- dbGetQuery( convert,
                        paste( " select  id,ens,species from mapping where ens IN ('", paste(querySet,collapse="', '"),
                                "') AND  idType ='",idType_KEGG,"'",sep="") )   # slow
                            
    if( dim(result)[1] == 0  ) return(NULL)
    result <- subset(result, species==speciesID, select = -species)

    ix = match(result$ens,names(query)  )

    tem <- query[ix];  names(tem) = result$id
    return(tem)  
}

keggPathwayID <- function (pathwayDescription, Species, GO,selectOrg) {
    ix = grep(Species,gmtFiles)

    if (length(ix) == 0 ) {return(NULL)}
    
    # If selected species is not the default "bestMatch", use that species directly
    if(selectOrg != speciesChoice[[1]]) {  
        ix = grep(findSpeciesById(selectOrg)[1,1], gmtFiles )
        if (length(ix) == 0 ) {return(NULL )}
        totalGenes <- orgInfo[which(orgInfo$id == as.numeric(selectOrg)),7]
    }
    pathway <- dbConnect(sqlite,gmtFiles[ix],flags=SQLITE_RO)
    
    # change Parkinson's disease to Parkinson\'s disease    otherwise SQL 
    pathwayDescription <- gsub("\'","\'\'",pathwayDescription)
                            
    pathwayInfo <- dbGetQuery( pathway, paste( " select * from pathwayInfo where description =  '", 
                            pathwayDescription,   "' AND name LIKE '",GO,"%'",sep="") )     
    pathwayInfo <- dbGetQuery( pathway, paste( " select * from pathwayInfo where description =  '", 
                            pathwayDescription,   "'",sep="") ) 
                            
    dbDisconnect(pathway);
    if(dim(pathwayInfo)[1] != 1 ) {return(NULL) }
    tem = gsub(".*:","",pathwayInfo[1,2])  
    return( gsub("_.*","",tem) )
}

KeggImage <- function(){

   # First generate a blank image. Otherwise return(NULL) gives us errors.
    outfile <- tempfile(fileext='.png')
    png(outfile, width=400, height=300)

    frame()
    dev.off()
    blank <- list(src = outfile,
         contentType = 'image/png',
         width = 400,
         height = 300,
         alt = " ") 

    if(is.null( input_selectGO ) ) return(outfile)
    if(input_selectGO != "KEGG") return(outfile)
    if(is.null(gagePathwayData.out ) ) return(outfile)
    if(is.null( input_sigPathways))  return (outfile) 
    # if( is.null(selectedPathwayData()) ) return(blank)

    library(pathview)


    if (is.null(input_selectContrast1 ) ) return(outfile)
    
    if(input_sigPathways == "All") return (outfile) 
    
    if( length(limma.out$topGenes) == 0 ) return(outfile)

    # get fold change
    if(length( limma.out$comparisons)  ==1 )  
    { top1=limma.out$topGenes[[1]]  
    } else {
      top = limma.out$topGenes
      ix = match(input_selectContrast1, names(top))
      if( is.na(ix)) return (outfile)
      top1 <- top[[ix]]; 
      }
      if(dim(top1)[1] == 0 ) return (outfile)
     # cat("here5")
      colnames(top1)= c("Fold","FDR")
      Species <- converted.out$species
      
     fold = top1[,1]; names(fold) <- rownames(top1)
     fold <- convertEnsembl2Entrez(fold,Species)
     
     keggSpecies <- as.character( keggSpeciesID[which(keggSpeciesID[,1] == Species),3] )
     
     if(nchar( keggSpecies) <=2 ) return(outfile) # not in KEGG

     # kegg pathway id

    pathID = keggPathwayID(input_sigPathways, Species, "KEGG",input_selectOrg)
    
    #cat("\nhere5  ",keggSpecies, " ",Species," ",input_sigPathways, "pathID:",pathID,"End", fold[1:5],names(fold)[1:5],"\n")
    #cat("\npathway:",is.na(input_sigPathways))
    #cat("\n",fold[1:5],"\n",keggSpecies,"\n",pathID)
    if(is.null(pathID) ) return(blank) # kegg pathway id not found.
    if(nchar(pathID)<3 ) return(blank)
    randomString <- gsub(".*file","",tempfile()) 

    outfile <- paste(pathID,".",randomString,".png",sep="")
    
        try( 
         pv.out <- pathview(gene.data = fold, pathway.id = pathID, out.suffix = randomString, species = keggSpecies, kegg.native=TRUE)
        )
        
    
    return( outfile )
}

# list of pathways with details
pathwayListData  <- function(){
    
    pathways = NULL
    if( input_pathwayMethod == 1)  
        if(!is.null(gagePathwayData.out)) 
            if(dim(gagePathwayData.out)[2] >1) { 
                pathways <- gagePathwayData.out
                colnames(pathways)[2] ="Pathways";  
                colnames(pathways)[4] ="nGenes"; 
            }
    if( input_pathwayMethod == 3) 
        if(!is.null(fgseaPathwayData.out)) 
            if(dim(fgseaPathwayData.out)[2] >1) {
                pathways <- fgseaPathwayData.out
                colnames(pathways)[2] ="Pathways";  
                colnames(pathways)[4] ="nGenes"; 
            }
    
    if( input_pathwayMethod == 2) 
        if(!is.null(PGSEAplot.data.out)) 
            if(dim(PGSEAplot.data.out)[2] >1) {
                pathways <- as.data.frame( PGSEAplot.data.out)
                pathways$Pathways = substr(rownames(pathways),10, nchar( rownames(pathways)) )
                pathways$adj.Pval = gsub(" .*","", rownames(pathways))
                pathways$Direction ="Diff"
                
                }
    if( input_pathwayMethod == 4) 
        if(!is.null(PGSEAplotAllSamples.data.out)) 
            if(dim(PGSEAplotAllSamples.data.out)[2] >1) {
                pathways <- as.data.frame( PGSEAplotAllSamples.data.out)
                pathways$Pathways = substr(rownames(pathways),10, nchar( rownames(pathways)) )
                pathways$adj.Pval = gsub(" .*","", rownames(pathways))
                pathways$Direction ="Diff"
                
                }           
    if( is.null( pathways) ) return(NULL)
    
    # if no gene set data, return pathway list
    if(is.null(GeneSets.out ) ) return(pathways) 
    

    pathways$adj.Pval = as.numeric(pathways$adj.Pval)
    
    if(nrow(pathways)>1)
    for( i in 2:nrow(pathways) )
        if(nchar(pathways$Direction[i]) <=1)
            pathways$Direction[i] = pathways$Direction[i-1]
            
    # gene symbol matching symbols 
    probeToGene = NULL
    if( input_selectGO != "ID not recognized!" & input_selectOrg != "NEW")
    if(sum(is.na( allGeneInfo.out$symbol ) )/ dim( allGeneInfo.out )[1] <.5 ) { # if more than 50% genes has symbol
        probeToGene = allGeneInfo.out[,c("ensembl_gene_id","symbol")]
        probeToGene$symbol = gsub(" ","",probeToGene$symbol)

        ix = which( is.na(probeToGene$symbol) |
                    nchar(probeToGene$symbol)<2 | 
                    toupper(probeToGene$symbol)=="NA" |  
                    toupper(probeToGene$symbol)=="0"  )             
        probeToGene[ix,2] = probeToGene[ix,1]  # use gene ID

    }       

    
    pathways$Genes =""
    # looking up genes for each pathway
    for(i in 1:nrow(pathways) ){ 
        ix <- which(names(GeneSets.out ) == pathways$Pathways[i]   ) # find the gene set
        if(length(ix) != 0 ) { 
            genes <- GeneSets.out[[ix]] # retrieve genes
            
            if(!is.null(probeToGene) ) { 
                iy = match(genes,probeToGene[,1])
                genes = probeToGene[iy,2]
            }
            
            pathways$Genes[i] = paste(genes, collapse=" ")
        }
    }
    return(pathways)


}

6.10 Genome-wide view

################################################################
# Genome-wide view
################################################################
 
genomePlotly <- function( ){
        # default plot
        fake = data.frame(a=1:3,b=1:3)
        p <- ggplot(fake, aes(x = a, y = b)) +
                             geom_blank() + ggtitle("No genes with position info.") +
                             theme(axis.title.x=element_blank(),axis.title.y=element_blank())
                             
        if(length( limma.out$comparisons)  ==1 )  {
            top1=limma.out$topGenes[[1]]  
        } else {
          top = limma.out$topGenes
          ix = match(input_selectContrast2, names(top))
          if( is.na(ix)) return (ggplotly(p))
          top1 <- top[[ix]]; 
        }
          if(dim(top1)[1] == 0 ) return (ggplotly(p))
          colnames(top1)= c("Fold","FDR")
          
         # write.csv(merge(top1,allGeneInfo(), by.x="row.names",by.y="ensembl_gene_id"  ),"tem.csv"  )
         x <- merge(top1,allGeneInfo.out, by.x="row.names",by.y="ensembl_gene_id"  )

         # if no chromosomes found. For example if user do not convert gene IDs.
         if( dim(x)[1] >5  ) { 
        
             x <- x[order(x$chromosome_name,x$start_position),]
     
             tem = sort( table( x$chromosome_name), decreasing=T)

             chromosomes <- names( tem[tem >= 5 ] )  # chromosomes with less than 100 genes are excluded
             if(length(chromosomes) > 50) chromosomes <- chromosomes[1:50]  # at most 50 chromosomes
             chromosomes <- chromosomes[ nchar(chromosomes)<=12] # chr. name less than 10 characters
             chromosomes = chromosomes[order(as.numeric(chromosomes) ) ]
             # chromosomes = chromosomes[!is.na(as.numeric(chromosomes) ) ]
             chromosomesNumbers = as.numeric(chromosomes)

             # convert chr.x to numbers      
              j = max( chromosomesNumbers,na.rm=T) 
              for( i in 1:length( chromosomes)) {
               if ( is.na(chromosomesNumbers[i]) ) 
               { chromosomesNumbers[i] <- j+1; j <- j+1; }
             }
              
             x <- x[which(x$chromosome_name %in% chromosomes   ),]
             x <- droplevels(x)
             
            # find the number coding for chromosome 
             getChrNumber <- function (chrName){
             return( chromosomesNumbers[ which( chromosomes == chrName)] )
             }
              x$chrNum = 1 # numeric coding
              x$chrNum <- unlist( lapply( x$chromosome_name, getChrNumber) )
             
             x$Row.names <- as.character( x$Row.names)
         
             # if symbol is missing use Ensembl id
             x$symbol = as.character(x$symbol)   
             ix = which(is.na(x$symbol))
             ix2 = which(nchar(as.character(x$symbol))<= 2 )
             ix3 = which( duplicated(x$symbol))
             ix = unique( c(ix,ix2,ix3))
             x$symbol[ix] <- x$Row.names[ix] 
                         
         
             ##################################
             # plotting
            # x = read.csv("tem_genome.csv")
            x = x[!is.na(x$chromosome_name),]
            x = x[!is.na(x$start_position),]    
            # only keep significant genes
            #x = x[which(x$FDR<input_limmaPval),]
            # x = x[which(abs(x$Fold) > log2( input_limmaFC)),]
            
            ix = which( (x$FDR< as.numeric(input_limmaPvalViz)) & (abs(x$Fold) > as.numeric(input_limmaFCViz) ) )
            
            if (length(ix) > 5) { 

                x = x[ix,]      
                
                x$start_position = x$start_position/1000000 # Mbp
                chrD = 20 # distance between chrs.
                foldCutoff = 3   # max log2 fold 
                
                x$Fold = x$Fold / sd(x$Fold)  # standardize fold change
                
                x$Fold[which(x$Fold > foldCutoff )] = foldCutoff   # log2fold within -5 to 5
                x$Fold[which(x$Fold <   -1*foldCutoff )] = -1*foldCutoff 
                x$Fold = 4* x$Fold
                

                
                x$y = x$chrNum*chrD + x$Fold
                chrLengthTable = aggregate(start_position~chrNum, data=x,max )
                chrTotal = dim(chrLengthTable)[1]   
                x$R = as.factor(sign(x$Fold))
                
                colnames(x)[ which(colnames(x) == "start_position")] = "x"

                p= ggplot(x, aes(x = x, y = y, colour = R, text = symbol ) ) + geom_point(shape = 20, size = .2)
                
                    #label y with chr names
                p <- p +  scale_y_continuous(labels = paste("chr",chromosomes[chrLengthTable$chrNum],sep=""), breaks = chrD* (1:chrTotal), limits = c(0, 
                    chrD*chrTotal + 5) )
                # draw horizontal lines for each chr.
                for( i in 1:dim(chrLengthTable)[1] )
                    p = p+ annotate( "segment",x = 0, xend = chrLengthTable$start_position[i],
                        y = chrLengthTable$chrNum[i]*chrD, yend = chrLengthTable$chrNum[i]*chrD)
                # change legend     http://ggplot2.tidyverse.org/reference/scale_manual.html
                p=p+scale_colour_manual(name="",   # customize legend text
                    values=c("blue", "red"),
                    breaks=c("1","-1"),
                    labels=c("Up", "Dn"))   
                p = p + xlab("Position on chrs. (Mbp)") +    theme(axis.title.y=element_blank())                     
                p= p + theme(legend.position="none")
                # p <- ggplot(mtcars, aes(x=hp, y=mpg)) + geom_point(shape=20, size=17)
               # p=p+ geom_smooth(method = "lm",se=FALSE)
                #p+ geom_line(aes(y=rollmean(y,7, na.pad=TRUE)))
              
               # Customize hover text https://cran.r-project.org/web/packages/plotly/plotly.pdf
               # style(ggplotly(p),hoverinfo="text")  # not working
              }  # if no genes else
            
        }
        ggplotly(p)
}

# results from PREDA
genomePlotData <- function( ){
    if (is.null(input_selectContrast2 ) ) return(NULL)
    
    if( length(limma.out$topGenes) == 0 ) return(NULL)
    
    if(length( limma.out$comparisons)  ==1 )  
    { top1=limma.out$topGenes[[1]]  
    } else {
      top = limma.out$topGenes
      ix = match(input_selectContrast2, names(top))
      if( is.na(ix)) return (NULL)
      top1 <- top[[ix]]; 
      }
      if(dim(top1)[1] == 0 ) return (NULL)
      colnames(top1)= c("Fold","FDR")
      
     # write.csv(merge(top1,allGeneInfo(), by.x="row.names",by.y="ensembl_gene_id"  ),"tem.csv"  )
     x <- merge(top1,allGeneInfo.out, by.x="row.names",by.y="ensembl_gene_id"  )
     
     x <- x[order(x$chromosome_name,x$start_position),]
     tem = sort( table( x$chromosome_name), decreasing=T) 
     chromosomes <- names( tem[tem > 100 ] )  # chromosomes with less than 100 genes are excluded
     if(length(chromosomes) > 50) chromosomes <- chromosomes[1:50]  # at most 50 chromosomes

     chromosomes = chromosomes[order(as.numeric(chromosomes) ) ]
     # chromosomes = chromosomes[!is.na(as.numeric(chromosomes) ) ]
     chromosomesNumbers = as.numeric(chromosomes)
     # convert chr.x to numbers
      j = max( chromosomesNumbers,na.rm=T) 
      for( i in 1:length( chromosomes)) {
       if ( is.na(chromosomesNumbers[i]) ) 
       { chromosomesNumbers[i] <- j+1; j <- j+1; }
     }
      
     x <- x[which(x$chromosome_name %in% chromosomes   ),]
     x <- droplevels(x)
     
    # find the number coding for chromosome 
     getChrNumber <- function (chrName){
     return( chromosomesNumbers[ which( chromosomes == chrName)] )
     }
      x$chrNum = 1 # numeric coding
      x$chrNum <- unlist( lapply( x$chromosome_name, getChrNumber) )
     
     x$Row.names <- as.character( x$Row.names)
     fold = x$Fold; names(fold) = x$Row.names
     
    # write.csv(x,"tem.csv")
     x <- x[!duplicated(x$Row.names),]
     #rownames(x) = x$Row.names
     

    GEanalysisResults <- genomePlotDataPre.out; 
    
    genomic_regions_UP<-PREDAResults2GenomicRegions(GEanalysisResults, qval.threshold=input_RegionsPvalCutoff, smoothStatistic.tail="upper", smoothStatistic.threshold= input_StatisticCutoff)
    genomic_regions_DOWN<-PREDAResults2GenomicRegions(GEanalysisResults, qval.threshold=input_RegionsPvalCutoff, smoothStatistic.tail="lower", smoothStatistic.threshold= -1 *input_StatisticCutoff)

    if( is.null(genomic_regions_UP$Test) &&  is.null(genomic_regions_UP$Test)  ) return(-1) # no significant regions
    
    regions <- 0 
    if( !is.null(genomic_regions_UP$Test)) { 
    dataframe_UPregions<-GenomicRegions2dataframe(genomic_regions_UP[[1]])
    dataframe_UPregions$Regulation <- "Up"
    regions = dataframe_UPregions
    }

    if( !is.null(genomic_regions_DOWN$Test) ){ 
    dataframe_DOWNregions<-GenomicRegions2dataframe(genomic_regions_DOWN[[1]])
    dataframe_DOWNregions$Regulation <- "Down"
    if ( class(regions) != "data.frame" ) regions <- dataframe_DOWNregions else # if UP regions is NULL
     regions = rbind(regions,dataframe_DOWNregions  )
    }

    if( class(regions) != "data.frame") return(-1)

    Regions <- regions[,c(4,1:3)] 
    Regions <- Regions[which(Regions$end != Regions$start),]
    Regions$size = round((Regions$end - Regions$start )/1000000,3)
    Regions$chr <- chromosomes[ Regions$chr ] # convert from chr. number to names

    # find the gene indices in the up or down regulated regions
    regulatedGenes <- function (i) {
    ix =  which( Regions$chr == x$chromosome_name[i] &
         Regions$start < x$start_position[i] #&(Regions$end > x$start_position[i] + x$genomeSpan[i])
         &(Regions$end > x$start_position[i] )) # if the start position is within the region
    if( length(ix) == 0 | length(ix) >1  ) return(NA) else return( ix )
    }

    regionID = unlist( lapply(1:dim(x)[1], regulatedGenes) )
    x1 = x[which(!is.na(regionID)),]
    regionID = regionID[!is.na(regionID)]
    x1 = cbind(regionID,Regions[regionID, ], x1[,c('symbol', 'Row.names', 'Fold','FDR', 'band','start_position')])
    x1 = x1[order(x1$regionID,x1$start_position ), ]    
    colnames(x1)[8] = "Ensembl"; colnames(x1)[4] = "Region.Start";colnames(x1)[5] = "Region.End"; colnames(x1)[12] = "Gene.Start"

    # number of genes
    tem = table(x1$regionID)
    Regions$Ngenes = 0
    Regions$Ngenes[as.integer(names(tem) ) ] <- tem

    # cytoband per region
    tem = unique(x1[,c('regionID','band')])
    tem$band = gsub("\\..*","",tem$band)
    tem = unique(tem)
    Regions$band =""
    Regions$ID <- 1:dim(Regions)[1]
    for ( i in 1:dim(Regions)[1] )
      Regions$band[i] <- paste( tem[ which( tem[,1] == Regions$ID[i]),2],  collapse=";" )

    # Genes
    Regions$Genes <- ""
    for ( i in 1:dim(Regions)[1] )
      Regions$Genes[i] <- paste( x1$symbol[ which( x1[,1] == Regions$ID[i])],  collapse=" " )
    return( list(mainResult = GEanalysisResults, Regions = Regions, Genes = x1,legend.x=max(x$start_position)*.6, legend.y=max(chromosomesNumbers)-3  ) )
}

genomePlot <- function(){
    library(PREDA,verbose=FALSE)  # showing expression on genome
    library(PREDAsampledata,verbose=FALSE) 
    library(hgu133plus2.db,verbose=FALSE)
    
  if( class(genomePlotData.out) != "list" ) { plot.new(); text(0.2,1, "No significant regions found!")} else {
   GEanalysisResults <- genomePlotData.out$mainResult;
     if( is.null( GEanalysisResults ) ) return(NULL)    
    genomic_regions_UP<-PREDAResults2GenomicRegions(GEanalysisResults, qval.threshold=input_RegionsPvalCutoff, smoothStatistic.tail="upper", smoothStatistic.threshold= -1 *input_StatisticCutoff)
    genomic_regions_DOWN<-PREDAResults2GenomicRegions(GEanalysisResults, qval.threshold=input_RegionsPvalCutoff, smoothStatistic.tail="lower", smoothStatistic.threshold= -1 *input_StatisticCutoff)
     
    checkplot<-genomePlot(GEanalysisResults, genomicRegions=c(genomic_regions_UP, genomic_regions_DOWN), grouping=c(1, 1), scale.positions="Mb", region.colors=c("red","blue"))
    legend(x=genomePlotData.out$legend.x, y=genomePlotData.out$legend.y , legend=c("UP", "DOWN"), fill=c("red","blue"))
    } #if  else
}

# pre-calculating PREDA, so that changing FDR cutoffs does not trigger entire calculation
genomePlotDataPre <- function( ){
    library(PREDA,verbose=FALSE)  # showing expression on genome
    library(PREDAsampledata,verbose=FALSE) 
    library(hgu133plus2.db,verbose=FALSE)
    if (is.null(input_selectContrast2 ) ) return(NULL)
    
    if( length(limma.out$topGenes) == 0 ) return(NULL)
    
    if(length( limma.out$comparisons)  ==1 )  
    { top1=limma.out$topGenes[[1]]  
    } else {
      top = limma.out$topGenes
      ix = match(input_selectContrast2, names(top))
      if( is.na(ix)) return (NULL)
      top1 <- top[[ix]]; 
      }
      if(dim(top1)[1] == 0 ) return (NULL)
      colnames(top1)= c("Fold","FDR")
      
     # write.csv(merge(top1,allGeneInfo(), by.x="row.names",by.y="ensembl_gene_id"  ),"tem.csv"  )
     x <- merge(top1,allGeneInfo.out, by.x="row.names",by.y="ensembl_gene_id"  )
     
     #hist(top1[,1])
     
     ########## PREDA
    infofile <- system.file("sampledata", "GeneExpression", "sampleinfoGE_PREDA.txt", package = "PREDAsampledata")
    sampleinfo<-read.table(infofile, sep="\t", header=TRUE)
    head(sampleinfo)

    data(ExpressionSetRCC)
    GEstatisticsForPREDA<-statisticsForPREDAfromEset(ExpressionSetRCC, statisticType="tstatistic", referenceGroupLabel="normal", classVector=sampleinfo[,"Class"])
    analysesNames(GEstatisticsForPREDA)
    
    GEGenomicAnnotations<-eset2GenomicAnnotations(ExpressionSetRCC, retain.chrs=1:22) # needs hgu133plus2.db package
    GEGenomicAnnotationsForPREDA<-GenomicAnnotations2GenomicAnnotationsForPREDA(GEGenomicAnnotations, reference_position_type="median")
    GEDataForPREDA<-MergeStatisticAnnotations2DataForPREDA(GEstatisticsForPREDA, GEGenomicAnnotationsForPREDA, sortAndCleanNA=TRUE)

    #x = read.csv("PREDA_test.csv")

    #x <- x[,-1] # remove the row index, not needed in Shiny

     x <- x[order(x$chromosome_name,x$start_position),]
     tem = sort( table( x$chromosome_name), decreasing=T) 
     chromosomes <- names( tem[tem > 100 ] )  # chromosomes with less than 100 genes are excluded
     if(length(chromosomes) > 50) chromosomes <- chromosomes[1:50]  # at most 50 chromosomes

     chromosomes = chromosomes[order(as.numeric(chromosomes) ) ]
     # chromosomes = chromosomes[!is.na(as.numeric(chromosomes) ) ]
     chromosomesNumbers = as.numeric(chromosomes)
     # convert chr.x to numbers
      j = max( chromosomesNumbers,na.rm=T) 
      for( i in 1:length( chromosomes)) {
       if ( is.na(chromosomesNumbers[i]) ) 
       { chromosomesNumbers[i] <- j+1; j <- j+1; }
     }
      
     x <- x[which(x$chromosome_name %in% chromosomes   ),]
     x <- droplevels(x)
     
    # find the number coding for chromosome 
     getChrNumber <- function (chrName){
     return( chromosomesNumbers[ which( chromosomes == chrName)] )
     }
      x$chrNum = 1 # numeric coding
      x$chrNum <- unlist( lapply( x$chromosome_name, getChrNumber) )
     
     x$Row.names <- as.character( x$Row.names)
     fold = x$Fold; names(fold) = x$Row.names
     
    # write.csv(x,"tem.csv")
     x <- x[!duplicated(x$Row.names),]
     #rownames(x) = x$Row.names
     
     myData <- GEDataForPREDA
      
     myData@position <- as.integer( x$start_position+ x$genomeSpan/2 )
     myData@ids <- as.character( x$Row.names)
     myData@chr <- as.integer( x$chrNum )
     myData@start <- x$start_position
     myData@end <- x$start_position + x$genomeSpan
     myData@strand <- rep(1,dim(x)[1])
     myData@chromosomesLabels <- chromosomes
     myData@chromosomesNumbers <- as.integer( chromosomesNumbers )
     myData@statistic <- as.matrix( x[,2,drop=FALSE] )
     myData@analysesNames <- "Test"
     myData@testedTail <- "both"
     myData@optionalAnnotations <- as.matrix( x[,1:2,drop=FALSE] )
     myData@optionalAnnotationsHeaders <- c("a","b")
     
    set.seed(2)
    GEanalysisResults<-PREDA_main(myData,nperms = PREDA_Permutations)

    return( GEanalysisResults )
}

6.11 Biclustering

################################################################
# Biclustering
################################################################
 
biclustering <- function( ){
            library(biclust,verbose=FALSE)

            if(input_biclustMethod == "BCQU()" )
                library(QUBIC,verbose=FALSE) # have trouble installing on Linux
            if(input_biclustMethod == "BCUnibic()" )
                library(runibic,verbose=FALSE) # 
            
        
            x <- convertedData.out
            n=input_nGenesBiclust   
            if(n>dim(x)[1]) n = dim(x)[1] # max as data
            if(n<10) n = 10 
            if(n> 2000 ) n = 2000           
            x=as.matrix(x[1:n,])-apply(x[1:n,],1,mean)
            
            if( input_biclustMethod == "BCXmotifs()"  )
                x<-discretize(x)

            #res <- biclust::biclust(as.matrix( x), method = BCQU()) 
            runR = paste( "res <- biclust::biclust(as.matrix( x), method =", input_biclustMethod ,")" )
            eval(parse(text = runR ) )
            

            return(list( x=x, res=res)  )    
}


biclustHeatmap <- function ( ){
            res = biclustering()$res
            if( res@Number == 0 ) { plot.new(); text(0.5,0.5, "No cluster found!")} else {
        
                x = biclust::bicluster(biclustering.out$x, res, as.numeric( input_selectBicluster)  )[[1]]

                    par(mar = c(5, 4, 1.4, 0.2))

                if( dim(x)[1] <=30 )
                    heatmap.2(x,  Rowv =T,Colv=F, dendrogram ="none",
                    col=heatColors[as.integer(input_heatColors1),], density.info="none", trace="none", scale="none", keysize=.2
                    ,key=F, #labRow = T,
                    ,margins = c(8, 24)
                    ,cexRow=1
                    #,srtCol=45
                    ,cexCol=1.  # size of font for sample names
                    ) else      
                    heatmap.2(x,  Rowv =T,Colv=F, dendrogram ="none",
                    col=heatColors[as.integer(input_heatColors1),], density.info="none", trace="none", scale="none", keysize=.2
                    ,key=F, labRow = F,
                    ,margins = c(8, 4)
                    ,cexRow=1
                    #,srtCol=45
                    ,cexCol=1.  # size of font for sample names

                    )               
            }
}


geneListBclustGO <- function( ){        
        
            res = biclustering.out$res
            if( res@Number == 0 ) return(as.data.frame("No clusters found!") ) 
            x = biclust::bicluster(biclustering.out$x, res, as.numeric( input_selectBicluster)  )[[1]]
            
            if( dim(x)[1]  <= minGenesEnrichment) return(NoSig) 
            query = rownames(x)
            

            result <- findOverlapGMT( query, GeneSets.out,1) 
            
                
            result$Genes = "Up regulated"
            

            results1 = result
            if ( is.null( results1) ) return (NoSig)
            if( dim(results1)[2] <5 ) return(NoSig)  # Returns a data frame: "No significant results found!"

            results1= results1[,c(5,1,2,4)]
            colnames(results1)= c("List","FDR","Genes","GO terms or pathways")
            minFDR = 0.01
            if(min(results1$FDR) > minFDR ) results1 = as.data.frame("No signficant enrichment found.") else
            results1 = results1[which(results1$FDR < minFDR),]

            
            if(dim(results1)[2] != 4) return(NoSig)
            colnames(results1)= c("Direction","adj.Pval","Genes","Pathways")
            
            results1$adj.Pval <- sprintf("%-2.1e",as.numeric(results1$adj.Pval) )
            results1[,1] <- as.character(results1[,1])
            tem <- results1[,1]

            results1[ duplicated (results1[,1] ),1 ] <- ""
            
            results1[,-1]           

}

6.12 Co-expression network

################################################################
# Co-expression network
################################################################

wgcna <- function ( ){
            #http://pklab.med.harvard.edu/scw2014/WGCNA.html
            library(WGCNA)

            x <- convertedData.out
            n=input_nGenesNetwork   
            if(n>dim(x)[1]) n = dim(x)[1] # max as data
            if(n<50) return(NULL)
            if(dim(x)[2] <4) return(NULL)
            if(n> maxGeneWGCNA ) n = maxGeneWGCNA           
            #x=as.matrix(x[1:n,])-apply(x[1:n,],1,mean)
            
            datExpr=t(x[1:n,])

            subGeneNames=colnames(datExpr)
            
            #Choosing a soft-threshold to fit a scale-free topology to the network
            powers = c(c(1:10), seq(from = 12, to=20, by=2));
            sft=pickSoftThreshold(datExpr,dataIsExpr = TRUE,powerVector = powers,corFnc = cor,corOptions = list(use = 'p'),networkType = "unsigned")


            softPower = input_mySoftPower;

            #calclute the adjacency matrix
            adj= adjacency(datExpr,type = "unsigned", power = softPower);

            #turn adjacency matrix into topological overlap to minimize the effects of noise and spurious associations
            TOM=TOMsimilarityFromExpr(datExpr,networkType = "unsigned", TOMType = "unsigned", power = softPower);
            colnames(TOM) =subGeneNames
            rownames(TOM) =subGeneNames
            
            ##########################################
            # module detection

            library(flashClust,verbose=FALSE)

            geneTree = flashClust(as.dist(1-TOM),method="average");

            # Set the minimum module size
            minModuleSize = input_minModuleSize;

            # Module identification using dynamic tree cut
            dynamicMods = cutreeDynamic(dendro = geneTree,  method="tree", minClusterSize = minModuleSize);
            #dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM, method="hybrid", deepSplit = 2, pamRespectsDendro = FALSE, minClusterSize = minModuleSize);
            # table(dynamicMods)
            
            dynamicColors = labels2colors(dynamicMods)
            #table(dynamicColors)

            #discard the unassigned genes, and focus on the rest  # this causes problems
            #restGenes= (dynamicColors != "grey")
            #diss1=1-TOMsimilarityFromExpr(datExpr[,restGenes], power = softPower)
            #colnames(diss1) =rownames(diss1) =subGeneNames[restGenes]
            
            #colorCode = as.character(dynamicColors[restGenes])

            moduleInfo = cbind( subGeneNames, dynamicColors, dynamicMods)
            moduleInfo = moduleInfo[which(moduleInfo[,2] != "grey") ,] # remove genes not in any modules
            moduleInfo = moduleInfo[order(moduleInfo[,3]),] # sort
            
            n.modules = length(unique(dynamicColors) ) -1 ; nGenes = dim(moduleInfo)[1] 
            
            return(list(x = t(datExpr),powers=powers,sft=sft, TOM = TOM, dynamicColors = dynamicColors, moduleInfo = moduleInfo,n.modules=n.modules, nGenes =nGenes) )
}


softPower <- function(){

        ####################################  
        
        sft = wgcna.out$sft;
        powers=wgcna.out$powers;
        # Plot the results
        #sizeGrWindow(9, 5)
        par(mfrow = c(1,2));
        cex1 = 0.9;

        # Scale-free topology fit index as a function of the soft-thresholding power
        plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit, signed R^2",type="n", main = paste("Scale independence"));
        text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],labels=powers,cex=cex1,col="red");

        # Red line corresponds to using an R^2 cut-off
        abline(h=0.80,col="red")

        # Mean connectivity as a function of the soft-thresholding power
        plot(sft$fitIndices[,1], sft$fitIndices[,5],xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",main = paste("Mean connectivity"))
        text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
}


modulePlot <- function(){
    
        diss1 = 1-wgcna.out$TOM;
        dynamicColors = wgcna.out$dynamicColors
        
        hier1=flashClust(as.dist(diss1), method="average" )
        #set the diagonal of the dissimilarity to NA 
        diag(diss1) = NA;
        plotDendroAndColors(hier1, dynamicColors, "Dynamic Tree Cut", dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05, main = "Gene dendrogram and module colors")

        #Visualize the Tom plot. Raise the dissimilarity matrix to the power of 4 to bring out the module structure
        #TOMplot(diss1, hier1, colorCode)           
}


listWGCNA.Modules <- function(){

        if (is.null(wgcna.out ) ){ # if sample info is uploaded and correctly parsed.
           return(NULL)    
        } else { 
            if( dim(wgcna.out$moduleInfo)[1] == 0  ) {  # if no module
                return(NULL) 
            }   else  { 
                    modules = unique(wgcna.out$moduleInfo[, c("dynamicMods","dynamicColors")] )
                    moduleList = apply(modules,1,paste,collapse=". ")
                    moduleList  = paste0( moduleList, " (", table(wgcna.out$moduleInfo[,"dynamicMods"] )," genes)"  )
                    moduleList = c(moduleList,"Entire network")
                    return(moduleList)


                }

        }           
}


moduleNetwork <- function(){

        outfile <- tempfile(fileext='.txt')
    
        #module = gsub(".* ","",input_selectWGCNA.Module)
        module = unlist(strsplit(input_selectWGCNA.Module," " ) )[2]
        
        moduleColors = wgcna.out$dynamicColors 
        inModule = (moduleColors==module);
        
        if( input_selectWGCNA.Module == "Entire network")
          inModule = rep(TRUE, length(inModule))
        
        datExpr = t(wgcna.out$x )
        probes = colnames(datExpr)
        modProbes = probes[inModule];
        
        modTOM = wgcna.out$TOM[inModule, inModule];
        dimnames(modTOM) = list(modProbes, modProbes)

        nTop = input_topGenesNetwork;
        
        if( nTop > 1000) nTop = 1000; 
        
        IMConn = softConnectivity(datExpr[, modProbes]);
        top = (rank(-IMConn) <= nTop)
        
        # adding symbols 
        probeToGene = NULL
        if( input_selectGO5 != "ID not recognized!" & input_selectOrg != "NEW")
        if(sum(is.na( allGeneInfo.out$symbol ) )/ dim( allGeneInfo.out )[1] <.5 ) { # if more than 50% genes has symbol
            probeToGene = allGeneInfo.out[,c("ensembl_gene_id","symbol")]
            probeToGene$symbol = gsub(" ","",probeToGene$symbol)

            ix = which( is.na(probeToGene$symbol) |
                        nchar(probeToGene$symbol)<2 | 
                        toupper(probeToGene$symbol)=="NA" |  
                        toupper(probeToGene$symbol)=="0"  )             
            probeToGene[ix,2] = probeToGene[ix,1]  # use gene ID

        }
        
        net <- modTOM[top,top] >input_edgeThreshold 

        
        for( i in 1:dim(net)[1])  # remove self connection
            net[i,i] = FALSE
        if(!is.null(probeToGene) & !input_noIDConversion) { # if gene symbol exist
            ix = match( colnames(net), probeToGene[,1])     
            colnames(net) = probeToGene[ix,2]
            ix = match( rownames(net), probeToGene[,1])     
            rownames(net) = probeToGene[ix,2]       
        }
        library(igraph,verbose=FALSE)
        #plot(graph_from_data_frame(d=data.frame(1:10,ncol=2)  ,directed=F) )
        # http://www.kateto.net/wp-content/uploads/2016/01/NetSciX_2016_Workshop.pdf
        plot( graph_from_adjacency_matrix( net, mod ="undirected" ), 
                vertex.label.color="black", vertex.label.dist=3,vertex.size=7)
        
}


networkModuleGO <- function(){      

    
            #module = gsub(".* ","",input_selectWGCNA.Module)
            module = unlist(strsplit(input_selectWGCNA.Module," " ) )[2]    
            moduleColors = wgcna.out$dynamicColors 
            inModule = (moduleColors==module);
            
            if( input_selectWGCNA.Module == "Entire network")
              inModule = rep(TRUE, length(inModule))

            probes = rownames(wgcna.out$x   )
            query  = probes[inModule];
        
            if( length(query)  <= minGenesEnrichment) return(NoSig) 

            

            result <- findOverlapGMT( query, GeneSets.out,1) 
        
                
            result$Genes = "Up regulated"
            

            results1 = result
            if ( is.null( results1) ) return (NoSig)
            if( dim(results1)[2] <5 ) return(NoSig)  # Returns a data frame: "No significant results found!"

            results1= results1[,c(5,1,2,4)]
            colnames(results1)= c("List","FDR","Genes","GO terms or pathways")
            minFDR = 0.01
            if(min(results1$FDR) > minFDR ) results1 = as.data.frame("No signficant enrichment found.") else
            results1 = results1[which(results1$FDR < minFDR),]
            
 
            
            if(dim(results1)[2] != 4) return(NoSig)
            colnames(results1)= c("Direction","adj.Pval","Genes","Pathways")
            
            results1$adj.Pval <- sprintf("%-2.1e",as.numeric(results1$adj.Pval) )
            results1[,1] <- as.character(results1[,1])
            tem <- results1[,1]

            results1[ duplicated (results1[,1] ),1 ] <- ""
            
            results1[,-1]           
}