Chapter 6 R code modules to be used for stand-alone analysis.
Base on the South Dakota State University analysis protocol
http://bioinformatics.sdstate.edu/
- Integrated Differential Expression and Pathway analysis
1.ShinyGO: Gene Ontology Enrichment Analysis
Github: iDEP-SDSU/idep
6.1 Local installation
Local installation of this software is possible through steps below. But it is not supported or updated freqently. Local install is for non-profit organizations only. For-profit businesses please contact us to license the database.
To run iDEP on your local machine (Windows, MacOS, Linux):
Requirements:
More than 250GB available storage
More than 4GB memory
Most recent version of R and RStudio installed.
Upgrade to the most recent version of R and Rstudio.
Start RStudio and install all the R packages. As we need so many R packages, this may take several hours. You can let it run and get started on steps 3 and 4 below to save time. From RStudio console window:
source https://raw.githubusercontent.com/iDEP-SDSU/idep/master/classes/librarySetup.RDownload iDEP source code and example data files from GitHub. The best is to click the green “Code” button and select “Download ZIP” on this page. Unzip to a folder such as C:/IDEP, so that it contains all the subfolders such as config, classes, shinyapps, and so on.
Download the most recent database file from here. Unzip it to the same folder (C:/IDEP), so that your database can be found at C:/IDEP/data/data104.
Start Rstudio and load the ui.R and server.R scripts in the folder C:/IDEP/shinyapps/idep94. And then click on Run app. Similarily, the ShinyGO app could be started at the folder, C:/IDEP/shinyapps/go74/.
6.2 Install packages
source https://raw.githubusercontent.com/iDEP-SDSU/idep/master/classes/librarySetup.R
# dplyr complains this required libraries: libudunits2-dev, libmariadb-client-lgpl-dev
#demo
# install.packages("plotly", repos="http://cran.rstudio.com/", dependencies=TRUE)
# sometimes need to remove all installed packages: https://www.r-bloggers.com/how-to-remove-all-user-installed-packages-in-r/
list.of.packages <- c(
"shiny", "shinyAce", "shinyBS", "plotly",
"RSQLite", "gplots",
"ggplot2", "dplyr", #"tidyverse",
"plotly",
"e1071", "reshape2", "DT",
"data.table", "Rcpp","WGCNA","flashClust","statmod","biclust","igraph","Rtsne",
"visNetwork", "BiocManager","feather","shinyjs","reactable"
)
list.of.bio.packages <- c(
"getDEE2", "limma", "DESeq2", "edgeR", "gage", "fgsea", "ReactomePA", "pathview", "PREDA",
"impute", "runibic","QUBIC","rhdf5", "STRINGdb",
"PREDAsampledata", "sfsmisc", "lokern", "multtest", "hgu133plus2.db",
"org.Ag.eg.db","org.At.tair.db","org.Bt.eg.db","org.Ce.eg.db","org.Cf.eg.db",
"org.Dm.eg.db","org.EcK12.eg.db","org.EcSakai.eg.db","org.Gg.eg.db",
"org.Hs.eg.db","org.Mm.eg.db","org.Mmu.eg.db","org.Pf.plasmo.db",
"org.Pt.eg.db","org.Rn.eg.db","org.Sc.sgd.db","org.Ss.eg.db","org.Xl.eg.db"
)
if(1) { # remove all old packages, to solve problem caused by Bioconductor upgrade
# create a list of all installed packages
ip <- as.data.frame(installed.packages())
# head(ip)
# if you use MRO, make sure that no packages in this library will be removed
ip <- subset(ip, !grepl("MRO", ip$LibPath))
# we don't want to remove base or recommended packages either\
ip <- ip[!(ip[,"Priority"] %in% c("base", "recommended")),]
# determine the library where the packages are installed
path.lib <- unique(ip$LibPath)
# create a vector with all the names of the packages you want to remove
pkgs.to.remove <- ip[,1]
# head(pkgs.to.remove)
# remove the packages
sapply(pkgs.to.remove, remove.packages, lib = path.lib)
}
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
new.bio.packages <- list.of.bio.packages[!(list.of.bio.packages %in% installed.packages()[,"Package"])]
notInstalledPackageCount = length(new.packages) + length(new.bio.packages)
#Install Require packages
while(notInstalledPackageCount != 0){
if(length(new.packages)) install.packages(new.packages, repos="http://cran.rstudio.com/", dependencies=TRUE, quiet=TRUE)
if(length(new.bio.packages)){
BiocManager::install(new.bio.packages, ask = FALSE, dependencies=TRUE, quiet=TRUE)
}
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
new.bio.packages <- list.of.bio.packages[!(list.of.bio.packages %in% installed.packages()[,"Package"])]
if( notInstalledPackageCount == length(new.packages) + length(new.bio.packages) )
{
#no new package installed.
break
}
else {
notInstalledPackageCount = length(new.packages) + length(new.bio.packages)
}
}
#PGSEA is deprecated since Bioconductor 3.12. So we have to install manually from source.
BiocManager::install(c("GO.db","KEGG.db", "annaffy")) # required y PGSEA
install.packages("https://bioconductor.org/packages/3.10/bioc/src/contrib/PGSEA_1.60.0.tar.gz",
repos=NULL, type="source")
list.of.bio.packages = c(list.of.bio.packages, "PGSEA") # add package for testing
#Load Packages
suc = unlist ( lapply(list.of.packages, require, character.only = TRUE) )
if(sum(suc) < length(list.of.packages) )
cat ("\n\nWarnning!!!!!! These R packages cannot be loaded:", list.of.packages[!suc] )
suc = unlist ( lapply(list.of.bio.packages, require, character.only = TRUE) )
if(sum(suc) < length(list.of.bio.packages) )
cat ("\n\nWarnning!!!!!! These Bioconductor packages cannot be loaded:", list.of.bio.packages[!suc] )################################################################
# Install packages
################################################################
# dplyr complains this required libraries: libudunits2-dev, libmariadb-client-lgpl-dev
# install.packages("plotly", repos="http://cran.rstudio.com/", dependencies=TRUE)
# sometimes need to remove all installed packages: https://www.r-bloggers.com/how-to-remove-all-user-installed-packages-in-r/
#cat("\n Installing lots of R packages, this may take several hours ... \n
# Each of these packages took years to develop.\n So be a patient thief.\n
# \n Note 1: Sometimes dependencies needs to be installed manually.
# \n Note 2: If you are using an older version of R, and having trouble with package installation,
# \n sometimes, it is easier to install the new version of R and delete all old packages, and start fresh.
# ")
list.of.packages <- c("dendextend", "htmlwidgets","RSQLite",
"gplots", "ggplot2", "dplyr", "tidyverse","plotly","e1071",
"reshape2", "Rcpp","flashClust","statmod",
"biclust","igraph","Rtsne")
list.of.bio.packages <- c("STRINGdb", "limma", "DESeq2", "edgeR", "gage",
"PGSEA", "fgsea", "ReactomePA", "pathview", "PREDA", "impute", "runibic",
"QUBIC","rhdf5", "PREDAsampledata", "sfsmisc", "lokern", "multtest",
"GSEABase","hgu133plus2.db")
#Install Require packages
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages, repos="http://cran.rstudio.com/", dependencies=TRUE)
new.bio.packages <- list.of.bio.packages[!(list.of.bio.packages %in% installed.packages()[,"Package"])]
if(length(new.bio.packages)){
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install(new.bio.packages, suppressUpdates = T)
}
# WGCNA must be installed after AnnotationDbi, impute, GO.db, preprocessCore
# install.packages(c("WGCNA"))
#Test Load Packages This cause an error when too many packages are loaded.
# suc = unlist ( lapply(list.of.packages, require, character.only = TRUE) )
# if(sum(suc) < length(list.of.packages) )
# cat ("\n\nWarnning!!!!!! These R packages cannot be loaded:", list.of.packages[!suc] )
# suc = unlist ( lapply(list.of.bio.packages, require, character.only = TRUE) )
# if(sum(suc) < length(list.of.bio.packages) )
# cat ("\n\nWarnning!!!!!! These Bioconductor packages cannot be loaded:", list.of.bio.packages[!suc] )6.3 Global variables
################################################################
# Global variables
################################################################
library(RSQLite,verbose=FALSE) # for database connection
library(gplots,verbose=FALSE) # for hierarchical clustering
library(ggplot2,verbose=FALSE) # graphics
library(e1071,verbose=FALSE) # computing kurtosis
library(DT,verbose=FALSE) # for renderDataTable
library(plotly,verbose=FALSE) # for interactive heatmap
library(reshape2,verbose=FALSE) # for melt correlation matrix in heatmap
input_goButton = 0 # if 1, use demo data file
Min_overlap <- 2
minSetSize = 3;
mappingCoverage = 0.60 # 60% percent genes has to be mapped for confident mapping
mappingEdge = 0.5 # Top species has 50% more genes mapped
PvalGeneInfo = 0.05; minGenes = 10 # min number of genes for ploting
kurtosis.log = 50 # log transform is enforced when kurtosis is big
kurtosis.warning = 10 # log transformation recommnded
minGenesEnrichment = 2 # perform GO or promoter analysis only if more than this many genes
PREDA_Permutations =1000
maxGeneClustering = 12000 # max genes for hierarchical clustering and k-Means clustering. Slow if larger
maxGeneWGCNA = 2000 # max genes for co-expression network
maxFactors =6 # max number of factors in DESeq2 models
redudantGeneSetsRatio = 0.9 # remove redundant genesets in enrichment analysis
set.seed(2) # seed for random number generator
mycolors = sort(rainbow(20))[c(1,20,10,11,2,19,3,12,4,13,5,14,6,15,7,16,8,17,9,18)] # 20 colors for kNN clusters
#Each row of this matrix represents a color scheme;
hmcols <- colorRampPalette(rev(c("#D73027", "#FC8D59", "#FEE090", "#FFFFBF",
"#E0F3F8", "#91BFDB", "#4575B4")))(75)
heatColors = rbind( greenred(75), bluered(75), colorpanel(75,"green","black","magenta"),colorpanel(75,"blue","yellow","red"),hmcols )
rownames(heatColors) = c("Green-Black-Red","Blue-White-Red","Green-Black-Magenta","Blue-Yellow-Red","Blue-white-brown")
colorChoices = setNames(1:dim(heatColors)[1],rownames(heatColors)) # for pull down menu
# Create a list for Select Input options
speciesChoice = list()
speciesChoice <- append( setNames( "NEW","**NEW SPECIES**"), speciesChoice )
speciesChoice <- append( setNames( "BestMatch","Best matching species"), speciesChoice )6.4 Read data and pre-process
################################################################
# Read data and pre-process
################################################################
readData <- function(inFile ) {
# these packages moved here to reduce loading time
library(edgeR,verbose=FALSE) # count data D.E.
library(DESeq2,verbose=FALSE) # count data analysis
dataTypeWarning =0
dataType =c(TRUE)
#---------------Read file
x <- read.csv(inFile) # try CSV
if(dim(x)[2] <= 2 ) # if less than 3 columns, try tab-deliminated
x <- read.table(inFile, sep="\t",header=TRUE)
#-------Remove non-numeric columns, except the first column
for(i in 2:dim(x)[2])
dataType = c( dataType, is.numeric(x[,i]) )
if(sum(dataType) <=2) return (NULL) # only less than 2 columns are numbers
x <- x[,dataType] # only keep numeric columns
# rows with all missing values
ix = which( apply(x[,-1],1, function(y) sum( is.na(y) ) ) != dim(x)[2]-1 )
x <- x[ix,]
dataSizeOriginal = dim(x); dataSizeOriginal[2] = dataSizeOriginal[2] -1
x[,1] <- toupper(x[,1])
x[,1] <- gsub(" ","",x[,1]) # remove spaces in gene ids
x = x[order(- apply(x[,2:dim(x)[2]],1,sd) ),] # sort by SD
x <- x[!duplicated(x[,1]) ,] # remove duplicated genes
rownames(x) <- x[,1]
x <- as.matrix(x[,c(-1)])
# remove "-" or "." from sample names
colnames(x) = gsub("-","",colnames(x))
colnames(x) = gsub("\\.","",colnames(x))
#cat("\nhere",dim(x))
# missng value for median value
if(sum(is.na(x))>0) {# if there is missing values
if(input_missingValue =="geneMedian") {
rowMedians <- apply(x,1, function (y) median(y,na.rm=T))
for( i in 1:dim(x)[2] ) {
ix = which(is.na(x[,i]) )
x[ix,i] <- rowMedians[ix]
}
} else if(input_missingValue =="treatAsZero") {
x[is.na(x) ] <- 0
} else if (input_missingValue =="geneMedianInGroup") {
sampleGroups = detectGroups( colnames(x))
for (group in unique( sampleGroups) ){
samples = which( sampleGroups == group )
rowMedians <- apply(x[,samples, drop=F],1, function (y) median(y,na.rm=T))
for( i in samples ) {
ix = which(is.na(x[ ,i] ) )
if(length(ix) >0 )
x[ix, i ] <- rowMedians[ix]
}
}
# missing for entire sample group, use median for all samples
if(sum(is.na(x) )>0 ) {
rowMedians <- apply(x,1, function (y) median(y,na.rm=T))
for( i in 1:dim(x)[2] ) {
ix = which(is.na(x[,i]) )
x[ix,i] <- rowMedians[ix]
}
}
}
}
# Compute kurtosis
mean.kurtosis = mean(apply(x,2, kurtosis),na.rm=T)
rawCounts = NULL
pvals= NULL
if (input_dataFileFormat == 2 ) { # if FPKM, microarray
if ( is.integer(x) ) dataTypeWarning = 1; # Data appears to be read counts
#-------------filtering
#tem <- apply(x,1,max)
#x <- x[which(tem > input_lowFilter),] # max by row is at least
x <- x[ which( apply( x, 1, function(y) sum(y >= input_lowFilter)) >= input_NminSamples2 ) , ]
x <- x[which(apply(x,1, function(y) max(y)- min(y) ) > 0 ),] # remove rows with all the same levels
#--------------Log transform
# Takes log if log is selected OR kurtosis is big than 100
if ( (input_transform == TRUE) | (mean.kurtosis > kurtosis.log ) )
x = log(x+abs( input_logStart),2)
tem <- apply(x,1,sd)
x <- x[order(-tem),] # sort by SD
} else
if( input_dataFileFormat == 1) { # counts data
# data not seems to be read counts
if(!is.integer(x) & mean.kurtosis < kurtosis.log ) {
dataTypeWarning = -1
}
# not used as some counts data like those from CRISPR screen
#validate( # if Kurtosis is less than a threshold, it is not read-count
# need(mean.kurtosis > kurtosis.log, "Data does not seem to be read count based on distribution. Please double check.")
# )
x <- round(x,0) # enforce the read counts to be integers. Sailfish outputs has decimal points.
#x <- x[ which( apply(x,1,max) >= input_minCounts ) , ] # remove all zero counts
# remove genes if it does not at least have minCounts in at least NminSamples
#x <- x[ which( apply(x,1,function(y) sum(y>=input_minCounts)) >= input_NminSamples ) , ] # filtering on raw counts
# using counts per million (CPM) for filtering out genes.
# CPM matrix #N samples > minCounts
x <- x[ which( apply( cpm(DGEList(counts = x)), 1,
function(y) sum(y>=input_minCounts)) >= input_NminSamples ) , ]
rawCounts = x; # ???
# construct DESeqExpression Object
# colData = cbind(colnames(x), as.character(detectGroups( colnames(x) )) )
tem = rep("A",dim(x)[2]); tem[1] <- "B" # making a fake design matrix to allow process, even when there is no replicates
colData = cbind(colnames(x), tem )
colnames(colData) = c("sample", "groups")
dds <- DESeqDataSetFromMatrix(countData = x, colData = colData, design = ~ groups)
dds <- estimateSizeFactors(dds) # estimate size factor for use in normalization later for started log method
# regularized log or VST transformation
if( input_CountsTransform == 3 ) { # rlog is slow, only do it with 10 samples
if(dim(x)[2]<=20 ) {
x <- rlog(dds, blind=TRUE); x <- assay(x) } else
x <- log2( counts(dds, normalized=TRUE) + input_countsLogStart )
}
else {
if ( input_CountsTransform == 2 ) { # vst is fast but aggressive
x <- vst(dds, blind=TRUE)
x <- assay(x)
} else{ # normalized by library sizes and add a constant.
x <- log2( counts(dds, normalized=TRUE) + input_countsLogStart ) # log(x+c)
# This is equivalent to below. But the prior counts is more important
#x <- cpm(DGEList(counts = x),log=TRUE, prior.count=input_countsLogStart ) #log CPM from edgeR
#x <- x-min(x) # shift values to avoid negative numbers
}
}
} else
if( input_dataFileFormat == 3) { # other data type
#neg_lfc neg_fdr pos_lfc pos_fdr
#11 1 11 1
n2 = ( dim(x)[2] %/% 2) # 5 --> 2
# It looks like it contains P values
# ranges of columns add 0.2 and round to whole. For P value columns this should be 1
tem = round( apply(x, 2, function( y) max(y)- min(y)) + .2)
if( sum(tem[(1:n2)*2 ] == 1 ) == n2 |
sum(tem[(1:n2)*2-1 ] == 1 ) == n2 ) {
x = x[,1:(2*n2) ,drop=FALSE ] # if 5, change it to 4
if(tem[2] == 1) { # FDR follows Fold-change
pvals = x [,2*(1:n2 ),drop=FALSE ] # 2, 4, 6
x = x[, 2*(1:n2 )-1,drop=FALSE] # 1, 3, 5
} else { # FDR follows Fold-change
pvals = x [,2*(1:n2 )-1,drop=FALSE ] # 2, 4, 6
x = x[, 2*(1: n2 ),drop=FALSE] # 1, 3, 5
}
}
ix = which(apply(x,1, function(y) max(y)- min(y) ) > 0 )
x <- x[ix,] # remove rows with all the same levels
if(!is.null(pvals) )
pvals = pvals[ix,]
}
dataSize = dim(x);
if(!(dim(x)[1]>5 & dim(x)[2]>1))
stop ( "Data file not recognized. Please double check.")
sampleInfoDemo=NULL
if( input_goButton >0)
sampleInfoDemo <- t( read.csv(demoDataFile2,row.names=1,header=T,colClasses="character") )
finalResult <- list(data = as.matrix(x), mean.kurtosis = mean.kurtosis, rawCounts = rawCounts, dataTypeWarning=dataTypeWarning, dataSize=c(dataSizeOriginal,dataSize),sampleInfoDemo=sampleInfoDemo, pvals =pvals )
return(finalResult)
}
readSampleInfo <- function(inFile){
dataTypeWarning =0
dataType =c(TRUE)
#---------------Read file
x <- read.csv(inFile,row.names=1,header=T,colClasses="character") # try CSV
if(dim(x)[2] <= 2 ) # if less than 3 columns, try tab-deliminated
x <- read.table(inFile, row.names=1,sep="\t",header=TRUE,colClasses="character")
# remove "-" or "." from sample names
colnames(x) = gsub("-","",colnames(x))
colnames(x) = gsub("\\.","",colnames(x))
#----------------Matching with column names of expression file
ix = match(toupper(colnames(readData.out$data)), toupper(colnames(x)) )
ix = ix[which(!is.na(ix))] # remove NA
if(! ( length(unique(ix) ) == dim(readData.out$data)[2]
& dim(x)[1]>=1 & dim(x)[1] <500 ) ) # at least one row, it can not be more than 500 rows
stop( "Error!!! Sample information file not recognized. Sample names must be exactly the same. Each row is a factor. Each column represent a sample. Please see documentation on format.")
#-----------Double check factor levels, change if needed
# remove "-" or "." from factor levels
for( i in 1:dim(x)[1]) {
x[i,] = gsub("-","",x[i,])
x[i,] = gsub("\\.","",x[i,])
}
# if levels from different factors match
if( length(unique(ix) ) == dim(readData.out$data)[2]) { # matches exactly
x = x[,ix]
# if the levels of different factors are the same, it may cause problems
if( sum( apply(x, 1, function(y) length(unique(y)))) > length(unique(unlist(x) ) ) ) {
tem2 =apply(x,2, function(y) paste0( names(y),y)) # factor names are added to levels
rownames(tem2) = rownames(x)
x <- tem2
}
return(t( x ) )
} else retrun(NULL)
}
textTransform <- function () {
k.value = readData.out$mean.kurtosis
tem = paste( "Mean Kurtosis = ", round(k.value,2), ".\n",sep = "")
if( k.value > kurtosis.log) tem = paste(tem, " Detected extremely large values. \n When kurtosis >", kurtosis.log,
", log transformation is automatically applied.\n", sep="") else if (k.value>kurtosis.warning)
{tem = paste(tem, " Detected large numbers with kurtosis >",
kurtosis.warning,". \n Log transformation is recommended.", sep="") }
if(readData.out$dataTypeWarning == 1 ) {
tem = paste (tem, " ------Warning!!! \nData matrix contains all integers. \n It seems to be read counts!!! \n Please select appropriate data type in the previous page and reload file.")}
if(readData.out$dataTypeWarning == -1 ) {
tem = paste (tem, " ------Warning!!! \nData does not look like read counts data. \n Please select appropriate data type in the previous page and reload file.")}
tem
}
# Clean up gene sets. Remove spaces and other control characters from gene names
cleanGeneSet <- function (x){
# remove duplicate; upper case; remove special characters
x <- unique( toupper( gsub("\n| ","",x) ) )
x <- x[which( nchar(x)>1) ] # genes should have at least two characters
return(x)
}
findSpeciesById <- function (speciesID){ # find species name use id
return( orgInfo[which(orgInfo$id == speciesID),] )
}
# just return name
findSpeciesByIdName <- function (speciesID){ # find species name use id
return( orgInfo[which(orgInfo$id == speciesID),3] )
}
# convert gene IDs to ensembl gene ids and find species
convertID <- function (query,selectOrg, selectGO) {
querySet <- cleanGeneSet( unlist( strsplit( toupper(query),'\t| |\n|\\,')))
# querySet is ensgene data for example, ENSG00000198888, ENSG00000198763, ENSG00000198804
querySTMT <- paste( "select distinct id,ens,species from mapping where id IN ('", paste(querySet,collapse="', '"),"')",sep="")
result <- dbGetQuery(convert, querySTMT)
if( dim(result)[1] == 0 ) return(NULL)
if(selectOrg == speciesChoice[[1]]) {
comb = paste( result$species,result$idType)
sortedCounts = sort(table(comb),decreasing=T)
recognized =names(sortedCounts[1])
result <- result[which(comb == recognized),]
speciesMatched=sortedCounts
names(speciesMatched )= sapply(as.numeric(gsub(" .*","",names(sortedCounts) ) ), findSpeciesByIdName )
speciesMatched <- as.data.frame( speciesMatched )
if(length(sortedCounts) == 1) { # if only one species matched
speciesMatched[1,1] <-paste( rownames(speciesMatched), "(",speciesMatched[1,1],")",sep="")
} else {# if more than one species matched
speciesMatched[,1] <- as.character(speciesMatched[,1])
speciesMatched[,1] <- paste( speciesMatched[,1]," (",speciesMatched[,2], ")", sep="")
speciesMatched[1,1] <- paste( speciesMatched[1,1]," ***Used in mapping*** To change, select from above and resubmit query.")
speciesMatched <- as.data.frame(speciesMatched[,1])
}
} else { # if species is selected
result <- result[which(result$species == selectOrg ) ,]
if( dim(result)[1] == 0 ) return(NULL) #stop("ID not recognized!")
speciesMatched <- as.data.frame(paste("Using selected species ", findSpeciesByIdName(selectOrg) ) )
}
result <- result[which(!duplicated(result[,2]) ),] # remove duplicates in ensembl_gene_id
result <- result[which(!duplicated(result[,1]) ),] # remove duplicates in user ID
colnames(speciesMatched) = c("Matched Species (genes)" )
conversionTable <- result[,1:2]; colnames(conversionTable) = c("User_input","ensembl_gene_id")
conversionTable$Species = sapply(result[,3], findSpeciesByIdName )
return(list(originalIDs = querySet,IDs=unique( result[,2]),
species = findSpeciesById(result$species[1]),
#idType = findIDtypeById(result$idType[1] ),
speciesMatched = speciesMatched,
conversionTable = conversionTable
) )
}
# retrieve detailed info on genes
geneInfo <- function (fileName = NULL){
if(!is.null(fileName)) # if only one file #WBGene0000001 some ensembl gene ids in lower case
{ x = read.csv(as.character(fileName) ); x[,1]= toupper(x[,1]) } else # read in the chosen file
{ return(as.data.frame("No file.") ) }
Set=rep("List",nrow(x))
return( cbind(x,Set) )
}
convertedData <- function() {
if( is.null(converted.out ) ) return( readData.out$data) # if id or species is not recognized use original data.
if(input_noIDConversion) return( readData.out$data )
mapping <- converted.out$conversionTable
# cat (paste( "\nData:",input_selectOrg) )
x =readData.out$data
rownames(x) = toupper(rownames(x))
# any gene not recognized by the database is disregarded
# x1 = merge(mapping[,1:2],x, by.y = 'row.names', by.x = 'User_input')
# the 3 lines keeps the unrecogized genes using original IDs
x1 = merge(mapping[,1:2],x, by.y = 'row.names', by.x = 'User_input', all.y=TRUE)
# original IDs used if ID is not matched in database
ix = which(is.na(x1[,2]) )
x1[ix,2] = x1[ix,1]
#multiple matched IDs, use the one with highest SD
tem = apply(x1[,3:(dim(x1)[2])],1,sd)
x1 = x1[order(x1[,2],-tem),]
x1 = x1[!duplicated(x1[,2]) ,]
rownames(x1) = x1[,2]
x1 = as.matrix(x1[,c(-1,-2)])
tem = apply(x1,1,sd)
x1 = x1[order(-tem),] # sort again by SD
return(x1)
}
convertedCounts <- function() {
if( is.null(converted.out ) ) return( readData.out$rawCounts)
mapping <- converted.out$conversionTable
x =readData.out$rawCounts
rownames(x) = toupper(rownames(x))
# any gene not recognized by the database is disregarded
# x1 = merge(mapping[,1:2],x, by.y = 'row.names', by.x = 'User_input')
# the 3 lines keeps the unrecogized genes using original IDs
x1 = merge(mapping[,1:2],x, by.y = 'row.names', by.x = 'User_input', all.y=TRUE)
ix = which(is.na(x1[,2]) )
x1[ix,2] = x1[ix,1] # original IDs used
#multiple matched IDs, use the one with highest SD
tem = apply(x1[,3:(dim(x1)[2])],1,sd)
x1 = x1[order(x1[,2],-tem),]
x1 = x1[!duplicated(x1[,2]) ,]
rownames(x1) = x1[,2]
x1 = as.matrix(x1[,c(-1,-2)])
tem = apply(x1,1,sd)
x1 = x1[order(-tem),] # sort again by SD
return(x1)
} # here
nGenesFilter <- function() {
tem = readData.out$dataSize
ix = match( toupper( rownames(convertedData.out)), toupper(converted.out$conversionTable$ensembl_gene_id ) )
nMatched = sum( !is.na(ix) )
if( input_noIDConversion)
return( paste(tem[1], "genes in", tem[4], "samples.", tem[3], " genes passed filter.\n Original gene IDs used." ) ) else
return( paste(tem[1], "genes in", tem[4], "samples. Of the", tem[3], " genes passed filter, \n",
nMatched, " were converted to Ensembl gene IDs in our database. \n
The remaining ", tem[3]-nMatched, "genes were kept in the data using original IDs." ) )
}
# Define sample groups based on column names
detectGroups <- function (x){ # x are col names
tem <- gsub("[0-9]*$","",x) # Remove all numbers from end
#tem = gsub("_Rep|_rep|_REP","",tem)
tem <- gsub("_$","",tem); # remove "_" from end
tem <- gsub("_Rep$","",tem); # remove "_Rep" from end
tem <- gsub("_rep$","",tem); # remove "_rep" from end
tem <- gsub("_REP$","",tem) # remove "_REP" from end
return( tem )
}
# EDA plots -------------------------------------
densityPlot <- function() {
set.seed(2) # seed for random number generator
mycolors = sort(rainbow(20))[c(1,20,10,11,2,19,3,12,4,13,5,14,6,15,7,16,8,17,9,18)] # 20 colors for kNN clusters
#Each row of this matrix represents a color scheme;
x <- readData.out$data
myColors = mycolors
plot(density(x[,1]),col = myColors[1], lwd=2,
xlab="Expresson values", ylab="Density", main= "Distribution of transformed data",
ylim=c(0, max(density(x[,1])$y)+.02 ) )
for( i in 2:dim(x)[2] )
lines(density(x[,i]),col=myColors[i], lwd=2 )
if(dim(x)[2]< 31 ) # if too many samples do not show legends
legend("topright", cex=1,colnames(x), lty=rep(1,dim(x)[2]), col=myColors )
}
genePlot <- function() {
x <- convertedData.out
Symbols <- rownames(x)
if( input_selectOrg != "NEW") {
ix = match( rownames(x), allGeneInfo.out[,1])
if( sum( is.na(allGeneInfo.out$symbol )) != dim(allGeneInfo.out )[1] ) { # symbol really exists?
Symbols = as.character( allGeneInfo.out$symbol[ix] )
Symbols[which( nchar(Symbols) <= 2 ) ] <- rownames(x) [which( nchar(Symbols) <= 2 ) ]
}
}
x = as.data.frame(x)
x$Genes = Symbols
# matching from the beginning of symbol
ix = which(regexpr( paste("^" , toupper(input_geneSearch),sep="") ,toupper(x$Genes)) > 0)
if(grepl(" ", input_geneSearch) ) # if there is space character, do exact match
ix = match(gsub(" ","", toupper(input_geneSearch)),x$Genes)
# too few or too many genes found
if(length(ix) == 0 | length(ix) > 50 ) return(NULL)
# no genes found
mdf = melt(x[ix,],id.vars="Genes", value.name="value", variable.name="samples")
p1 <- ggplot(data=mdf, aes(x=samples, y=value, group = Genes, shape=Genes, colour = Genes)) +
geom_line() +
geom_point( size=5, fill="white")+ #shape=21 circle
#theme(axis.text.x = element_text(size=16,angle = 45, hjust = 1)) +
labs(y="Transformed expression level") +
coord_cartesian(ylim = c(0, max(mdf$value)))
p1 <- p1 + theme(plot.title = element_text(size = 16,hjust = 0.5)) + # theme(aspect.ratio=1) +
theme(axis.text.x = element_text(angle=45, size = 16, hjust=1),
axis.text.y = element_text( size = 16),
axis.title.x = element_blank(),
axis.title.y = element_text( size = 16) ) +
theme(legend.text=element_text(size=12))
p1
}
geneBarPlotError <- function() {
x <- convertedData.out
Symbols <- rownames(x)
if( input_selectOrg != "NEW") {
ix = match( rownames(x), allGeneInfo.out[,1])
if( sum( is.na(allGeneInfo.out$symbol )) != dim(allGeneInfo.out )[1] ) { # symbol really exists?
Symbols = as.character( allGeneInfo.out$symbol[ix] )
Symbols[which( nchar(Symbols) <= 2 ) ] <- rownames(x) [which( nchar(Symbols) <= 2 ) ]
}
}
x = as.data.frame(x)
x$Genes = Symbols
#write.csv(x,"tem.csv")
# Search for genes
#ix = grep("HOXA",toupper(x$Genes) )
# ix = grep(toupper(input_geneSearch),toupper(x$Genes)) # sox --> Tsox
# matching from the beginning of symbol
ix = which(regexpr( paste("^" , toupper(input_geneSearch),sep="") ,toupper(x$Genes)) > 0)
if(grepl(" ", input_geneSearch) ) # if there is space character, do exact match
ix = match(gsub(" ","", toupper(input_geneSearch)),x$Genes)
# too few or too many genes found
if(length(ix) == 0 | length(ix) > 50 ) return(NULL)
# no genes found
mdf = melt(x[ix,],id.vars="Genes", value.name="value", variable.name="samples")
mdf$count = 1
g = detectGroups(mdf$samples)
Means = aggregate(mdf$value,by=list( g, mdf$Genes ), FUN = mean, na.rm=TRUE )
SDs = aggregate(mdf$value,by=list( g, mdf$Genes ), FUN = sd, na.rm=TRUE )
Ns = aggregate(mdf$count, by= list(g, mdf$Genes) , FUN = sum )
summarized = cbind(Means,SDs[,3],Ns[,3])
colnames(summarized)= c("Samples","Genes","Mean","SD","N")
summarized$SE = summarized$SD / sqrt(summarized$N)
#http://www.sthda.com/english/wiki/ggplot2-barplots-quick-start-guide-r-software-and-data-visualization
p2 <- ggplot(summarized, aes(x=Genes, y=Mean,fill=Samples) ) + # data & aesthetic mapping
geom_bar(stat="identity", position=position_dodge()) + # bars represent average
geom_errorbar(aes(ymin=Mean-SE, ymax=Mean+SE), width=0.2,position=position_dodge(.9)) +
labs(y="Expression Level")
if(input_useSD == 1) {
p2 <- ggplot(summarized, aes(x=Genes, y=Mean,fill=Samples) ) + # data & aesthetic mapping
geom_bar(stat="identity", position=position_dodge()) + # bars represent average
geom_errorbar(aes(ymin=Mean-SD, ymax=Mean+SD), width=0.2,position=position_dodge(.9)) +
labs(y="Expression Level")
}
p2 <- p2 + theme(plot.title = element_text(size = 16,hjust = 0.5)) + # theme(aspect.ratio=1) +
theme(axis.text.x = element_text(angle=45, size = 16, hjust=1),
axis.text.y = element_text( size = 16),
axis.title.x = element_blank(),
axis.title.y = element_text( size = 16) ) +
theme(legend.text=element_text(size=16))
p2
}6.5 Hierarchical clustering
################################################################
# Hierarchical clustering
################################################################
# Functions for hierarchical clustering
hclust2 <- function(x, method="average", ...) # average linkage
hclust(x, method=method, ...)
hclust.ward.D <- function(x, method="ward.D", ...) # average linkage
hclust(x, method=method, ...)
hclust.ward.D2 <- function(x, method="ward.D2", ...) # average linkage
hclust(x, method=method, ...)
hclust.single <- function(x, method="single", ...) # average linkage
hclust(x, method=method, ...)
hclust.mcquitty <- function(x, method="mcquitty", ...) # average linkage
hclust(x, method=method, ...)
hclust.median <- function(x, method="median", ...) # average linkage
hclust(x, method=method, ...)
hclust.centroid <- function(x, method="centroid", ...) # average linkage
hclust(x, method=method, ...)
hclustFuns <- list( averge = hclust2, complete=hclust, single=hclust.single,
median=hclust.median, centroid=hclust.centroid, mcquitty=hclust.mcquitty)
hclustChoices = setNames(1:length(hclustFuns),names(hclustFuns)) # for pull down menu
dist2 <- function(x, ...) # distance function = 1-PCC (Pearson's correlation coefficient)
as.dist(1-cor(t(x), method="pearson"))
dist3 <- function(x, ...) # distance function = 1-abs(PCC) (Pearson's correlation coefficient)
as.dist(1-abs(cor(t(x), method="pearson")))
distFuns <- list(Correlation=dist2, Euclidean=dist,AbsolutePCC=dist3)
# hierarchical clustering tree ----------------------------------------
hmcols <- colorRampPalette(rev(c("#D73027", "#FC8D59", "#FEE090", "#FFFFBF",
"#E0F3F8", "#91BFDB", "#4575B4")))(75)
heatColors = rbind( greenred(75), bluered(75), colorpanel(75,"green","black","magenta"),colorpanel(75,"blue","yellow","red"),hmcols )
rownames(heatColors) = c("Green-Black-Red","Blue-White-Red","Green-Black-Magenta","Blue-Yellow-Red","Blue-white-brown")
colorChoices = setNames(1:dim(heatColors)[1],rownames(heatColors)) # for pull down menu
# adding sample legends to heatmap; this is for the main heatmap
# https://stackoverflow.com/questions/3932038/plot-a-legend-outside-of-the-plotting-area-in-base-graphics
add_legend <- function(...) {
opar <- par(fig=c(0, 1, 0, 1), oma=c(0, 0, 0, 0),
mar=c(0, 0, 0, 6), new=TRUE)
on.exit(par(opar))
plot(0, 0, type='n', bty='n', xaxt='n', yaxt='n')
legend(...)
}
correlationMatrix <- function( ) {
# heatmap of correlation matrix
x <- readData.out$data
maxGene <- apply(x,1,max)
x <- x[which(maxGene > quantile(maxGene)[1] ) ,] # remove bottom 25% lowly expressed genes, which inflate the PPC
melted_cormat <- melt(round(cor(x),2), na.rm = TRUE)
# melted_cormat <- melted_cormat[which(melted_cormat[,1] != melted_cormat[,2] ) , ]
# Create a ggheatmap
ggheatmap <- ggplot(melted_cormat, aes(Var2, Var1, fill = value))+
geom_tile(color = "white")+
scale_fill_gradient2(low = "green", high = "red", mid = "white",
space = "Lab", limit = c(min(melted_cormat[,3]) ,max(melted_cormat[,3])), midpoint = median(melted_cormat[,3]),
name="Pearson\nCorrelation"
) +
theme_minimal()+ # minimal theme
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 15, hjust = 1))+
theme(axis.text.y = element_text(
size = 15))+
coord_fixed()+
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.major = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks = element_blank(),
legend.justification = c(1, 0),
legend.position = c(0.6, 0.7),
legend.direction = "horizontal")+
guides(fill = FALSE) # + ggtitle("Pearson's Correlation Coefficient (all genes)")
if(input_labelPCC && ncol(x)<30)
ggheatmap <- ggheatmap + geom_text(aes(Var2, Var1, label = value), color = "black", size = 4)
ggheatmap
}
staticHeatmap <- function () {
x <- readData.out$data # x = read.csv("expression1.csv")
n=input_nGenes
#if(n>6000) n = 6000 # max
if(n>dim(x)[1]) n = dim(x)[1] # max as data
# this will cutoff very large values, which could skew the color
if(input_geneCentering)
x=as.matrix(x[1:n,])-apply(x[1:n,],1,mean)
# standardize by gene
if(input_geneNormalize)
x <- x / apply(x,1,sd)
# row centering and normalize
x <- scale(x, center = input_sampleCentering, scale = input_sampleNormalize)
cutoff = median(unlist(x)) + input_heatmapCutoff * sd (unlist(x))
x[x>cutoff] <- cutoff
cutoff = median(unlist(x)) - input_heatmapCutoff *sd (unlist(x))
x[x< cutoff] <- cutoff
groups = detectGroups(colnames(x) )
# if sample info file is uploaded us that info:
if(!is.null(readSampleInfo.out) && !is.null(input_selectFactorsHeatmap) ) {
if(input_selectFactorsHeatmap == "Sample_Name" )
groups = detectGroups(colnames(x) ) else
{ ix = match(input_selectFactorsHeatmap, colnames(readSampleInfo.out ) )
groups = readSampleInfo.out[,ix]
}
}
groups.colors = rainbow(length(unique(groups) ) )
#http://stackoverflow.com/questions/15351575/moving-color-key-in-r-heatmap-2-function-of-gplots-package
lmat = rbind(c(0,4),c(0,1),c(3,2),c(5,0))
lwid = c(2,6) # width of gene tree; width of heatmap
lhei = c(1.5,.2,8,1.1)
#layout matrix
# [,1] [,2]
# [1,] 0 4
# [2,] 0 1
# [3,] 3 2
# [4,] 5 0
# 4--> column tree; 1--> column color bar; 2--> heatmap; 3-> row tree; 5--> color key.
# height of 4 rows is specified by lhei; width of columns is given by lwid
par(mar = c(12, 4, 1.4, 0.2))
if( n>110)
heatmap.2(x, distfun = distFuns[[as.integer(input_distFunctions)]]
,hclustfun=hclustFuns[[as.integer(input_hclustFunctions)]]
,Colv=!input_noSampleClustering
#col=colorpanel(75,"green","black","magenta") ,
#col=bluered(75),
#col=greenred(75),
,col= heatColors[as.integer(input_heatColors1),]
,density.info="none", trace="none", scale="none", keysize=.5
,key=T, symkey=F
,ColSideColors=groups.colors[ as.factor(groups)]
,labRow=""
,margins=c(10,2)
,srtCol=45
,cexCol=2 # size of font for sample names
,lmat = lmat, lwid = lwid, lhei = lhei
)
if( n<=110)
heatmap.2(x, distfun = distFuns[[as.integer(input_distFunctions)]]
,hclustfun=hclustFuns[[as.integer(input_hclustFunctions)]]
,Colv=!input_noSampleClustering
,col= heatColors[as.integer(input_heatColors1),], density.info="none", trace="none", scale="none", keysize=.5
,key=T, symkey=F
#,labRow=labRow
,ColSideColors=groups.colors[ as.factor(groups)]
,margins=c(18,12)
,cexRow=1
,srtCol=45
,cexCol=1.8 # size of font for sample names
,lmat = lmat, lwid = lwid, lhei = lhei
)
par(lend = 1) # square line ends for the color legend
add_legend("topleft",
legend = unique(groups), # category labels
col = groups.colors[ unique(as.factor(groups))], # color key
lty= 1, # line style
lwd = 10 # line width
)
}
#Interactive heatmap via Plotly plot ----------------------------------------------
# interactive heatmap with plotly
heatmapPlotly <- function () {
input_nGenesPlotly= 50
x <- convertedData.out
n=input_nGenesPlotly
#if(n>6000) n = 6000 # max
if(n>dim(x)[1]) n = dim(x)[1] # max as data
if(input_geneCentering)
x=as.matrix(x[1:n,])-apply(x[1:n,],1,mean)
# standardize by gene
if(input_geneNormalize)
x <- x / apply(x,1,sd)
# row centering and normalize
x <- scale(x, center = input_sampleCentering, scale = input_sampleNormalize)
cutoff = median(unlist(x)) + 3*sd (unlist(x))
x[x>cutoff] <- cutoff
cutoff = median(unlist(x)) - 3*sd (unlist(x))
x[x< cutoff] <- cutoff
# adding gene symbol
ix <- match( rownames(x), allGeneInfo.out[,1])
geneSymbols <- as.character( allGeneInfo.out$symbol)[ix]
# if missing or duplicated, use Ensembl ID
ix <- which(nchar( geneSymbols) <=1 | duplicated(geneSymbols ) | is.null(geneSymbols) | is.na(geneSymbols) ); geneSymbols[ ix ] <- rownames(x)[ix]
rownames( x) = geneSymbols;
clust <- x %>%
dist2() %>%
hclust2()
# Get order
ord <- clust$order
# Re-arrange based on order
df <- t( x[ord,] )%>%
melt()
colnames(df)[1:2] <- c("X","Y")
colorNames = unlist(strsplit(tolower(rownames(heatColors)[ as.integer(input_heatColors1) ]),"-" ) )
p <- df %>%
ggplot(aes(X, Y, fill = value)) +
geom_tile()+ scale_fill_gradient2(low = colorNames[1], mid = colorNames[2],high = colorNames[3]) +
theme(axis.title.y=element_blank(), # remove y labels
# axis.text.y=element_blank.out, # keep gene names for zooming
axis.ticks.y=element_blank(),
axis.title.x=element_blank()) +
theme(axis.text.x = element_text(size=10,angle = 45, hjust = 1))
ggplotly(p) %>%
layout(margin = list(b = 150,l=200)) # prevent cutoff of sample names
}6.6 PCA
################################################################
# PCA
################################################################
PCAplot <- function() { #PCA
x <- convertedData.out;
pca.object <- prcomp(t(x))
pcaData = as.data.frame(pca.object$x[,1:2]); pcaData = cbind(pcaData,detectGroups(colnames(x)) )
colnames(pcaData) = c("PC1", "PC2", "Sample_Name")
percentVar=round(100*summary(pca.object)$importance[2,1:2],0)
if(is.null(readSampleInfo.out)) {
p=ggplot(pcaData, aes(PC1, PC2, color=Sample_Name, shape = Sample_Name)) + geom_point(size=5)
} else {
pcaData = cbind(pcaData,readSampleInfo.out )
p=ggplot(pcaData, aes_string("PC1", "PC2", color=input_selectFactors,shape=input_selectFactors2)) + geom_point(size=5)
}
p=p+xlab(paste0("PC1: ",percentVar[1],"% variance"))
p=p+ylab(paste0("PC2: ",percentVar[2],"% variance"))
p=p+ggtitle("Principal component analysis (PCA)")+coord_fixed(ratio=1.0)+
theme(plot.title = element_text(size = 16,hjust = 0.5)) + theme(aspect.ratio=1) +
theme(axis.text.x = element_text( size = 16),
axis.text.y = element_text( size = 16),
axis.title.x = element_text( size = 16),
axis.title.y = element_text( size = 16) ) +
theme(legend.text=element_text(size=16))
print(p)
}
# variance chart
# plot(pca.object,type="bar", xlab="Principal Components", main ="Variances explained")
MDSplot <- function() { # MDS
x <- convertedData.out;
fit = cmdscale( dist2(t(x) ), eig=T, k=2)
# par(pin=c(5,5))
pcaData = as.data.frame(fit$points[,1:2]); pcaData = cbind(pcaData,detectGroups(colnames(x)) )
colnames(pcaData) = c("x1", "x2", "Sample_Name")
if(is.null(readSampleInfo.out)) {
p=ggplot(pcaData, aes(x1, x2, color=Sample_Name, shape = Sample_Name)) + geom_point(size=5)
} else {
pcaData = cbind(pcaData,readSampleInfo.out )
p=ggplot(pcaData, aes_string("x1", "x2", color=input_selectFactors,shape=input_selectFactors2)) + geom_point(size=5)
}
p=p+xlab("Dimension 1")
p=p+ylab("Dimension 2")
p=p+ggtitle("Multidimensional scaling (MDS)")+ coord_fixed(ratio=1.)+
theme(plot.title = element_text(hjust = 0.5)) + theme(aspect.ratio=1) +
theme(axis.text.x = element_text( size = 16),
axis.text.y = element_text( size = 16),
axis.title.x = element_text( size = 16),
axis.title.y = element_text( size = 16) ) +
theme(legend.text=element_text(size=16))
print(p)
}
tSNEplot <- function() { # t-SNE
x <- convertedData.out;
library(Rtsne,verbose=FALSE)
set.seed(input_tsneSeed2)
tsne <- Rtsne(t(x), dims = 2, perplexity=1, verbose=FALSE, max_iter = 400)
pcaData = as.data.frame(tsne$Y); pcaData = cbind(pcaData,detectGroups(colnames(x)) )
colnames(pcaData) = c("x1", "x2", "Sample_Name")
if(is.null(readSampleInfo.out)) {
p=ggplot(pcaData, aes(x1, x2, color=Sample_Name, shape = Sample_Name)) + geom_point(size=5)
} else {
pcaData = cbind(pcaData,readSampleInfo.out )
p=ggplot(pcaData, aes_string("x1", "x2", color=input_selectFactors,shape=input_selectFactors2)) + geom_point(size=5)
}
p=p+xlab("Dimension 1")
p=p+ylab("Dimension 2")
p=p+ggtitle("t-SNE plot")+ coord_fixed(ratio=1.)+
theme(plot.title = element_text(hjust = 0.5)) + theme(aspect.ratio=1) +
theme(axis.text.x = element_text( size = 16),
axis.text.y = element_text( size = 16),
axis.title.x = element_text( size = 16),
axis.title.y = element_text( size = 16) ) +
theme(legend.text=element_text(size=16))
print(p)
}
# read pathway data from local file, not used if using database
# Read gene sets GMT file
# This functions cleans and converts to upper case
readGMTRobust <- function (file1) { # size restriction
# Read in the first file
x <- scan(file1, what="", sep="\n")
# x <- gsub("\t\t.","",x) # GMT files saved by Excel has a lot of empty cells "\t\t\t\t" "\t." means one or more tab
x <- gsub(" ","",x) # remove white space
x <- toupper(x) # convert to upper case
#----Process the first file
# Separate elements by one or more whitespace
y <- strsplit(x, "\t")
# Extract the first vector element and set it as the list element name
names(y) <- sapply(y, `[[`, 1)
#names(y) <- sapply(y, function(x) x[[1]]) # same as above
# Remove the first vector element from each list element
y <- lapply(y, `[`, -c(1,2))
#y <- lapply(y, function(x) x[-1]) # same as above
# remove duplicated elements
for ( i in 1:length(y) ) y[[i]] <- cleanGeneSet(y[[i]])
# check the distribution of the size of gene lists sapply(y, length) hold a vector of sizes
if( max( sapply(y,length) ) <5) cat("Warning! Gene sets have very small number of genes!\n Please double check format.")
y <- y[which(sapply(y,length) > 1)] # gene sets smaller than 1 is ignored!!!
return(y)
}
readGeneSets <- function (fileName, convertedData, GO,selectOrg, myrange) {
pathway <- dbConnect(sqlite,fileName)
if(is.null(GO) ) GO <- "GOBP" # initial value not properly set; enforcing
# get Gene sets
querySet = rownames(convertedData)
sqlQuery = paste( " select distinct gene,pathwayID from pathway where gene IN ('", paste(querySet,collapse="', '"),"')" ,sep="")
# cat(paste0("\n\nhere:",GO,"There"))
if( GO != "All") sqlQuery = paste0(sqlQuery, " AND category ='",GO,"'")
result <- dbGetQuery( pathway, sqlQuery )
if( dim(result)[1] ==0) {return(list( x=as.data.frame("No matching species or gene ID file!" )) )}
# list pathways and frequency of genes
pathwayIDs = aggregate( result$pathwayID, by = list(unique.values = result$pathwayID), FUN = length)
pathwayIDs = pathwayIDs[which(pathwayIDs[,2]>= myrange[1] ),]
pathwayIDs = pathwayIDs[which( pathwayIDs[,2] <= myrange[2] ),]
if(dim(pathwayIDs)[1] ==0 ) geneSets = NULL;
# convert pathways into lists like those generated by readGMT
geneSets = lapply(pathwayIDs[,1], function(x) result[which(result$pathwayID == x ),1] )
pathwayInfo <- dbGetQuery( pathway, paste( " select distinct id,Description from pathwayInfo where id IN ('",
paste(pathwayIDs[,1],collapse="', '"), "') ",sep="") )
ix = match( pathwayIDs[,1], pathwayInfo[,1])
names(geneSets) <- pathwayInfo[ix,2]
#geneSets <- geneSets[ -which(duplicated(names(geneSets) ))] # remove geneSets with the same name
dbDisconnect(pathway)
return( geneSets )
}
# Runs pathway analysis using PGSEA; this is copied and revised from PGSEA package
myPGSEA <- function (exprs, cl, range = c(25, 500), ref = NULL, center = TRUE,
p.value = 0.005, weighted = TRUE, nPermutation=100, enforceRange = TRUE, ...) {
if (is(exprs, "ExpressionSet"))
exprs <- exprs(exprs)
if (!is.list(cl))
stop("cl need to be a list")
if (!is.null(ref)) {
if (!is.numeric(ref))
stop("column index's required")
}
if (!is.null(ref)) {
if (options.out$verbose)
cat("Creating ratios...", "\n")
ref_mean <- apply(exprs[, ref], 1, mean, na.rm = TRUE)
exprs <- sweep(exprs, 1, ref_mean, "-")
}
if (center)
exprs <- scale(exprs, scale = FALSE) # column centering is done
results <- matrix(NA, length(cl), ncol(exprs))
rownames(results) <- names(cl)
colnames(results) <- colnames(exprs)
mode(results) <- "numeric"
Setsize = c(rep(0,length(cl))) # gene set size vector
mean2 = c(rep(0,length(cl))) # mean of the range of means
meanSD = c(rep(0,length(cl))) # SD of the range of means
if (is.logical(p.value))
{ p.results <- results; mean.results <- results;}
for (i in 1:length(cl)) { # for each gene list
#cat("\nProcessing gene set",i);
if (class(cl[[i]]) == "smc") {
clids <- cl[[i]]@ids
}
else if (class(cl[[i]]) %in% c("GeneColorSet", "GeneSet")) {
clids <- cl[[i]]@geneIds
}
else {
clids <- cl[[i]]
}
if (options()$verbose)
cat("Testing region ", i, "\n")
ix <- match(clids, rownames(exprs))
ix <- unique(ix[!is.na(ix)])
present <- sum(!is.na(ix))
Setsize[i] <- present
if (present < range[1]) {
if (options()$verbose)
cat("Skipping region ", i, " because too small-",
present, ",\n")
next
}
if (present > range[2]) {
if (options()$verbose)
cat("Skipping region ", i, " because too large-",
present, "\n")
next
}
texprs <- exprs[ix, ] # expression matrix for genes in gene set
if (any(is.na(texprs)))
cat("Warning - 'NA' values within expression data, enrichment scores are estimates only.\n")
if (!is.matrix(texprs))
texprs <- as.matrix(texprs)
stat <- try(apply(texprs, 2, t.test, ...))
means <- try(apply(texprs, 2, mean,trim=0.1)) # trim mean
ps <- unlist(lapply(stat, function(x) x$p.value))
stat <- unlist(lapply(stat, function(x) x$statistic))
p.results[i, ] <- ps
mean.results[i,] <- means
results[i, ] <- as.numeric(stat)
# permutation of gene sets of the same size
if(nPermutation > 2 ) { # no permutation if <=2
meansRanges = c(0, rep(nPermutation))
for( k in 1:nPermutation ) {
ix <- sample.int( dim(exprs)[1], length(ix) )
texprs <- exprs[ix, ]
means <- try(apply(texprs, 2, mean,trim=0.1)) # trim mean
meansRanges[k] = dynamicRange(means)
}
mean2[i] = mean(meansRanges)
meanSD[i]= sd(meansRanges,na.rm=TRUE) # NA are removed before calculating standard deviation
}
}
return(list(results = results, p.results = p.results, means = mean.results, size=Setsize, mean2=mean2, meanSD=meanSD))
}
PCApathway<- function () { # pathway
x <- convertedData.out;
library(PGSEA,verbose=FALSE)
pca.object <- prcomp(t(x))
pca = 100*pca.object$rotation
Npca = 5
if (Npca > dim(pca)[2]) { Npca = dim(pca)[2] } else pca <- pca[,1:Npca]
#pca = pca[,1:5]
if(is.null(GeneSets.out ) ) return(NULL) # no species recognized
if(length(GeneSets.out ) <= 1 ) return(NULL)
#cat("\n\nGene Sets:",length( GeneSets.out))
pg = myPGSEA (pca,cl=GeneSets.out,range=c(15,2000),p.value=TRUE, weighted=FALSE,nPermutation=1)
# correcting for multiple testing
p.matrix = pg$p.result
tem = p.adjust(as.numeric(p.matrix),"fdr")
p.matrix = matrix(tem, nrow=dim(p.matrix)[1], ncol = dim(p.matrix)[2] )
rownames(p.matrix) = rownames(pg$p.result); colnames(p.matrix) = colnames(pg$p.result)
selected =c()
for( i in 1:dim(p.matrix)[2]) {
tem = which( rank(p.matrix[,i],ties.method='first') <= 5) # rank by P value
#tem = which( rank(pg$result[,i],ties.method='first') >= dim(p.matrix)[1]-3.1) # rank by mean
names(tem) = paste("PC",i," ", rownames(p.matrix)[tem], sep="" )
selected = c(selected, tem)
}
rowids = gsub(" .*","",names(selected))
rowids = as.numeric( gsub("PC","",rowids) )
pvals = p.matrix[ cbind(selected,rowids) ]
a=sprintf("%-1.0e",pvals)
tem = pg$result[selected,]
rownames(tem) = paste(a,names(selected)); #colnames(tem)= paste("PC",colnames(tem),sep="")
tem = tem[!duplicated(selected),]
#tem = t(tem); tem = t( (tem - apply(tem,1,mean)) ) #/apply(tem,1,sd) )
smcPlot(tem,scale = c(-max(tem), max(tem)), show.grid = T, margins = c(3,1, 6, 23), col = .rwb,cex.lab=0.5, main="Pathways analysis on PCA")
}
# correlation PCA with factors
PCA2factor <- function( ){
if(is.null(readSampleInfo.out)) return(NULL)
npc = 5 # number of Principal components
x <- readData.out$data
y <- readSampleInfo.out
pca.object <- prcomp(t(x))
pcaData = as.data.frame(pca.object$x[,1:npc]);
pvals = matrix(1,nrow=npc,ncol=ncol(y))
for (i in 1:npc ){
for (j in 1:ncol(y) )
pvals[i,j] =summary( aov(pcaData[,i] ~ as.factor(y[,j])))[[1]][["Pr(>F)"]][1]
}
pvals = pvals * npc* ncol(y) # correcting for multiple testing
pvals[pvals>1] = 1
colnames(pvals) = colnames(y)
rownames(pvals) = paste0("PC",1:npc)
a="\n Correlation between Principal Components (PCs) with factors\n"
nchar0 = nchar(a)
for ( i in 1:npc){
j = which.min(pvals[i,])
if(pvals[i,j]< 0.05) a=paste0(a,rownames(pvals)[i],
" is correlated with ", colnames(pvals)[j],
" (p=", sprintf("%-3.2e",pvals[i,j]),").\n")
}
if(nchar(a) == nchar0 ) return(NULL) else
return( a )
}
# detecting sequencing depth bias
readCountsBias <- function( ){
totalCounts = colSums(readData.out$rawCounts)
groups = as.factor( detectGroups(colnames(readData.out$rawCounts ) ) )
tem = NULL
# ANOVA of total read counts vs sample groups parsed by sample name
pval = summary( aov(totalCounts ~ groups ))[[1]][["Pr(>F)"]][1]
if(pval <0.05)
tem = paste("Warning! Sequencing depth bias detected. Total read counts are significantly different among sample groups (p=",
sprintf("%-3.2e",pval),") based on ANOVA.")
# ANOVA of total read counts vs factors in experiment design
if(!is.null(readSampleInfo.out ) ) {
y <- readSampleInfo.out
for (j in 1:ncol(y) ) {
pval = summary( aov(totalCounts ~ as.factor(y[,j])))[[1]][["Pr(>F)"]][1]
if(pval <0.05)
tem = paste(tem, " Total read counts seem to be correlated with factor",colnames(y)[j],
"(p=", sprintf("%-3.2e",pval),"). ")
}
}
if(is.null(tem)) return( "No bias detected") else
return( tem )
}6.7 K-means clustering
################################################################
# K-means clustering
################################################################
#Distribution of SDs
distributionSD <- function() {
SDs=apply(convertedData.out,1,sd)
maxSD = mean(SDs)+ 4*sd(SDs)
SDs[ SDs > maxSD] = maxSD
top = input_nGenesKNN
if(top > length(SDs)) top = length(SDs)
Cutoff=sort(SDs,decreasing=TRUE)[top]
SDs = as.data.frame(SDs)
p <- ggplot(SDs, aes(x=SDs)) +
geom_density(color="darkblue", fill="lightblue") +
labs(x = "Standard deviations of all genes", y="Density")+
geom_vline(aes(xintercept=Cutoff),
color="red", linetype="dashed", size=1) +
annotate("text", x = Cutoff + 0.4*sd(SDs[,1]), y = 1,colour = "red", label = paste0("Top ", top))
p
}
# Decide number of clusters
KmeansNclusters <- function() { # Kmeans clustering
x <- convertedData.out
#x <- readData.out
#par(mfrow=c(1,2))
n=input_nGenesKNN
#if(n>6000) n = 6000 # max
if(n>dim(x)[1]) n = dim(x)[1] # max as data
#x1 <- x;
#x=as.matrix(x[1:n,])-apply(x[1:n,],1,mean)
x = 100* x[1:n,] / apply(x[1:n,],1,sum) # this is causing problem??????
#x = x - apply(x,1,mean) # this is causing problem??????
#colnames(x) = gsub("_.*","",colnames(x))
set.seed(2)
k = 30
wss <- (nrow(x)-1)*sum(apply(x,2,var))
for (i in 2:k) wss[i] <- sum(kmeans(x,centers=i,iter.max = 30)$withinss)
par(mar=c(4,5,4,4))
plot(1:k, wss, type="b", xlab="Number of Clusters (k)",
ylab="Within groups sum of squares",
cex=2,cex.lab=2, cex.axis=2, cex.main=2, cex.sub=2 ,xaxt="n" )
axis(1, at = seq(1, 30, by = 2),cex.axis=1.5,cex=1.5)
}
Kmeans <- function() { # Kmeans clustering
x <- convertedData.out
#x <- readData.out
#par(mfrow=c(1,2))
n=input_nGenesKNN
if(n>maxGeneClustering) n = maxGeneClustering # max
if(n>dim(x)[1]) n = dim(x)[1] # max as data
#x1 <- x;
#x=as.matrix(x[1:n,])-apply(x[1:n,],1,mean)
#x = 100* x[1:n,] / apply(x[1:n,],1,sum)
x = x[1:n,]
if( input_kmeansNormalization == 'L1Norm')
x = 100* x / apply(x,1,function(y) sum(abs(y))) else # L1 norm
if( input_kmeansNormalization == 'geneMean')
x = x - apply(x,1,mean) else # this is causing problem??????
if( input_kmeansNormalization == 'geneStandardization')
x = (x - apply(x,1,mean) ) / apply(x,1,sd)
#colnames(x) = gsub("_.*","",colnames(x))
set.seed(input_KmeansReRun)
k=input_nClusters
cl = kmeans(x,k,iter.max = 50)
#myheatmap(cl$centers)
hc <- hclust2(dist2(cl$centers-apply(cl$centers,1,mean) ) )# perform cluster for the reordering of samples
tem = match(cl$cluster,hc$order) # new order
x = x[order(tem),] ; bar = sort(tem)
#myheatmap2(x-apply(x,1,mean), bar,1000)
return(list( x = x, bar = bar))
}
myheatmap2 <- function (x,bar=NULL,n=-1,mycolor=1,clusterNames=NULL,sideColors=NULL ) {
# number of genes to show
ngenes = as.character( table(bar))
if(length(bar) >n && n != -1) {ix = sort( sample(1:length(bar),n) ); bar = bar[ix]; x = x[ix,] }
if(! is.null(bar) )
if(is.null(sideColors) )
sideColors = mycolors
# this will cutoff very large values, which could skew the color
x=as.matrix(x)-apply(x,1,mean)
cutoff = median(unlist(x)) + 3*sd (unlist(x))
x[x>cutoff] <- cutoff
cutoff = median(unlist(x)) - 3*sd (unlist(x))
x[x< cutoff] <- cutoff
#colnames(x)= detectGroups(colnames(x))
if(is.null(bar)) # no side colors
heatmap.2(x, Rowv =F,Colv=F, dendrogram ="none",
col=heatColors[as.integer(mycolor),], density.info="none", trace="none", scale="none", keysize=.3
,key=F, labRow = F
#,RowSideColors = mycolors[bar]
,margins = c(8, 24)
,srtCol=45
) else
heatmap.2(x, Rowv =F,Colv=F, dendrogram ="none",
col=heatColors[as.integer(mycolor),], density.info="none", trace="none", scale="none", keysize=.3
,key=F, labRow = F
,RowSideColors = sideColors[bar]
,margins = c(8, 24)
,srtCol=45
)
if(!is.null(bar)) {
legend.text = paste("Cluster ", toupper(letters)[unique(bar)], " (N=", ngenes,")", sep="")
if( !is.null( clusterNames ) && length(clusterNames)>= length( unique(bar) ) )
legend.text = paste(clusterNames[ 1:length( unique(bar) ) ], " (N=", ngenes,")", sep="")
par(lend = 1) # square line ends for the color legend
legend("topright", # location of the legend on the heatmap plot
legend = legend.text, # category labels
col = sideColors, # color key
lty= 1, # line style
lwd = 10 ) # line width
}
}
KmeansHeatmap <- function() { # Kmeans clustering
myheatmap2(Kmeans.out$x-apply(Kmeans.out$x,1,mean), Kmeans.out$bar,1000,mycolor=input_heatColors1)
}
tSNEgenePlot <- function() {
Cluster <- Kmeans.out$bar
train <- as.data.frame( cbind(Cluster,Kmeans.out$x) )
library(Rtsne,verbose=FALSE)
train = unique(train)
Cluster = train$Cluster
set.seed(input_seedTSNE)
## Executing the algorithm on curated data
tsne <- Rtsne(train[,-1], dims = 2, perplexity=30, verbose=FALSE, max_iter = 400)
nClusters = length(unique(Cluster) )
if(input_colorGenes) {
plot(tsne$Y[,1], tsne$Y[,2], pch = (0:(nClusters-1))[Cluster], cex = 1.,col = mycolors[Cluster], xlab="X",ylab="Y")
legend("topright",toupper(letters)[1:nClusters], pch = 0:(nClusters-1), col=mycolors, title="Cluster" )
} else
plot(tsne$Y[,1], tsne$Y[,2], cex = 1., xlab="X",ylab="Y")
}
# Main function. Find a query set of genes enriched with functional category
FindOverlap <- function (converted,gInfo, GO,selectOrg,minFDR, reduced = FALSE) {
maxTerms =15 # max number of enriched terms
idNotRecognized = as.data.frame("ID not recognized!")
if(is.null(converted) ) return(idNotRecognized) # no ID
# only coding
gInfo <- gInfo[which( gInfo$gene_biotype == "protein_coding"),]
querySet <- intersect( converted$IDs, gInfo[,1]);
if(length(querySet) == 0) return(idNotRecognized )
ix = grep(converted$species[1,1],gmtFiles)
totalGenes <- converted$species[1,7]
if (length(ix) == 0 ) {return(idNotRecognized )}
# If selected species is not the default "bestMatch", use that species directly
if(selectOrg != speciesChoice[[1]]) {
ix = grep(findSpeciesById(selectOrg)[1,1], gmtFiles )
if (length(ix) == 0 ) {return(idNotRecognized )}
totalGenes <- orgInfo[which(orgInfo$id == as.numeric(selectOrg)),7]
}
pathway <- dbConnect(sqlite,gmtFiles[ix],flags=SQLITE_RO)
sqlQuery = paste( " select distinct gene,pathwayID from pathway where gene IN ('", paste(querySet,collapse="', '"),"')" ,sep="")
#cat(paste0("HH",GO,"HH") )
if( GO != "All") sqlQuery = paste0(sqlQuery, " AND category ='",GO,"'")
result <- dbGetQuery( pathway, sqlQuery )
if( dim(result)[1] ==0) {return(as.data.frame("No matching species or gene ID file!" )) }
# given a pathway id, it finds the overlapped genes, symbol preferred
sharedGenesPrefered <- function(pathwayID) {
tem <- result[which(result[,2]== pathwayID ),1]
ix = match(tem, converted$conversionTable$ensembl_gene_id) # convert back to original
tem2 <- unique( converted$conversionTable$User_input[ix] )
if(length(unique(gInfo$symbol) )/dim(gInfo)[1] >.7 ) # if 70% genes has symbol in geneInfo
{ ix = match(tem, gInfo$ensembl_gene_id);
tem2 <- unique( gInfo$symbol[ix] ) }
return( paste( tem2 ,collapse=" ",sep="") )}
x0 = table(result$pathwayID)
x0 = as.data.frame( x0[which(x0>=Min_overlap)] )# remove low overlaps
if(dim(x0)[1] <= 5 ) return(idNotRecognized) # no data
colnames(x0)=c("pathwayID","overlap")
pathwayInfo <- dbGetQuery( pathway, paste( " select distinct id,n,Description from pathwayInfo where id IN ('",
paste(x0$pathwayID,collapse="', '"), "') ",sep="") )
x = merge(x0,pathwayInfo, by.x='pathwayID', by.y='id')
x$Pval=phyper(x$overlap-1,length(querySet),totalGenes - length(querySet),as.numeric(x$n), lower.tail=FALSE );
x$FDR = p.adjust(x$Pval,method="fdr")
x <- x[ order( x$FDR) ,] # sort according to FDR
if(dim(x)[1] > maxTerms ) x = x[1:maxTerms,]
if(min(x$FDR) > minFDR) x=as.data.frame("No significant enrichment found!") else {
x <- x[which(x$FDR < minFDR),]
x= cbind(x,sapply( x$pathwayID, sharedGenesPrefered ) )
colnames(x)[7]= "Genes"
x <- subset(x,select = c(FDR,overlap,n,description,Genes) )
colnames(x) = c("Corrected P value (FDR)", "Genes in list", "Total genes in category","Functional Category","Genes" )
# remove redudant gene sets
if(reduced != FALSE ){ # reduced=FALSE no filtering, reduced = 0.9 filter sets overlap with 90%
n= nrow(x)
tem=rep(TRUE,n )
geneLists = lapply(x$Genes, function(y) unlist( strsplit(as.character(y)," " ) ) )
for( i in 2:n)
for( j in 1:(i-1) ) {
if(tem[j]) { # skip if this one is already removed
commonGenes = length(intersect(geneLists[i] ,geneLists[j] ) )
if( commonGenes/ length(geneLists[j] ) > reduced )
tem[i] = FALSE
}
}
x <- x[which(tem),]
}
}
dbDisconnect(pathway)
return(x )
}
# Given a gene set, finds significant overlaps with a gene set database object
findOverlapGMT <- function ( query, geneSet, minFDR=.2 ,minSize=2,maxSize=10000 ){
#geneSets <- readGMT("exampleData/MousePath_TF_gmt.gmt")
#query <- geneSets[['TF_MM_FRIARD_C-REL']]
#query <- query[1:60]
total_elements = 30000
Min_overlap <- 1
maxTerms =10 # max number of enriched terms
noSig <- as.data.frame("No significant enrichment found!")
query <- cleanGeneSet(query) # convert to upper case, unique()
if(length(query) <=2) return(noSig)
if(length(geneSet) <1) return(noSig)
geneSet <- geneSet[which(sapply(geneSet,length) > minSize)] # gene sets smaller than 1 is ignored!!!
geneSet <- geneSet[which(sapply(geneSet,length) < maxSize)] # gene sets smaller than 1 is ignored!!!
result <- unlist( lapply(geneSet, function(x) length( intersect (query, x) ) ) )
result <- cbind(unlist( lapply(geneSet, length) ), result )
result <- result[ which(result[,2]>Min_overlap), ,drop=F]
if(dim(result)[1] == 0) return( noSig)
xx <- result[,2]
mm <- length(query)
nn <- total_elements - mm
kk <- result[,1]
Pval_enrich=phyper(xx-1,mm,nn,kk, lower.tail=FALSE );
FDR <- p.adjust(Pval_enrich,method="fdr",n=length(geneSet) )
result <- as.data.frame(cbind(FDR,result))
result <- result[,c(1,3,2)]
result$pathway = rownames(result)
result$Genes = "" # place holder just
colnames(result)= c("Corrected P value (FDR)", "Genes in list", "Total genes in category","Functional Category","Genes" )
result <- result[ which( result[,1] < minFDR),,drop=F]
if( dim( result)[1] == 0) return(noSig)
if(min(FDR) > minFDR) return(noSig)
result <- result[order(result[,1] ),]
if(dim(result)[1] > maxTerms ) result <- result[1:maxTerms,]
return( result)
}
KmeansGO <- function() {
pp=0
minFDR = 0.01
for( i in 1:input_nClusters) {
query = rownames(Kmeans.out$x)[which(Kmeans.out$bar == i)]
result <- findOverlapGMT( query, GeneSets.out,1)
if( dim(result)[2] ==1) next; # result could be NULL
result$Genes = toupper(letters)[i]
if (pp==0 ) { results <- result; pp <- 1;
} else {
results <- rbind(results,result)
}
}
if(pp == 0) return( as.data.frame("No enrichment found."))
results= results[,c(5,1,2,4)]
colnames(results)= c("Cluster","FDR","Genes","Pathways")
if(min(results$FDR) > minFDR ) results = as.data.frame("No signficant enrichment found.") else
results = results[which(results$FDR < minFDR),]
if( is.null(results) ) return ( as.matrix("No significant enrichment.") )
if( class(results) != "data.frame") return ( as.matrix("No significant enrichment.") )
if( dim(results)[2] ==1) return ( as.matrix("No significant enrichment.") )
colnames(results)[2] = "adj.Pval"
results$Genes <- as.character(results$Genes)
results$Cluster[which( duplicated(results$Cluster) ) ] <- ""
results
}6.8 Differential expression
################################################################
# Differential expression
################################################################
# Differential expression using LIMMA
DEG.limma <- function (x, maxP_limma=.1, minFC_limma=2, rawCounts,countsDEGMethods,priorCounts, dataFormat, selectedComparisons=NULL, sampleInfo = NULL,modelFactors=NULL, blockFactor = NULL){
library(limma,verbose=FALSE) # Differential expression
library(statmod,verbose=FALSE)
# many different situations: 1. just use sample names 2. just one factor 3. two factors no interaction
# 4. two factors with interaction 5. block factor
topGenes = list(); limmaTrend = FALSE
if( dataFormat == 2) { # if normalized data
eset = new("ExpressionSet", exprs=as.matrix(x)) } else { # counts data
if (countsDEGMethods == 1 ) { # limma-trend method selected for counts data
#dge <- DGEList(counts=rawCounts);
#dge <- calcNormFactors(dge, method = "TMM")
#eset <- cpm(dge, log=TRUE, prior.count=priorCounts)
eset = new("ExpressionSet", exprs=as.matrix(x)) # use transformed data for limma
limmaTrend = TRUE
}
}
groups = colnames(x)
groups = detectGroups( groups)
g = unique(groups)
# check for replicates, removes samples without replicates
reps = as.matrix(table(groups)) # number of replicates per biological sample
if ( sum( reps[,1] >= 2) <2 ) # if less than 2 samples with replicates
return( list(results= NULL, comparisons = NULL, Exp.type="Failed to parse sample names to define groups.
Cannot perform DEGs and pathway analysis. Please double check column names! Use WT_Rep1, WT_Rep2 etc. ", topGenes=NULL))
# remove samples without replicates
g <- rownames(reps)[which(reps[,1] >1)]
ix <- which( groups %in% g)
groups <- groups[ix]
x<- x[,ix]; rawCounts <- rawCounts[,ix]
if(length(g) ==2 ) {
g= unique(groups)
comparisons <- paste(g[2],"-",g[1],sep="") # "Mutant-WT"
# no sample file, but user selected comparisons using column names
if( is.null(modelFactors) & length( selectedComparisons) >0 )
comparisons = selectedComparisons
design <- model.matrix(~0+groups)
colnames(design) <- g
if( !is.null(rawCounts) && countsDEGMethods == 2) { # voom
dge <- DGEList(counts=rawCounts);
dge <- calcNormFactors(dge, method = "TMM") # normalization
v <- voom(dge, design); fit <- lmFit(v, design) } else
fit <- lmFit(eset, design) # regular limma
cont.matrix <- makeContrasts(contrasts=comparisons, levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2, trend=limmaTrend)
# calls differential gene expression 1 for up, -1 for down
results <- decideTests(fit2, p.value=maxP_limma, lfc=log2(minFC_limma) )
#vennDiagram(results,circle.col=rainbow(5))
topGenes1 =topTable(fit2, number = 1e12,sort.by="M" )
if (dim(topGenes1)[1] != 0) {
topGenes1 = topGenes1[,c('logFC','adj.P.Val')]
# topGenes1[,1] <- -1* topGenes1[,1] # reverse direction
topGenes[[1]] <- topGenes1 }
# log fold change is actually substract of means. So if the data is natral log transformed, it shoudl be natral log.
Exp.type = "2 sample groups."
}
if(length(g) > 2 ) { # more than two sample groups
design <- model.matrix(~ 0+factor(groups))
colnames(design) <- gsub(".*)","",colnames(design))
if( !is.null(rawCounts) && countsDEGMethods == 2) { # voom
v <- voom(rawCounts, design);
fit <- lmFit(v, design)
} else
fit <- lmFit(eset, design)
fit <- eBayes(fit, trend=limmaTrend)
comparisons = ""
for( i in 1:(length(g)-1) )
for (j in (i+1):length(g))
comparisons = c(comparisons,paste(g[j],"-",g[i],sep="" ) )
comparisons <- comparisons[-1]
# no sample file, but user selected comparisons using column names
if( is.null(modelFactors) & length( selectedComparisons) >0 )
comparisons = selectedComparisons
contrast1 <- makeContrasts(contrasts=comparisons[1], levels=design)
if(length(comparisons)>1 )
for( kk in 2:length(comparisons) )
contrast1<- cbind(contrast1,makeContrasts(contrasts=comparisons[kk], levels=design) )
Exp.type = paste(length(g)," sample groups detected.")
# if factorial design 2x2, 2x3, 3x5 etc.
# all samples must be something like WT_control_rep1
if ( sum(sapply(strsplit(g,"_"),length) == 2 ) == length(g) ) {
#comparisons
comparisons = ""
for( i in 1:(length(g)-1) )
for (j in (i+1):length(g))
if( strsplit(g[i],"_")[[1]][1] == strsplit(g[j],"_")[[1]][1]| strsplit(g[i],"_")[[1]][2] == strsplit(g[j],"_")[[1]][2]) # only compare WT_control vs. WT_treatment
comparisons = c(comparisons,paste(g[j],"-",g[i],sep="" ) )
comparisons <- comparisons[-1]
#factors genotype treatment levels
extract_treatment <- function (x) paste( gsub( ".*_","",unlist( strsplit(x,"-")) ), collapse="-")
extract_genotype <- function (x) gsub( "_.*","",unlist( strsplit(x,"-")) )[1]
extract_treatment_counting <- unique( gsub( ".*_","",unlist( strsplit(g,"-")) ))
treatments = sapply(comparisons, extract_treatment)
genotypes = sapply(comparisons, extract_genotype)
Exp.type = paste( Exp.type, "\nFactorial design:",length(unique(genotypes)),"X", length( extract_treatment_counting ), sep="" )
# pairwise contrasts
contrast1 <- makeContrasts(contrasts=comparisons[1], levels=design)
for( kk in 2:length(comparisons) )
contrast1<- cbind(contrast1,makeContrasts(contrasts=comparisons[kk], levels=design) )
contrast.names = colnames(contrast1)
# interaction contrasts
for ( kk in 1:(length(comparisons)-1) ) {
for( kp in (kk+1):length(comparisons))
if( treatments[kp]== treatments[kk] )
{
contrast1 = cbind(contrast1, contrast1[,kp]- contrast1[,kk] )
contrast.names = c(contrast.names, paste("I:", genotypes[kp], "-", genotypes[kk],"(",gsub("-",".vs.",treatments[kp]),")",sep="" ) )
}
}
colnames(contrast1)=contrast.names
comparisons = contrast.names
} # if interaction terms
# if sample information is uploaded and user selected factors and comparisons
if( !is.null(modelFactors) & length( selectedComparisons) >0 ) {
Exp.type = paste("Model: ~", paste(modelFactors,collapse=" + ") )
interactionTerm = FALSE # default value to be re-write if needed
# model factors that does not contain interaction terms
# modelFactors "genotype", "condition", "genotype:condition"
keyModelFactors = modelFactors[ !grepl(":",modelFactors) ]
# "genotype: control vs. mutant"
factorsVector= gsub(":.*","",selectedComparisons) # corresponding factors for each comparison
# remove factors not used in comparison, these are batch effects/pairs/blocks,
# keyModelFactors = keyModelFactors[ !is.na(match(keyModelFactors, factorsVector)) ]
# if a factor is selected both in block and main factors, then use it as block factor
keyModelFactors = keyModelFactors[ is.na(match(keyModelFactors, blockFactor)) ]
#------Design matrix
sampleInfo2 = sampleInfo[,keyModelFactors,drop=F] # remove factors not used.
groups = apply(sampleInfo2,1, function(x) paste(x,collapse="_") )
g = unique(groups)
design <- model.matrix(~ 0+factor(groups))
colnames(design) <- gsub(".*)","",colnames(design))
if( !is.null(rawCounts) && countsDEGMethods == 2) { # voom
v <- voom(rawCounts, design);
fit <- lmFit(v, design)
} else
fit <- lmFit(eset, design)
fit <- eBayes(fit, trend=limmaTrend)
#-----------Making comaprisons
if( length(keyModelFactors) != 2 | length(blockFactor) >1 ) { # if only one factor, or more than two then use all pairwise comparisons
comparisons = gsub(".*: ","",selectedComparisons)
comparisons = gsub(" vs\\. ","-",comparisons)
} else if( length(keyModelFactors) == 2 ){ # two key factors
if( sum(grepl(":",modelFactors) >0) ) { # interaction?
interactionTerm =TRUE
# all pairwise comparisons
comparisons = ""
for( i in 1:(length(g)-1) )
for (j in (i+1):length(g))
if( strsplit(g[i],"_")[[1]][1] == strsplit(g[j],"_")[[1]][1]| strsplit(g[i],"_")[[1]][2] == strsplit(g[j],"_")[[1]][2]) # only compare WT_control vs. WT_treatment
comparisons = c(comparisons,paste(g[j],"-",g[i],sep="" ) )
comparisons <- comparisons[-1]
# pairwise contrasts
contrast1 <- makeContrasts(contrasts=comparisons[1], levels=design)
if(length(comparisons)>1 )
for( kk in 2:length(comparisons) )
contrast1<- cbind(contrast1,makeContrasts(contrasts=comparisons[kk], levels=design) )
contrast.names = colnames(contrast1)
# all possible interactions
# interaction contrasts
contrast2 = NULL
contrast.names =""
for ( kk in 1:(dim(contrast1)[2]-1) ) {
for( kp in (kk+1):dim(contrast1)[2])
#if( treatments[kp]== treatments[kk] )
{ if(is.null(contrast2))
contrast2 = contrast1[,kp]- contrast1[,kk] else
contrast2 = cbind(contrast2, contrast1[,kp]- contrast1[,kk] )
contrast.names = c(contrast.names, paste0("I:", colnames(contrast1)[kp], ".vs.", colnames(contrast1)[kk] ) )
}
}
colnames(contrast2)=contrast.names[-1]
# remove nonsense contrasts from interactions
contrast2 = contrast2[,which(apply(abs(contrast2),2,max)==1),drop=F]
contrast2 = contrast2[,which(apply(abs(contrast2),2,sum)==4),drop=F]
contrast2 = t( unique(t(contrast2)) ) # remove duplicate columns
# remove unwanted contrasts involving more than three levels in either factor
keep= c()
for( i in 1:dim(contrast2)[2]) {
tem = rownames(contrast2)[ contrast2[ ,i ] != 0 ]
tem1 = unique ( unlist(gsub("_.*","", tem) ) )
tem2 = unique ( unlist(gsub(".*_","", tem) ) )
if( length(tem1) == 2 & length(tem2) ==2 )
keep = c(keep, colnames(contrast2) [i] )
}
contrast2 = contrast2[,keep,drop=F]
comparisons2 = colnames(contrast2)
}
# "stage: MN vs. EN" --> c("MN_AB-EN_AB", "EN_Nodule-EN_AB")
# comparisons in all levels of the other factor
transformComparisons <- function (comparison1){
tem = gsub(".*: ","",comparison1)
tem = unlist(strsplit(tem, " vs\\. ") ) # control mutant
factor1= gsub(":.*","",comparison1)
ix = match(factor1, keyModelFactors) # 1: first factor, 2: 2nd factor
otherFactor = keyModelFactors[3-ix] # 3-1 = 2; 3-1=1
otherFactorLevels = unique( sampleInfo2[,otherFactor] )
comparisons = c( )
for (factorLevels in otherFactorLevels) {
if( ix == 1){
comparisons = c( comparisons, paste(paste0( tem, "_",factorLevels),collapse="-") )
} else {
comparisons = c( comparisons, paste(paste0(factorLevels, "_", tem),collapse="-") )
}
}
return(comparisons)
}
comparisons = unlist( sapply(selectedComparisons, transformComparisons ))
comparisons = as.vector(comparisons)
} # two factors
# make contrasts
contrast1 <- makeContrasts(contrasts=comparisons[1], levels=design)
if(length(comparisons) >1 )
for( kk in 2:length(comparisons) )
contrast1<- cbind(contrast1,makeContrasts(contrasts=comparisons[kk], levels=design) )
if( interactionTerm ) { # if interaction terms
contrast1 = cbind(contrast1,contrast2)
contrast.names = c(colnames(contrast1), colnames(contrast2) )
comparisons = c(comparisons,comparisons2)
}
# block design to remove batch effect or paired samples
# corfit <- duplicateCorrelation(eset,design,block=targets$Subject)
# corfit$consensus
#Then this inter-subject correlation is input into the linear model fit:
# fit <- lmFit(eset,design,block=targets$Subject,correlation=corfit$consensus)
if(length(blockFactor) >= 1 ) { # if a factor is selected as block
if(length(blockFactor) >= 1 )
blockFactor = blockFactor[1] # if multiple use the first one
block = sampleInfo[, blockFactor] # the column not used
if( !is.null(rawCounts) && countsDEGMethods == 2) { # voom
v <- voom(rawCounts, design);
corfit <- duplicateCorrelation(v,design,block= block)
fit <- lmFit(v, design,block=block,correlation=corfit$consensus)
} else {
corfit <- duplicateCorrelation(eset,design,block= block)
fit <- lmFit(eset, design,block=block,correlation=corfit$consensus)
}
fit <- eBayes(fit, trend=limmaTrend)
} # block factors
} # use selected factors
fit2 <- contrasts.fit(fit, contrast1)
fit2 <- eBayes(fit2, trend=limmaTrend)
#topTable(fit2, coef=1, adjust="BH")
results <- decideTests(fit2, p.value=maxP_limma, lfc= log2(minFC_limma ))
#vennDiagram(results[,1:5],circle.col=rainbow(5))
#colnames(results) = unlist(sapply( colnames(results), changeNames ) )
#comparisons3 <- unlist(sapply( comparisons, changeNames ) )
# extract fold change for each comparison
# there is issues with direction of foldchange. Sometimes opposite
top <- function (comp) {
tem <- topTable(fit2, number = 1e12,coef=comp,sort.by="M" )
if(dim(tem)[1] == 0) { return (1)
} else {
# compute fold change for the first gene (ranked by absolute value)
tem2 = as.numeric( x[ which(rownames(x)== rownames(tem)[1]) , ] )
names(tem2) = colnames(x)
return( tem[,c(1,5)])
}
} # no significant gene returns 1, otherwise a data frame
topGenes <- lapply(comparisons, top)
topGenes <- setNames(topGenes, comparisons )
ix <- which( unlist( lapply(topGenes, class) ) == "numeric")
if( length(ix)>0) topGenes <- topGenes[ - ix ]
# if (length(topGenes) == 0) topGenes = NULL;
}
#cat("\n", names(topGenes) )
#cat("\n", colnames(results))
#cat("\n", comparisons3)
#comparisons <- comparisons3
# it does not make any sense! comparisons can not be changed!
#comparisons =comparisons3
#cat("\n", comparisons)
return( list(results= results, comparisons=comparisons, Exp.type=Exp.type, topGenes=topGenes))
}
# Differential expression using DESeq2
DEG.DESeq2 <- function ( rawCounts,maxP_limma=.05, minFC_limma=2, selectedComparisons=NULL, sampleInfo = NULL,modelFactors=NULL, blockFactor = NULL, referenceLevels=NULL){
library(DESeq2,verbose=FALSE) # count data analysis
groups = as.character ( detectGroups( colnames( rawCounts ) ) )
g = unique(groups)# order is reversed
# check for replicates, removes samples without replicates
reps = as.matrix(table(groups)) # number of replicates per biological sample
if ( sum( reps[,1] >= 2) <2 ) # if less than 2 samples with replicates
return( list(results= NULL, comparisons = NULL, Exp.type="Failed to parse sample names to define groups.
Cannot perform DEGs and pathway analysis. Please double check column names! Use WT_Rep1, WT_Rep2 etc. ", topGenes=NULL))
# remove samples without replicates
g <- rownames(reps)[which(reps[,1] >1)]
ix <- which( groups %in% g)
groups <- groups[ix]
rawCounts <- rawCounts[,ix]
Exp.type = paste(length(g)," sample groups detected.")
comparisons = ""
for( i in 1:(length(g)-1) )
for (j in (i+1):length(g))
comparisons = c(comparisons,paste(g[j],"-",g[i],sep="" ) )
comparisons <- comparisons[-1]
colData = cbind(colnames(rawCounts), groups )
# no sample file, but user selected comparisons using column names
if( is.null(modelFactors) & length( selectedComparisons) >0 )
comparisons = selectedComparisons
comparisons2 = comparisons # this is for showing comparison names, which might be different from internally
# Set up the DESeqDataSet Object and run the DESeq pipeline
dds = DESeqDataSetFromMatrix(countData=rawCounts,
colData=colData,
design=~groups)
if( is.null(modelFactors) )
dds = DESeq(dds) else
{ # using selected factors and comparisons
# build model
modelFactors = c(modelFactors,blockFactor) # block factor is just added in.
factors = modelFactors # selected factors and interactions: c( "strain", "treatment", "strain:treatment")
factors = factors[ !grepl(":",factors )] # non-interaction terms
# interaction terms like strain:treatment
Interactions = modelFactors[ grepl(":",modelFactors )]
colData = sampleInfo
factorsCoded = toupper(letters )[1: dim(colData)[2] ] # Factors are encoded as "A", "B", "C"; this avoid illigal letters
names(factorsCoded) = colnames(colData) # this is for look up; each column of sampleInfo
colnames(colData) = factorsCoded # all columns named A B C D
colData = as.data.frame(colData)
# set reference levels for factors
if(! is.null( referenceLevels) ) { # c("genotype:wt", "treatment:control" )
# first factor
for ( refs in referenceLevels)
if(! is.null( refs) ) {
ix = match(gsub(":.*","",refs), colnames(sampleInfo) ) # corresponding column id for factor
colData[,ix] = as.factor( colData[,ix] )
colData[,ix] = relevel(colData[,ix],gsub(".*:","",refs) )
}
}
# base model
DESeq2.Object= paste("dds = DESeqDataSetFromMatrix(countData=rawCounts, colData=colData, design=~ ",
paste( factorsCoded[factors],collapse="+")) # only use selected factors
Exp.type = paste("Model: ~", paste(modelFactors,collapse=" + ") )
# create model
if( length(Interactions)>0 ) { # if there is interaction
for( interactionTerms in Interactions) {
interactingFactors = unlist(strsplit(interactionTerms,":" ) ) # split strain:treatment as "strain" and "mutant"
tem = paste(factorsCoded [ interactingFactors ],collapse=":") # convert "strain:mutant" to "A:B"
DESeq2.Object = paste(DESeq2.Object, " + ",tem)
}
}
DESeq2.Object= paste( DESeq2.Object, ")") # ends the model
eval(parse(text = DESeq2.Object) )
dds = DESeq(dds) # main function
# comparisons
# "group: control vs. mutant"
comparisons = gsub(".*: ","",selectedComparisons)
comparisons = gsub(" vs\\. ","-",comparisons)
factorsVector= gsub(":.*","",selectedComparisons) # corresponding factors for each comparison
# comparison2 holds names for display with real factor names
# comparison is used in calculation it is A, B, C for factors
comparisons2 = comparisons
#comparisons2 = gsub(" vs\\. ","-",selectedComparisons)
#comparisons2 = gsub(":","_",comparisons2)
# Note that with interaction terms, not all meaningful comparisons is listed for selection.
# this is complex. Only under reference level.
# comparisons due to interaction terms
if( length(Interactions)>0 ) { # if there is interaction
interactionComparisons = resultsNames(dds)
interactionComparisons = interactionComparisons[ grepl("\\.",interactionComparisons ) ]
comparisons = c(comparisons,interactionComparisons )
# translate comparisons generated in interaction terms back to real factor names
interactionComparisons2 = interactionComparisons
for ( i in 1:length(interactionComparisons2 ) ) {
tem = unlist(strsplit(interactionComparisons2[i],"\\." ) )
tem_factors = substr(tem,1,1)
tem_factors[1] = names(factorsCoded)[factorsCoded == tem_factors[1]] # get the first letter and translate into real factor names
tem_factors[2] = names(factorsCoded)[factorsCoded == tem_factors[2]] # get the 2nd letters and translate into real factor names
interactionComparisons2[i] <- paste0( "I:",tem_factors[1], "_",substr(tem[1],2,nchar(tem[1]) ),".",
tem_factors[2], "_",substr(tem[2],2,nchar(tem[2]) )
)
}
comparisons2 = c(comparisons2,interactionComparisons2 )
}
} # if selected factors
# extract contrasts according to comprisons defined above
result1 = NULL; allCalls = NULL;
topGenes = list(); pk = 1 # counter
pp=0 # first results?
for( kk in 1:length(comparisons) ) {
tem = unlist( strsplit(comparisons[kk],"-") )
if(is.null(modelFactors)) # if just group comparison using sample names
selected = results(dds, contrast=c("groups", tem[1], tem[2]) ) else {
if(!grepl("\\.", comparisons[kk] ) ) # if not interaction term: they contain . interaction term
selected = results(dds, contrast=c( factorsCoded[ factorsVector[kk] ],tem[1], tem[2]) ) else # either A, B, C ...
selected = results(dds, name=comparisons[kk] ) # interaction term
}
selected$calls =0
selected$calls [which( selected$log2FoldChange > log2(minFC_limma) & selected$padj < maxP_limma ) ] <- 1
selected$calls [ which( selected$log2FoldChange < -log2(minFC_limma) & selected$padj < maxP_limma ) ] <- -1
colnames(selected)= paste( as.character(comparisons2[kk]), "___",colnames(selected),sep="" )
selected = as.data.frame(selected)
if (pp==0){ # if first one with significant genes, collect gene list and Pval+ fold
result1 = selected; pp = 1;
# selected[,2] <- -1 * selected[,2] # reverse fold change direction
topGenes[[1]] = selected[,c(2,6)];
names(topGenes)[1] = comparisons2[kk]; } else
{ result1 = merge(result1,selected,by="row.names");
rownames(result1) = result1[,1];
result1 <- result1[,-1]
pk= pk+1;
# selected[,2] <- -1 * selected[,2] # reverse fold change direction
topGenes[[pk]] = selected[,c(2,6)];
names(topGenes)[pk] = comparisons2[kk]; # assign name to comprison
}
}
Interactions = c()
if( !is.null(modelFactors) )
Interactions = modelFactors[ grepl(":",modelFactors )]
#--- add comprisons for non-reference levels. It adds to the results1 object.
if( length(Interactions)>0 ) { # if there is interaction
factorLookup=c() # a factor whose values are factors and names are factor and level combination conditionTreated, genotypeWT
levelLookup = c()
for( i in 1:dim(sampleInfo)[2]) {
sampleInfo2 = unique(sampleInfo)
tem = rep(toupper(letters)[i],dim(sampleInfo2)[1] )
names(tem) = paste0(toupper(letters)[i],sampleInfo2[,i])
factorLookup = c(factorLookup,tem)
tem = as.character( sampleInfo2[,i] )
names(tem) = paste0(toupper(letters)[i],sampleInfo2[,i])
levelLookup = c(levelLookup, tem)
}
# split genotypeI.conditionTrt --> c("genotype","I","conditoin","Trt")
splitInteractionTerms <- function (term) {
if(!grepl("\\.",term) ) return(NULL)
terms2 = unlist(strsplit(term,"\\.") )
# factor1, level1, factor2, level2
return(c(factorLookup[terms2[1]], levelLookup[terms2[1]],factorLookup[terms2[2]], levelLookup[terms2[2]] ) )
}
# none interaction terms
NoneInterTerms = resultsNames(dds)[ !grepl( "\\.", resultsNames(dds)) ]
NoneInterTerms=NoneInterTerms[-1]
allInteractionTerms = resultsNames(dds)[ grepl( "\\.", resultsNames(dds)) ]
for( kk in 1:length(NoneInterTerms) ) { # for each none interaction term
if(!is.null(modelFactors) ) {# if not just group comparison using sample names
#current factor
cFactor = gsub("_.*","",NoneInterTerms[kk] )
for(interactionTerm in allInteractionTerms ) {
splited = splitInteractionTerms (interactionTerm) # 4 components
if (cFactor != splited[1] & cFactor != splited[3] )
next;
selected = results(dds, list(c( NoneInterTerms[kk],interactionTerm ) ) )
comparisonName = paste0( NoneInterTerms[kk],"__", gsub("\\.","",interactionTerm) )
if( cFactor == splited[1] )
otherLevel = splited[4] else otherLevel = splited[2]
comparisonName = paste0(#names(factorsCoded)[which(factorsCoded==cFactor)], # real factor name
gsub("_vs_","-", substr(NoneInterTerms[kk], 3, nchar(NoneInterTerms[kk] ) )), # the comparison
"_for_",otherLevel)
comparisons2 = c(comparisons2, comparisonName)
selected$calls =0
selected$calls [which( selected$log2FoldChange > log2(minFC_limma) & selected$padj < maxP_limma ) ] <- 1
selected$calls [ which( selected$log2FoldChange < -log2(minFC_limma) & selected$padj < maxP_limma ) ] <- -1
colnames(selected)= paste( comparisonName, "___",colnames(selected),sep="" )
selected = as.data.frame(selected)
if (pp==0){ # if first one with significant genes, collect gene list and Pval+ fold
result1 = selected; pp = 1;
# selected[,2] <- -1 * selected[,2] # reverse fold change direction
topGenes[[1]] = selected[,c(2,6)];
names(topGenes)[1] = comparisonName; } else
{ result1 = merge(result1,selected,by="row.names");
rownames(result1) = result1[,1];
result1 <- result1[,-1]
pk= pk+1;
# selected[,2] <- -1 * selected[,2] # reverse fold change direction
topGenes[[pk]] = selected[,c(2,6)];
names(topGenes)[pk] = comparisonName; # assign name to comprison
}
} #for
} #if
} #for
} #if
#---
#if( length(comparisons) == 1) topGenes <- topGenes[[1]] # if only one comparison, topGenes is not a list, just a data frame itself.
if(! is.null(result1)) {
# note that when you only select 1 column from a data frame it automatically converts to a vector. drop =FALSE prevents that.
allCalls = as.matrix( result1[,grep("calls",colnames(result1)), drop = FALSE ] )
colnames(allCalls)= gsub("___.*","", colnames(allCalls))
colnames(allCalls)= gsub("\\.","-", colnames(allCalls)) # note that samples names should have no "."
colnames(allCalls)= gsub("^I-","I:", colnames(allCalls))
}
return( list(results= allCalls, comparisons = comparisons2, Exp.type=Exp.type, topGenes=topGenes))
}
# main function
limma <- function() {
if(input_dataFileFormat == 1 ) { # if count data
if(input_CountsDEGMethod == 3 ) { # if DESeq2 method
# rawCounts = read.csv("exampleData/airway_GSE52778.csv", row.names=1)
# res =DEG.DESeq2(rawCounts, .05, 2)
# res1 =DEG.limma(rawCounts, .1, 1.5,rawCounts, 2,3)
return( DEG.DESeq2( convertedCounts.out,input_limmaPval, input_limmaFC,
input_selectModelComprions, readSampleInfo.out,
c(input_selectFactorsModel,input_selectInteractions),
input_selectBlockFactorsModel, factorReferenceLevels.out ) )
}
if(input_CountsDEGMethod < 3 ) # voom or limma-trend
return( DEG.limma(convertedData.out, input_limmaPval, input_limmaFC,
convertedCounts.out, input_CountsDEGMethod,
priorCounts=input_countsLogStart,input_dataFileFormat,
input_selectModelComprions, readSampleInfo.out,
c(input_selectFactorsModel,input_selectInteractions),
input_selectBlockFactorsModel) )
} else if (input_dataFileFormat == 2 ){ # normalized data
return( DEG.limma(convertedData.out, input_limmaPval, input_limmaFC,
convertedCounts.out, input_CountsDEGMethod,
priorCounts=input_countsLogStart,input_dataFileFormat,
input_selectModelComprions, readSampleInfo.out,
c(input_selectFactorsModel,input_selectInteractions),
input_selectBlockFactorsModel) )
} else { # dataFileFormat == 3 user just uploaded fold change matrix
x = convertedData.out
pvals = convertedPvals.out
if(!is.null(pvals) ) {
ix = match(rownames(x), rownames(pvals))
pvals = pvals[ix,]
}
# looks like ratio data, take log2
if( sum(round(apply(x,2, median) + .2) == 1 ) == dim(x)[2] & min(x) > 0)
x = log2(x)
Exp.type = "None standard data without replicates."
all.Calls = x # fake calls
for( i in 1: dim(all.Calls)[2]) {
tem <- all.Calls[,i]
all.Calls[which( tem <= log2(input_limmaFC) & tem >= -log2(input_limmaFC) ) ,i] = 0
all.Calls[which( tem > log2(input_limmaFC) ) ,i] = 1
all.Calls[which( tem < -log2(input_limmaFC) ) ,i] = -1
if(!is.null(pvals) )
all.Calls[ which( pvals[,i] > input_limmaPval),i] = 0
}
comparisons = colnames(all.Calls)
extractColumn <- function (i) {
topGenes = as.data.frame( convertedData.out[,i,drop=FALSE])
if(is.null(pvals) ) topGenes$FDR = 0 else
topGenes$FDR = pvals[,i]# fake fdr
colnames(topGenes) = c("Fold","FDR")
return(topGenes)
}
topGenes = lapply( 1:dim( x )[2], extractColumn )
topGenes <- setNames(topGenes, colnames(x ) )
return( list(results= all.Calls, comparisons = colnames(x ), Exp.type=Exp.type, topGenes=topGenes) )
}
}
DEG.data <- function() {
genes = limma.out$results
genes = as.data.frame( genes[which( rowSums( abs (genes) ) != 0 ),] )
colnames(genes) = colnames( limma.out$results )
genes = merge(genes,convertedData.out, by='row.names')
colnames(genes)[1] = "1: upregulation, -1: downregulation"
# add gene symbol
ix = match( genes[,1], allGeneInfo.out[,1])
genes <- cbind(as.character( allGeneInfo.out$symbol)[ix],genes)
colnames(genes)[1] = "Symbol"
genes <- genes[,c(2,1,3:dim(genes)[2]) ]
return(genes)
}
vennPlot <- function() {
results = limma.out$results
# split by up or down regulation
if(input_UpDownRegulated) {
resultUp = results;
resultUp[resultUp < 0 ] <- 0;
colnames(resultUp) = paste0("Up_", colnames(resultUp))
resultDown = results;
resultDown[resultDown > 0] <- 0;
colnames(resultDown) = paste0("Down_", colnames(resultDown))
results <- cbind(resultUp, resultDown)
}
ixa = c()
for (comps in input_selectComparisonsVenn) {
if(!grepl("^I:|^I-|^Up_I:|^Up_I-|^Down_I:|^Down_I-", comps) ) { # if not interaction term
ix = match(comps, colnames(results))
} else {
#mismatch in comparison names for interaction terms for DESeq2
#I:water_Wet.genetic_Hy in the selected Contrast
#Diff-water_Wet-genetic_Hy in column names
tem = gsub("^I-","I:" ,colnames(results))
tem = gsub("-","\\.",tem)
ix = match(comps, tem)
if(is.na(ix) ) # this is for limma package
ix = match(comps, colnames(results))
}
ixa = c(ixa,ix)
}
results = results[,ixa,drop=FALSE] # only use selected comparisons
if(dim(results)[2] >5) results <- results[,1:5]
colnames(results) = gsub("^I-","I:" ,colnames(results))
vennDiagram(results,circle.col=rainbow(5), cex=c(1.,1, 0.7) ) # part of limma package
}
sigGeneStats <- function( ){
results = limma.out$results
library(reshape2)
Up = apply(results, 2, function(x) sum(x == 1) )
Down = apply(results, 2, function(x) sum(x == -1) )
stats = rbind(Up, Down)
gg <- melt(stats)
colnames(gg) = c("Regulation","Comparisons","Genes")
p= ggplot(gg, aes(x=Comparisons, y=Genes, fill= Regulation ) )+
geom_bar(position="dodge", stat="identity") + coord_flip() +
theme(legend.position = "top") +
scale_fill_manual(values=c("red", "blue")) +
ylab("Number of differntially expressed genes") +
theme(axis.title.y=element_blank(),
axis.text=element_text(size=14))
p
}
sigGeneStatsTable <- function( ) {
results = limma.out$results
Up = apply(results, 2, function(x) sum(x == 1) )
Down = apply(results, 2, function(x) sum(x == -1) )
stats = rbind(Up, Down)
stats = t(stats)
stats=cbind(rownames(stats), stats)
colnames(stats)[1]="Comparisons"
return(as.data.frame(stats))
}
selectedHeatmap.data <- function() {
genes <- limma.out$results
if( is.null(genes) ) return(NULL)
if(!grepl("I:", input_selectContrast) ) { # if not interaction term
ix = match(input_selectContrast, colnames(genes))
} else {
#mismatch in comparison names for interaction terms for DESeq2
#I:water_Wet.genetic_Hy in the selected Contrast
#Diff-water_Wet-genetic_Hy in column names
tem = gsub("I-","I:" ,colnames(genes))
tem = gsub("-","\\.",tem)
ix = match(input_selectContrast, tem)
if(is.na(ix) ) # this is for limma package
ix = match(input_selectContrast, colnames(genes))
}
if(is.null(ix)) return(NULL)
if(is.na(ix)) return(NULL)
if( sum(abs(genes[,ix] ) ) <= 1 ) return(NULL) # no significant genes for this comparison
if(dim(genes)[2] < ix ) return(NULL)
query = rownames(genes)[which(genes[,ix] != 0)]
if(length(query) == 0) return(NULL)
iy = match(query, rownames(convertedData.out ) )
iz = findContrastSamples( input_selectContrast,
colnames(convertedData.out),
readSampleInfo.out,
input_selectFactorsModel,
input_selectModelComprions,
factorReferenceLevels.out,
input_CountsDEGMethod,
input_dataFileFormat
)
# color bar
bar = genes[,ix]
bar = bar[bar!=0]
# retreive related data
genes = convertedData.out[iy,iz,drop=FALSE]
genes = genes[order(bar),,drop=FALSE] # needs to be sorted because myheatmap2 does not reorder genes
bar = sort(bar)
return(list(genes=genes, bar=bar ))
}
# find sample index for selected comparisons
findContrastSamples <- function(selectContrast, allSampleNames,sampleInfo=NULL, selectFactorsModel=NULL,selectModelComprions =NULL , referenceLevels=NULL, countsDEGMethod=NULL, dataFileFormat=NULL ){
iz= match( detectGroups(allSampleNames), unlist(strsplit( selectContrast, "-")) )
iz = which(!is.na(iz))
if ( !is.null(sampleInfo) & !is.null(selectFactorsModel) & length(selectModelComprions)>0 ) {
comparisons = gsub(".*: ","",selectModelComprions) # strings like: "groups: mutant vs. control"
comparisons = gsub(" vs\\. ","-",comparisons)
factorsVector= gsub(":.*","",selectModelComprions) # corresponding factors
# if read counts data and DESeq2
if(dataFileFormat==1 & countsDEGMethod == 3) { # if DESeq2
contrast = gsub("_for_.*","",selectContrast) # could be "wt-mu" or "wt-mu_for_conditionB"
ik = match( contrast, comparisons ) # selected contrast lookes like: "mutant-control"
otherFactorLevel = gsub(".*_for_","",selectContrast)
# find the corresponding factor for the other factor
otherFactor=" "
if(nchar( otherFactorLevel ) >0){
for( eachFactor in colnames(sampleInfo) )
if ( otherFactorLevel %in% sampleInfo[,eachFactor ] )
otherFactor = eachFactor
}
if (is.na(ik)) iz=1:(length(allSampleNames)) else { # interaction term, use all samples
selectedfactor= factorsVector[ ik ] # corresponding factors
iz = which(sampleInfo[,selectedfactor] %in% unlist(strsplit( contrast, "-")) )
#filter by other factors: reference level
if(! is.null( referenceLevels) ) { # c("genotype:wt", "treatment:control" )
for ( refs in referenceLevels)
if(! is.null( refs) & gsub(":.*","",refs) != selectedfactor ) {
currentFactor = gsub(":.*","",refs)
if(nchar( otherFactorLevel ) >0 & currentFactor == otherFactor ) { # if not reference level
iz = intersect( iz, which(sampleInfo[,currentFactor] == otherFactorLevel ) )
} else
iz = intersect( iz, which(sampleInfo[,currentFactor] == gsub(".*:","",refs) ) )
}
}
iz = iz[which(!is.na(iz))]
# switching from limma to DESeq2 causes problem, as reference level is not defined.
}
} else { # not DESeq2
# given level find corresponding sample ids
findIDsFromLevel <- function (aLevel){
# find factor
currentFactor=""
for( eachFactor in colnames(sampleInfo) )
if ( aLevel %in% sampleInfo[,eachFactor ] )
currentFactor = eachFactor
if(nchar(currentFactor) >0 )
return( which(sampleInfo[,currentFactor ] %in% aLevel ) ) else return(NULL)
}
if( !grepl(".*_.*-.*_.*",selectContrast )) iz = c.out
levels4 = unlist( strsplit( unlist( strsplit(selectContrast,"-") ), "_") ) #double split!
if(length(levels4)!=4) {
iz = c.out
} else {
iz = intersect( findIDsFromLevel(levels4[1]), findIDsFromLevel(levels4[2]) ) # first sample
iz = c(iz, intersect( findIDsFromLevel(levels4[3]), findIDsFromLevel(levels4[4]) ) ) # 2nd sample
}
} #else
}
if (grepl("I:",selectContrast)) iz=1:length(allSampleNames) # if it is factor design use all samples
if( is.na(iz)[1] | length(iz)<=1 ) iz=1:length(allSampleNames)
return(iz)
}
selectedHeatmap <- function() {
bar = selectedHeatmap.data.out$bar +2;
bar[bar==3] =2
myheatmap2( selectedHeatmap.data.out$genes,bar,200,mycolor=input_heatColors1,c("Down","Up") )
}
selectedHeatmap.data <- function(){
genes <- limma.out$results
if( is.null(genes) ) return(NULL)
if(!grepl("I:", input_selectContrast) ) { # if not interaction term
ix = match(input_selectContrast, colnames(genes))
} else {
#mismatch in comparison names for interaction terms for DESeq2
#I:water_Wet.genetic_Hy in the selected Contrast
#Diff-water_Wet-genetic_Hy in column names
tem = gsub("I-","I:" ,colnames(genes))
tem = gsub("-","\\.",tem)
ix = match(input_selectContrast, tem)
if(is.na(ix) ) # this is for limma package
ix = match(input_selectContrast, colnames(genes))
}
if(is.null(ix)) return(NULL)
if(is.na(ix)) return(NULL)
if( sum(abs(genes[,ix] ) ) <= 1 ) return(NULL) # no significant genes for this comparison
if(dim(genes)[2] < ix ) return(NULL)
query = rownames(genes)[which(genes[,ix] != 0)]
if(length(query) == 0) return(NULL)
iy = match(query, rownames(convertedData.out ) )
iz = findContrastSamples( input_selectContrast,
colnames(convertedData.out),
readSampleInfo.out,
input_selectFactorsModel,
input_selectModelComprions,
factorReferenceLevels.out,
input_CountsDEGMethod,
input_dataFileFormat
)
# color bar
bar = genes[,ix]
bar = bar[bar!=0]
# retreive related data
genes = convertedData()[iy,iz,drop=FALSE]
genes = genes[order(bar),,drop=FALSE] # needs to be sorted because myheatmap2 does not reorder genes
bar = sort(bar)
return(list(genes=genes, bar=bar ))
}
DEG.data <- function(){
genes = limma.out$results
genes = as.data.frame( genes[which( rowSums(genes) != 0 ),] )
colnames(genes) = colnames( limma.out$results )
genes = merge(genes,convertedData.out, by='row.names')
colnames(genes)[1] = "1: upregulation, -1: downregulation"
# add gene symbol
ix = match( genes[,1], allGeneInfo.out[,1])
genes <- cbind(as.character( allGeneInfo.out$symbol)[ix],genes)
colnames(genes)[1] = "Symbol"
genes <- genes[,c(2,1,3:dim(genes)[2]) ]
return(genes)
}
AllGeneListsGMT <- function() {
results = limma.out$results
results2 = cbind(results, results)
colnames(results2)= c( paste0("UP_", colnames(results)),paste0("Down_", colnames(results)) )
for(i in 1:dim(results)[2] ) {
results2[,i*2-1] = results[,i]
results2[ which(results2[,i*2-1] < 0 ) , i*2-1] = 0
results2[,i*2] = results[,i]
results2[ which(results2[,i*2] > 0 ) , i*2] = 0
}
geneList1 <- function (i) {
ix = which(results2[,i] !=0 )
return(paste0(colnames(results2)[i],"\t", length(ix),"\t",
paste(rownames(results2 )[ix], collapse="\t" ) ) )
}
tem = sapply(1:dim(results2)[2],geneList1 )
return( paste(tem, collapse="\n") )
}
geneListData <- function( ) {
noSig = as.data.frame("No significant genes find!")
if( is.null(input_selectContrast) ) return(NULL)
if( is.null( limma.out$comparisons ) ) return(NULL) # if no significant genes found
if( length(limma.out$topGenes) == 0 ) return(noSig)
if(length( limma.out$comparisons) ==1 )
{ top1=limma.out$topGenes[[1]]
} else {
top = limma.out$topGenes
ix = match(input_selectContrast, names(top))
if( is.na(ix)) return (noSig)
top1 <- top[[ix]];
}
if(dim(top1)[1] == 0 ) return (noSig)
colnames(top1)= c("Fold","FDR")
#top1 = merge(top1,convertedData(), by='row.names')
#colnames(top1)[1] = "Genes"
top1 = top1[order(-abs(top1$Fold)) ,]
if ( length( which( top1$FDR <= input_limmaPval & abs(top1$Fold) >= log2(input_limmaFC) ) ) == 0 )
return( noSig)
top1 <- top1[which(top1$FDR <= input_limmaPval ) ,]
top1 <- top1[which(abs(top1$Fold) >= log2( input_limmaFC)) ,]
top1$Top_Genes <- rownames(top1)
top1 <- top1[,c(3,1,2)]
# if new species
if( input_selectGO2 == "ID not recognized!" | input_selectOrg == "NEW") return (top1);
#convertedID = convertID(top1[,1],input_selectOrg, "GOBP" );#"gmax_eg_gene"
# tem <- geneInfo(convertedID,input_selectOrg) #input_selectOrg ) ;
# tem <- geneInfo(converted(),input_selectOrg)
top1 <- merge(top1, allGeneInfo.out, by.x ="Top_Genes", by.y="ensembl_gene_id",all.x=T )
if ( sum( is.na(top1$band)) == dim(top1)[1] ) top1$chr = top1$chromosome_name else
top1$chr = paste( top1$chromosome_name, top1$band,sep="")
top1 <- top1[,c('Top_Genes','Fold','FDR','symbol','chr','gene_biotype')]
# ix = match(top1[,1], tem$ensembl_gene_id)
# if( sum(is.na( tem$Symbol[ix]) ) != length(ix) )
# { top1 <- cbind(top1, tem$Symbol[ix]); colnames(top1)[4]= "Symbol" }
top1 = top1[order(-abs(as.numeric( top1$Fold))) ,]
top1$FDR <- sprintf("%-3.2e",top1$FDR )
colnames(top1) <- c("Ensembl ID", "log2 Fold Change", "Adj.Pval", "Symbol","Chr","Type")
if ( sum( is.na(top1$Symbol)) == dim(top1)[1] ) top1 <- top1[,-4]
return(top1)
}
volcanoPlot <- function( ) {
if(length( limma.out$comparisons) ==1 )
{ top1=limma.out$topGenes[[1]]
} else {
top = limma.out$topGenes
ix = match(input_selectContrast, names(top))
if( is.na(ix)) return (NULL)
top1 <- top[[ix]];
}
if(dim(top1)[1] == 0 ) return (NULL)
colnames(top1)= c("Fold","FDR")
top1 <- as.data.frame(top1) # convert to data frame
top1 <- top1[which(!(is.na(top1$Fold)|is.na(top1$FDR) )),] # remove NA's
top1$upOrDown <- 1
#write.csv(top1,"tem.csv")
top1$upOrDown[ which(top1$FDR <= input_limmaPval& top1$Fold >= log2( input_limmaFC)) ] <- 2
top1$upOrDown[ which(top1$FDR <= input_limmaPval & top1$Fold <= -log2( input_limmaFC)) ] <- 3
par(mar=c(5,5,1,1))
plot(top1$Fold,-log10(top1$FDR),col = c("grey30", "red","blue")[top1$upOrDown],
pch =16, cex = .3, xlab= "log2 fold change", ylab = "- log10 (FDR)",
cex.lab=2, cex.axis=2, cex.main=2, cex.sub=2 )
legend("bottomright",c("Upregulated","Downregulated"),fill = c("red","blue"),cex=1.1 )
}
scatterPlot <- function( ){
if(length( limma.out$comparisons) ==1 )
{ top1=limma.out$topGenes[[1]]
} else {
top = limma.out$topGenes
ix = match(input_selectContrast, names(top))
if( is.na(ix)) return (NULL)
top1 <- top[[ix]];
}
if(dim(top1)[1] == 0 ) return (NULL)
colnames(top1)= c("Fold","FDR")
top1 <- as.data.frame(top1) # convert to data frame
top1 <- top1[which(!(is.na(top1$Fold)|is.na(top1$FDR) )),] # remove NA's
top1$upOrDown <- 1
#write.csv(top1,"tem.csv")
top1$upOrDown[ which(top1$FDR <= input_limmaPval& top1$Fold >= log2( input_limmaFC)) ] <- 2
top1$upOrDown[ which(top1$FDR <= input_limmaPval & top1$Fold <= -log2( input_limmaFC)) ] <- 3
iz = findContrastSamples( input_selectContrast,
colnames(convertedData.out),
readSampleInfo.out,
input_selectFactorsModel,
input_selectModelComprions,
factorReferenceLevels.out,
input_CountsDEGMethod,
input_dataFileFormat
)
genes <- convertedData.out[,iz]
g = detectGroups(colnames(genes))
if(length(unique(g)) > 2) { plot.new(); text(0.5,0.5, "Not available.") } else{
average1 <- apply( genes[, which( g == unique(g)[1] ) ],1,mean)
average2 <- apply( genes[, which( g == unique(g)[2] ) ],1,mean)
genes2 <- cbind(average1,average2)
rownames(genes2) = rownames(genes)
genes2 <- merge(genes2,top1,by="row.names")
par(mar=c(5,5,1,1))
plot(genes2$average2,genes2$average1,col = c("grey45","red","blue")[genes2$upOrDown],
pch =16, cex = .3, xlab= paste("Average expression in", unique(g)[2] ),
ylab = paste("Average expression in", unique(g)[1] ),
cex.lab=2, cex.axis=2, cex.main=2, cex.sub=2 )
legend("bottomright",c("Upregulated","Downregulated"),fill = c("red","blue"),cex=1.3 )
}
}
MAplot <- function ( ) {
if(length( limma.out$comparisons) ==1 )
{ top1=limma.out$topGenes[[1]]
} else {
top = limma.out$topGenes
ix = match(input_selectContrast, names(top))
if( is.na(ix)) return (NULL)
top1 <- top[[ix]];
}
if(dim(top1)[1] == 0 ) return (NULL)
colnames(top1)= c("Fold","FDR")
top1 <- as.data.frame(top1) # convert to data frame
top1 <- top1[which(!(is.na(top1$Fold)|is.na(top1$FDR) )),] # remove NA's
top1$upOrDown <- 1
#write.csv(top1,"tem.csv")
top1$upOrDown[ which(top1$FDR <= input_limmaPval& top1$Fold >= log2( input_limmaFC)) ] <- 2
top1$upOrDown[ which(top1$FDR <= input_limmaPval & top1$Fold <= -log2( input_limmaFC)) ] <- 3
iz = findContrastSamples( input_selectContrast,
colnames(convertedData.out),
readSampleInfo.out,
input_selectFactorsModel,
input_selectModelComprions,
factorReferenceLevels.out,
input_CountsDEGMethod,
input_dataFileFormat
)
average1 <- as.data.frame( apply( convertedData()[,iz],1,mean) )
colnames(average1) = "Average"
rownames(average1) = rownames(convertedData.out)
genes2 <- merge(average1,top1,by="row.names")
par(mar=c(5,5,1,1))
plot(genes2$Average,genes2$Fold,col = c("grey45","red","blue")[genes2$upOrDown],
pch =16, cex = .3, xlab= "Average expression",
ylab = "Log2 fold change",
cex.lab=2, cex.axis=2, cex.main=2, cex.sub=2 )
abline(h=0)
legend("bottomright",c("Upregulated","Downregulated"),fill = c("red","blue"),cex=1.3 )
}
geneListGOTable <- function() {
NoSig=NULL
# using expression data
genes <- selectedHeatmap.data.out$genes
if(is.null(genes) ) return(NULL)
if(dim(genes)[1] <= minGenesEnrichment ) return(NoSig) # if has only few genes
fc = selectedHeatmap.data.out$bar
# GO
results1 <- NULL; result <- NULL
pp <- 0
for( i in c(1,-1) ) {
if( length(which(fc*i<0)) <= minGenesEnrichment) next;
query = rownames(genes)[which(fc*i<0)]
if( length(query) <= minGenesEnrichment) next;
result <- findOverlapGMT( query, GeneSets.out,1)
if( dim(result)[2] ==1) next; # result could be NULL
if(i == -1) result$direction = "Up regulated" else result$direction = "Down regulated" # changed
if (pp==0 ) { results1 <- result; pp = 1;} else results1 = rbind(results1,result)
}
if ( pp == 0 ) return (NoSig)
if ( is.null( results1) ) return (NoSig)
if( dim(results1)[2] == 1 ) return(NoSig) # Returns a data frame: "No significant results found!"
results1= results1[,c(6,1,2,4,5)] # changed
colnames(results1)= c("List","FDR","nGenes","GO terms or pathways","Genes") # changed
minFDR = 0.01
if(min(results1$FDR) > minFDR ) results1 = as.data.frame("No signficant enrichment found.") else
results1 = results1[which(results1$FDR < minFDR),]
if(dim(results1)[2] != 5) return(NoSig) # changed
colnames(results1)= c("Direction","adj.Pval","nGenes","Pathways", "Genes") # changed
results1
}
geneListGO <- function() {
results1 = geneListGOTable.out
tem = input_removeRedudantSets
if(dim(results1)[2] ==1) return(results1) else {
results1$adj.Pval <- sprintf("%-2.1e",as.numeric(results1$adj.Pval) )
results1[,1] <- as.character(results1[,1])
results1[ duplicated (results1[,1] ),1 ] <- ""
return( results1[,-5] )
}
}
# a program for ploting enrichment results by highlighting the similarities among terms
# must have columns: Direction, adj.Pval Pathways Genes
# Direction adj.Pval nGenes Pathways Genes
#Down regulated 3.58E-59 131 Ribonucleoprotein complex biogenesis 36 Nsun5 Nhp2 Rrp15
#Down regulated 2.55E-57 135 NcRNA metabolic process 23 Nsun5 Nhp2 Rrp15 Emg1 Ddx56 Rsl1d1 enrichmentPlot <- function( enrichedTerms){
# Up or down regulation is color-coded
# gene set size if represented by the size of marker
enrichmentPlot <- function( enrichedTerms, rightMargin=33) {
if(class(enrichedTerms) != "data.frame") return(NULL)
library(dendextend) # customizing tree
geneLists = lapply(enrichedTerms$Genes, function(x) unlist( strsplit(as.character(x)," " ) ) )
names(geneLists)= enrichedTerms$Pathways
# compute overlaps percentage--------------------
n = length(geneLists)
w <- matrix(NA, nrow = n, ncol = n)
# compute overlaps among all gene lists
for (i in 1:n) {
for (j in i:n) {
u <- unlist(geneLists[i])
v <- unlist(geneLists[j])
w[i, j] = length(intersect(u, v))/length(unique(c(u,v)))
}
}
# the lower half of the matrix filled in based on symmetry
for (i in 1:n)
for (j in 1:(i-1))
w[i, j] = w[j,i]
Terms = paste( sprintf("%-1.0e",as.numeric(enrichedTerms$adj.Pval)),
names(geneLists))
rownames(w) = Terms
colnames(w) = Terms
par(mar=c(0,0,1,rightMargin)) # a large margin for showing
dend <- as.dist(1-w) %>%
hclust (method="average")
ix = dend$order # permutated order of leaves
leafType= as.factor( gsub(" .*","", enrichedTerms$Direction[ix] ) )
if(length(unique(enrichedTerms$Direction) ) ==2 )
leafColors = c("green","red") else # mycolors
leafColors = mycolors
#leafSize = unlist( lapply(geneLists,length) ) # leaf size represent number of genes
#leafSize = sqrt( leafSize[ix] )
leafSize = -log10(as.numeric( enrichedTerms$adj.Pval[ix] ) ) # leaf size represent P values
leafSize = 1.5*leafSize/max( leafSize ) + .2
dend %>%
as.dendrogram(hang=-1) %>%
set("leaves_pch", 19) %>% # type of marker
set("leaves_cex", leafSize) %>% #Size
set("leaves_col", leafColors[leafType]) %>% # up or down genes
plot(horiz=TRUE)
#legend("top",pch=19, col=leafColors[1:2],legend=levels(leafType),bty = "n",horiz =T )
# add legend using a second layer
par(lend = 1) # square line ends for the color legend
add_legend("top",pch=19, col=leafColors[1:2],legend=levels(leafType),bty = "n",horiz =T
)
}
# numChar=100 maximum number of characters
# n=200 maximum number of nodes
# degree.cutoff = 0 Remove node if less connected
#from PPInfer
enrich.net2 <- function (x, gene.set, node.id, node.name = node.id, pvalue,
n = 50, numChar = NULL, pvalue.cutoff = 0.05, edge.cutoff = 0.05,
degree.cutoff = 0, edge.width = function(x) {
5 * x^2
}, node.size = function(x) {
2.5 * log10(x)
}, group = FALSE, group.color = c("green","red" ), group.shape = c("circle",
"square"), legend.parameter = list("topright"), show.legend = TRUE, plotting=TRUE,
...)
{
library(igraph)
x <- data.frame(x, group)
colnames(x)[length(colnames(x))] <- "Group"
x <- x[as.numeric( x[, pvalue]) < pvalue.cutoff, ]
x <- x[order(x[, pvalue]), ]
n <- min(nrow(x), n)
if (n == 0) {
stop("no enriched term found...")
}
x <- x[1:n, ]
index <- match(x[, node.id], names(gene.set))
geneSets <- list()
for (i in 1:n) {
geneSets[[i]] <- gene.set[[index[i]]]
}
names(geneSets) <- x[, node.name]
if (is.null(numChar)) {
numChar <- max(nchar(as.character(x[, node.name])))
}
else {
if (length(unique(substr(x[, node.name], 1, numChar))) <
nrow(x)) {
numChar <- max(nchar(as.character(x[, node.name])))
message("Note : numChar is too small.", "\n")
}
}
x[, node.name] <- paste(substr(x[, node.name], 1, numChar),
ifelse(nchar(as.character(x[, node.name])) > numChar,
"...", ""), sep = "")
w <- matrix(NA, nrow = n, ncol = n)
for (i in 1:n) {
for (j in i:n) {
u <- unlist(geneSets[i])
v <- unlist(geneSets[j])
w[i, j] = length(intersect(u, v))/length(unique(c(u,
v)))
}
}
list.edges <- stack(data.frame(w))
list.edges <- cbind(list.edges[, 1], rep(x[, node.name],
n), rep(x[, node.name], each = n))
list.edges <- list.edges[list.edges[, 2] != list.edges[,3], ]
list.edges <- list.edges[!is.na(list.edges[, 1]), ]
g <- graph.data.frame(list.edges[, -1], directed = FALSE)
E(g)$width = edge.width(as.numeric(list.edges[, 1]))
V(g)$size <- node.size(lengths(geneSets))
g <- delete.edges(g, E(g)[as.numeric(list.edges[, 1]) < edge.cutoff])
index.deg <- igraph::degree(g) >= degree.cutoff
g <- delete.vertices(g, V(g)[!index.deg])
x <- x[index.deg, ]
index <- index[index.deg]
if (length(V(g)) == 0) {
stop("no categories greater than degree.cutoff...")
}
n <- min(nrow(x), n)
x <- x[1:n, ]
group.level <- sort(unique(group))
pvalues <- x[, pvalue]
for (i in 1:length(group.level)) {
index <- x[, "Group"] == group.level[i]
V(g)$shape[index] <- group.shape[i]
group.pvalues <- pvalues[index]
if (length(group.pvalues) > 0) {
if (max(group.pvalues) == min(group.pvalues)) {
V(g)$color[index] <- adjustcolor(group.color[i],
alpha.f = 0.5)
}
else {
V(g)$color[index] <- sapply(1 - (group.pvalues -
min(group.pvalues))/(max(group.pvalues) - min(group.pvalues)),
function(x) {
adjustcolor(group.color[i], alpha.f = x)
})
}
}
}
if(plotting) {
plot(g,, vertex.label.dist=1, ...)
if (show.legend) {
legend.parameter$legend <- group.level
legend.parameter$text.col <- group.color
legend.parameter$bty <- "n"
do.call(legend, legend.parameter)
}}
return(g)
}
enrichmentNetwork <- function(enrichedTerms){
geneLists = lapply(enrichedTerms$Genes, function(x) unlist( strsplit(as.character(x)," " ) ) )
names(geneLists)= enrichedTerms$Pathways
enrichedTerms$Direction = gsub(" .*","",enrichedTerms$Direction )
g <- enrich.net2(enrichedTerms, geneLists, node.id = "Pathways", numChar = 100,
pvalue = "adj.Pval", edge.cutoff = 0.2, pvalue.cutoff = 1, degree.cutoff = 0,
n = 200, group = enrichedTerms$Direction, vertex.label.cex = 1, vertex.label.color = "black")
}
enrichmentNetworkPlotly <- function(enrichedTerms){
geneLists = lapply(enrichedTerms$Genes, function(x) unlist( strsplit(as.character(x)," " ) ) )
names(geneLists)= enrichedTerms$Pathways
g <- enrich.net2(enrichedTerms, geneLists, node.id = "Pathways", numChar = 100,
pvalue = "adj.Pval", edge.cutoff = 0.2, pvalue.cutoff = 1, degree.cutoff = 0,
n = 200, group = enrichedTerms$Direction, vertex.label.cex = 0.8, vertex.label.color = "black"
,plotting=TRUE)
vs <- V(g)
es <- as.data.frame(get.edgelist(g))
Nv <- length(vs)
Ne <- length(es[1]$V1)
# create nodes
L <- layout.kamada.kawai(g)
Xn <- L[,1]
Yn <- L[,2]
inc = (max(Yn)-min(Yn))/50 # for shifting
#group <- ifelse(V(g)$shape == "circle", "GO", "KEGG")
group <- as.character(enrichedTerms$Direction)
network <- plot_ly(x = ~Xn, y = ~Yn, type = "scatter", mode = "markers",
marker = list(color = V(g)$color, size = V(g)$size*2,
symbol= ~V(g)$shape, line = list(color = "gray", width = 2)),
hoverinfo = "text", text = ~paste("</br>", group, "</br>", names(vs))) %>%
add_annotations( x = ~Xn, y = ~Yn+inc, text = names(vs), showarrow = FALSE,
font = list(color = "#030303", size = 12))
# create edges
edge_shapes <- list()
for(i in 1:Ne)
{
v0 <- es[i,]$V1
v1 <- es[i,]$V2
index0 <- match(v0, names(V(g)))
index1 <- match(v1, names(V(g)))
edge_shape <- list(type = "line", line = list(color = "gray" ,
width = E(g)$width[i]/2), x0 = Xn[index0], y0 = Yn[index0],
x1 = Xn[index1], y1 = Yn[index1])
edge_shapes[[i]] <- edge_shape
}
# create network
axis <- list(title = "", showgrid = FALSE, showticklabels = FALSE, zeroline = FALSE)
h <- layout(network, title = "Enrichment Network", shapes = edge_shapes,
xaxis = axis, yaxis = axis)
config(h, showLink = TRUE)
}
#Homo sapies --> hsapiens
shortSpeciesNames <- function(tem){
tem2 = strsplit(as.character(tem)," " )
return( tolower( paste0(substr(tem2[[1]][1],1,1), tem2[[1]][2] ) ) )
}
findTaxonomyID <- function( speciesName = "mmusculus") {
if(!is.null(input_speciesName) ) { # if species name is entered
ix = match(input_speciesName, STRING10_species$official_name)
} else if( input_selectGO2 != "ID not recognized!" )
{ # if no species is entered, try to resolve species using existing info
codedNames = sapply(STRING10_species$compact_name,shortSpeciesNames )
ix = match(speciesName , codedNames)
if(input_selectOrg != speciesChoice[[1]]) { # if species is entered
selectedSpecies = findSpeciesById(input_selectOrg)[1,1]
ix = match( gsub("_.*","", selectedSpecies ), codedNames)
}
} else return(NULL)
if(length(ix) == 0 | is.na(ix) ) return(NULL)
return(STRING10_species$species_id[ix])
}
STRINGdb_geneList <- function() {
library(STRINGdb,verbose=FALSE)
taxonomyID = findTaxonomyID.out
if(is.null( taxonomyID ) ) return(NULL)
#Intialization
string_db <- STRINGdb$new( version="10", species=taxonomyID,
score_threshold=0, input_directory="" )
# using expression data
genes <- geneListData.out
colnames(genes)[1:2]=c("gene","lfc")
mapped <- string_db$map(genes,"gene", removeUnmappedRows = TRUE )
up= subset(mapped, lfc>0, select="STRING_id", drop=TRUE )
down= subset(mapped, lfc<0, select="STRING_id", drop=TRUE )
mappingRatio = nrow(mapped)/ nrow(genes)
if(nrow(mapped) == 0) return(NULL) else
return( list(up=up, down=down, ratio=mappingRatio, geneTable=mapped ) )
}
STRINGDB_mapping_stat <- function( ) {
if( is.null(STRINGdb_geneList.out ) ) return("No genes mapped by STRINGdb. Please enter or double-check species name above.")
if(! is.null(STRINGdb_geneList.out ) ) {
tem=paste0( 100*round(STRINGdb_geneList()$ratio,3), "% genes mapped by STRING web server.")
if(STRINGdb_geneList()$ratio <0.3 ) tem = paste(tem, "Warning!!! Very few gene mapped. Double check if the correct species is selected.")
return( tem )
}
}
stringDB_GO_enrichmentData <- function() {
library(STRINGdb,verbose=FALSE)
tem = input_STRINGdbGO
taxonomyID = findTaxonomyID.out
if(is.null( taxonomyID ) ) return(NULL)
NoSig = as.data.frame("No significant enrichment found.")
####################################
#Intialization
string_db <- STRINGdb$new( version="10", species=taxonomyID,
score_threshold=0, input_directory="" )
# using expression data
genes <- selectedHeatmap.data.out$genes
if(is.null(genes) ) return(NULL)
if(dim(genes)[1] <= minGenesEnrichment ) return(NoSig) # if has only few genes
fc = selectedHeatmap.data.out$bar
# GO
results1 <- NULL; result <- NULL
pp <- 0
for( i in c(1:2) ) {
if( length(which(fc*i<0)) <= minGenesEnrichment) next;
query = rownames(genes)[which(fc*i<0)]
if( length(query) <= minGenesEnrichment) next;
ids = STRINGdb_geneList.out[[i]]
result <- string_db$get_enrichment( ids, category = input_STRINGdbGO, methodMT = "fdr", iea = TRUE )
if(nrow(result) == 0 ) next;
if(nrow(result) > 15) result <- result[1:15,]
if( dim(result)[2] ==1) next; # result could be NULL
if(i == 1) result$direction = "Up regulated" else result$direction = "Down regulated"
if (pp==0 ) { results1 <- result; pp = 1;} else results1 = rbind(results1,result)
}
if ( pp == 0 ) return (NoSig)
if ( is.null( results1) ) return (NoSig)
if( dim(results1)[2] == 1 ) return(NoSig) # Returns a data frame: "No significant results found!"
results1= results1[,c(7,5,3, 6)]
colnames(results1)= c("List","FDR","nGenes","GO terms or pathways")
minFDR = 0.01
if(min(results1$FDR) > minFDR ) results1 = as.data.frame("No signficant enrichment found.") else
results1 = results1[which(results1$FDR < minFDR),]
if(dim(results1)[2] != 4) return(NoSig)
colnames(results1)= c("Direction","adj.Pval","nGenes","Pathways")
results1$adj.Pval <- sprintf("%-2.1e",as.numeric(results1$adj.Pval) )
rownames(results1)=1:nrow(results1)
results1[ duplicated (results1[,1] ),1 ] <- ""
return( results1 )
}
stringDB_network1 <- function( geneLists = 1 ) { # geneLists =2 for both up and down
library(STRINGdb)
tem = input_nGenesPPI
taxonomyID = findTaxonomyID.out
if(is.null( taxonomyID ) ) return(NULL)
####################################
NoSig = as.data.frame("No significant enrichment found.")
if(is.null(STRINGdb_geneList.out ) ) return(NULL)
#Intialization
string_db <- STRINGdb$new( version="10", species=taxonomyID,
score_threshold=0, input_directory="" )
# only up regulated is ploted
for( i in c(1:geneLists) ) {
ids = STRINGdb_geneList.out[[i]]
if(length(ids)> input_nGenesPPI ) # n of genes cannot be more than 400
ids <- ids[1:input_nGenesPPI]
string_db$plot_network( ids,add_link=FALSE)
}
}
# generates a html file containing results and also links to STRING-db
stringDB_network_link <- function(){
library(STRINGdb,verbose=FALSE)
taxonomyID = findTaxonomyID.out
if(is.null( taxonomyID ) ) return(NULL)
NoSig = as.data.frame("No significant enrichment found.")
if(is.null(STRINGdb_geneList.out ) ) return(NULL)
#Intialization
string_db <- STRINGdb$new( version="10", species=taxonomyID,
score_threshold=0, input_directory="" )
# upregulated
ids = STRINGdb_geneList.out[[1]]
if(length(ids)> input_nGenesPPI ) # n of genes cannot be more than 400
ids <- ids[1:input_nGenesPPI]
link1 = string_db$get_link( ids)
Pval1 = string_db$get_ppi_enrichment( ids)
tem = "<h5> Interactive and annotated PPI networks among DEGs: <br/> "
tem = paste(tem, "<a href=\"", link1, "\" target=\"_blank\"> Up-regulated; </a>" )
# downregulated
ids = STRINGdb_geneList()[[2]]
if(length(ids)> input_nGenesPPI )
ids <- ids[1:input_nGenesPPI]
link2 = string_db$get_link( ids)
Pval2 = string_db$get_ppi_enrichment( ids)
tem = paste(tem, "   <a href=\"", link2, "\"target=\"_blank\"> Down-regulated; </a>" )
# both up and down with color code
geneTable = STRINGdb_geneList.out$geneTable
if(nrow(geneTable)> input_nGenesPPI )
geneTable <- geneTable[1:input_nGenesPPI,]
geneTable = string_db$add_diff_exp_color( geneTable, logFcColStr="lfc" )
payload_id <- string_db$post_payload( geneTable$STRING_id,colors=geneTable$color )
link3 = string_db$get_link(geneTable$STRING_id, payload_id = payload_id)
tem = paste(tem, "   <a href=\"", link3, "\"target=\"_blank\"> Both with fold-changes color coded.</a></h5>" )
tem2 = paste("<h5> PPI enrichment P values: ")
tem2 = paste0(tem2,"Up-regulated: ", sprintf("%-3.2e",Pval1[1]), "   Down-regulated: ", sprintf("%-3.2e",Pval2[1]),".")
tem2 = paste(tem2, " Small P values indicate more PPIs among DEGs than background. </h5>" )
tem = paste(tem2,tem )
return(tem)
}6.9 Pathway analysis
################################################################
# Pathway analysis
################################################################
gagePathwayData <- function( ){
library(gage,verbose=FALSE) # pathway analysis
####################################
myrange = c(input_minSetSize, input_maxSetSize)
noSig = as.data.frame("No significant pathway found.")
if( length(limma.out$topGenes) == 0 ) return(noSig)
if(length( limma.out$comparisons) ==1 )
{ top1=limma.out$topGenes[[1]]
} else {
top = limma.out$topGenes
ix = match(input_selectContrast1, names(top))
if( is.na(ix)) return (noSig)
top1 <- top[[ix]];
}
if(dim(top1)[1] == 0 ) return (noSig)
colnames(top1)= c("Fold","FDR")
top1 = top1[which(top1$FDR <input_GenePvalCutoff) , ]
gmt = GeneSets.out
if(length( GeneSets.out ) == 0) { return(as.data.frame("No gene set found!"))}
#converted = convertID(rownames(top1),input_selectOrg)
#
#gmt = readGeneSets(converted, top1, input_selectGO, input_selectOrg, myrange )
# cat("Sets",length(gmt))
fold = top1[,1]; names(fold) <- rownames(top1)
if(input_absoluteFold) fold <- abs(fold)
paths <- gage(fold, gsets = gmt, ref = NULL, samp = NULL)
paths <- rbind(paths$greater,paths$less)
# write.csv(paths,"tem.csv")
# cat( dim(paths) )
if(dim(paths)[1] < 1 | dim(paths)[2]< 6 ) return( noSig )
top1 <- paths[,c('stat.mean','set.size','q.val')]
colnames(top1)= c("statistic","Genes","adj.Pval")
top1 <- top1[order(top1[,3]) ,]
if ( length( which( top1[,3] <= input_pathwayPvalCutoff ) ) == 0 )
return( noSig)
top1 <- top1[which(top1[,3] <= input_pathwayPvalCutoff ) ,,drop=FALSE]
if(dim(top1)[1] > input_nPathwayShow )
top1 <- top1[1:input_nPathwayShow, ,drop=FALSE]
top1 <- as.data.frame(top1)
top1 <- cbind(rep( input_selectContrast1, dim(top1)[1]),row.names(top1), top1);
top1$statistic <- as.character( round(as.numeric(top1$statistic),4));
top1$adj.Pval <- sprintf("%-2.1e",as.numeric(top1$adj.Pval) )
top1[,2] <- as.character(top1[,2]);top1[,1] <- as.character(top1[,1])
colnames(top1)[1] <- "Direction"
p.m <- "GAGE"
colnames(top1)[2] <- paste(p.m," analysis:", gsub("-"," vs ",input_selectContrast1 ) )
top1[ which( top1[,3] >0),1 ] <- "Up" #gsub("-"," > ",input_selectContrast1 )
top1[ which( top1[,3] <0),1 ] <- "Down" # gsub("-"," < ",input_selectContrast1 )
top1 <- top1[order( top1[,1], -abs(as.numeric( top1[,3]) ) ) ,]
top1[ duplicated (top1[,1] ),1 ] <- ""
return( top1)
}
fgseaPathwayData <- function() {
library(fgsea,verbose=FALSE) # fast GSEA
noSig = as.data.frame("No significant pathway found.")
if( length(limma.out$topGenes) == 0 ) return(noSig)
if(length( limma.out$comparisons) ==1 )
{ top1=limma.out$topGenes[[1]]
} else {
top = limma.out$topGenes
ix = match(input_selectContrast1, names(top))
if( is.na(ix)) return (noSig)
top1 <- top[[ix]];
}
if(dim(top1)[1] == 0 ) return (noSig)
colnames(top1)= c("Fold","FDR")
# remove some genes
top1 = top1[which(top1$FDR <input_GenePvalCutoff) , ]
gmt = GeneSets.out
if(length( GeneSets.out ) == 0) { return(as.data.frame("No gene set found!"))}
#converted = convertID(rownames(top1),input_selectOrg)
#
#gmt = readGeneSets(converted, top1, input_selectGO, input_selectOrg, myrange )
# cat("Sets",length(gmt))
fold = top1[,1]; names(fold) <- rownames(top1)
if(input_absoluteFold) fold <- abs(fold) # use absolute value of fold change, disregard direction
paths <- fgsea(pathways = gmt,
stats = fold,
minSize=input_minSetSize,
maxSize=input_maxSetSize,
nperm=10000)
# paths <- rbind(paths$greater,paths$less)
if(dim(paths)[1] < 1 ) return( noSig )
paths <- as.data.frame(paths)
paths <- paths[order(-abs( paths[,5])) ,] # sort by NES
# paths <- paths[order( paths[,3]) ,] # sort by FDR
top1 <- paths[,c(1,5,7,3)]
# rownames(top1) <- paths[,1] #paste(1:dim(paths)[1],": ",paths[,1],sep="" )
colnames(top1)= c("Pathway", "NES","Genes","adj.Pval")
if ( length( which( top1[,4] <= input_pathwayPvalCutoff ) ) == 0 )
return( noSig)
top1 <- top1[which(top1[,4] <= input_pathwayPvalCutoff ) , ,drop=FALSE]
if(dim(top1)[1] > input_nPathwayShow )
top1 <- top1[1:input_nPathwayShow,,drop=FALSE]
#top1 <- cbind(row.names(top1), top1); colnames(top1)[1] <-input_selectContrast1
top1 <- as.data.frame(top1)
top1 <- cbind(rep( input_selectContrast1, dim(top1)[1]), top1);
top1[,4] = as.character( round(as.numeric(top1[,4]),4));
top1$adj.Pval <- sprintf("%-2.1e",as.numeric(top1$adj.Pval) )
top1[,1] <- as.character(top1[,1])
colnames(top1)[1] <- "Direction"
p.m <- "GSEA"
colnames(top1)[2] <- paste(p.m," analysis:", gsub("-"," vs ",input_selectContrast1 ) )
top1[ which( as.numeric( top1[,3]) >0),1 ] <- "Up" #gsub("-"," > ",input_selectContrast1 )
top1[ which( as.numeric( top1[,3]) <0),1 ] <- "Down" #gsub("-"," < ",input_selectContrast1 )
top1 <- top1[order( top1[,1], -abs(as.numeric( top1[,3]) ) ) ,]
top1[ duplicated (top1[,1] ),1 ] <- ""
top1[,3] = as.character( round(as.numeric(top1[,3]),4));
return( top1)
}
extract1 <- function (x) {
words <- unlist ( strsplit(x,"_"))
if(length( words ) <=4 ) return(gsub("_"," ",x)) else {
words <- words[-c(1:4)]
return( proper(paste(words,collapse = " ") ) )}
}
PGSEApathway <- function (converted,convertedData, selectOrg,GO,gmt, myrange,Pval_pathway,top){
subtype = detectGroups(colnames(convertedData))
library(PGSEA,verbose=FALSE)
Pvalue = 0.01 # cut off to report in PGSEA. Otherwise NA
#Pval_pathway = 0.2 # cut off for P value of ANOVA test to writ to file
# top = 30 # number of pathways to show
if(length(gmt) ==0 ) return( list(pg3 = NULL, best = 1 ) )
# centering by mean
#pg = myPGSEA (convertedData - rowMeans(convertedData),
# cl=gmt,range=myrange,p.value=TRUE, weighted=FALSE,nPermutation=100)
#if( class(convertedData) != "data.frame" | class(convertedData) != "matarix") return( list(pg3 = NULL, best = 1 ) )
#if( dim(convertedData)[2] <2 ) return( list(pg3 = NULL, best = 1 ) )
pg = PGSEA (convertedData - rowMeans(convertedData),cl=gmt,range=myrange,p.value=TRUE, weighted=FALSE)
pg2 = pg$results;
pg2 = pg2[rowSums(is.na(pg2))<ncol(pg2) ,] # remove se/wrts with all missing(non-signficant)
if (dim(pg2)[1] < 2 ) return.out
best = max(abs(pg2))
if(length(subtype) < 4 || length(unique(subtype)) <2 ||length(unique(subtype)) == dim(convertedData)[2] ) {
pg2 = pg2[order(-apply(pg2,1,sd) ) ,]
return( list(pg3 = pg2[1:top,], best = best ) )
}
cat("\nComputing P values using ANOVA\n");
pathPvalue <- function ( k){
return( summary(aov(pg2[k,]~subtype) )[[1]][["Pr(>F)"]][1] )
}
Pvalues = sapply(1:dim(pg2)[1], pathPvalue)
Pvalues = p.adjust(Pvalues, "fdr")
#if(min(Pvalues) > Pval_pathway ) return( list(pg3 = NULL, best = best ) ) else {
if(sort(Pvalues)[2] > Pval_pathway ) return( list(pg3 = NULL, best = best ) ) else {
NsigT = rowSums(pg$p.results<Pvalue)
result=cbind( as.matrix(Pvalues),NsigT,pg2);
result = result[ order(result[,1]) ,]
result = result[which(result[,1] < Pval_pathway),,drop=F]
#result = result[which(result[,2] >2) ,]
pg2 = result[,-2]
# when there is only 1 left in the matrix pg2 becomes a vector
if(sum( Pvalues<Pval_pathway) == 1) { pg3 = t( as.matrix(pg2));pg3 = rbind(pg3,pg3);} else
{ if(dim(pg2)[1] > top ) { pg3 = pg2[1:top,]; } else { pg3 = pg2; } }
rownames(pg3) = sapply(rownames(pg3) , extract1)
a=sprintf("%-3.2e",pg3[,1])
rownames(pg3) = paste(a,rownames(pg3),sep=" ")
pg3 =pg3[,-1]
pg3 <- pg3[order( -apply(pg3,1,sd) ),] # sort by SD
return( list(pg3 = pg3, best = best ) )
}
}
PGSEAplot <- function(){
library(PGSEA,verbose=FALSE)
if(input_selectGO == "ID not recognized!" ) return( NULL)
myrange = c(input_minSetSize, input_maxSetSize)
genes = convertedData.out
if (is.null(input_selectContrast1 ) ) return(NULL)
gmt = GeneSets.out
# find related samples
iz = findContrastSamples(input_selectContrast1, colnames(convertedData.out),readSampleInfo.out,
input_selectFactorsModel,input_selectModelComprions,
factorReferenceLevels.out,input_CountsDEGMethod,
input_dataFileFormat )
genes = genes[,iz]
subtype = detectGroups(colnames(genes ))
if(length( GeneSets.out ) == 0) { plot.new(); text(0.5,0.5, "No gene sets!")} else {
result = PGSEApathway(converted.out,genes, input_selectOrg,input_selectGO,
GeneSets.out, myrange, input_pathwayPvalCutoff, input_nPathwayShow )
if( is.null(result$pg3) ) { plot.new(); text(0.5,1, "No significant pathway found!")} else
smcPlot(result$pg3,factor(subtype),scale = c(-max(result$pg3), max(result$pg3)),
show.grid = T, margins = c(3,1, 13,28), col = .rwb,cex.lab=0.5, main="Pathway Analysis:PGSEA")
}
}
convertEnsembl2Entrez <- function (query,Species) {
querySet <- cleanGeneSet( unlist( strsplit( toupper(names( query)),'\t| |\n|\\,' ) ) )
speciesID <- orgInfo$id[ which(orgInfo$ensembl_dataset == Species)] # note uses species Identifying
# idType 6 for entrez gene ID
result <- dbGetQuery( convert,
paste( " select id,ens,species from mapping where ens IN ('", paste(querySet,collapse="', '"),
"') AND idType ='",idType_Entrez,"'",sep="") ) # slow
if( dim(result)[1] == 0 ) return(NULL)
result <- subset(result, species==speciesID, select = -species)
ix = match(result$ens,names(query) )
tem <- query[ix]; names(tem) = result$id
return(tem)
}
ReactomePAPathwayData <- function( ){
library(ReactomePA,verbose=FALSE) # pathway analysis
ensemblSpecies <- c("hsapiens_gene_ensembl","rnorvegicus_gene_ensembl", "mmusculus_gene_ensembl",
"celegans_gene_ensembl","scerevisiae_gene_ensembl", "drerio_gene_ensembl", "dmelanogaster_gene_ensembl")
ReactomePASpecies= c("human", "rat", "mouse", "celegans", "yeast", "zebrafish", "fly" )
# cbind(ensemblSpecies,ReactomePASpecies) # double check mapping
myrange = c(input_minSetSize, input_maxSetSize)
noSig = as.data.frame("No significant pathway found.")
if( length(limma.out$topGenes) == 0 ) return(noSig)
if(length( limma.out$comparisons) ==1 )
{ top1=limma.out$topGenes[[1]]
} else {
top = limma.out$topGenes
ix = match(input_selectContrast1, names(top))
if( is.na(ix)) return (noSig)
top1 <- top[[ix]];
}
if(dim(top1)[1] == 0 ) return (noSig)
colnames(top1)= c("Fold","FDR")
# remove some genes
top1 = top1[which(top1$FDR <input_GenePvalCutoff) , ]
fold = top1[,1]; names(fold) <- rownames(top1)
if(input_absoluteFold) fold <- abs(fold) # use absolute value of fold change, disregard direction
Species <- converted.out$species
ix <- match( Species,ensemblSpecies )
if(is.na(ix) ) return(as.data.frame("Species not coverted by ReactomePA package!"))
fold <- convertEnsembl2Entrez (fold, Species)
fold <- sort(fold,decreasing =T)
paths <- gsePathway(fold, nPerm=5000, organism = ReactomePASpecies[ix],
minGSSize= input_minSetSize,
maxGSSize= input_maxSetSize,
pvalueCutoff=0.5,
pAdjustMethod="BH", verbose=FALSE)
paths <- as.data.frame(paths)
if(is.null(paths) ) return( noSig)
if(dim(paths)[1] ==0 ) return( noSig)
# paths <- rbind(paths$greater,paths$less)
if(dim(paths)[1] < 1 ) return( noSig )
paths <- as.data.frame(paths)
paths <- paths[order(-abs( paths[,5])) ,] # sort by NES
# paths <- paths[order( paths[,3]) ,] # sort by FDR
top1 <- paths[,c(2,5,3,7)]
# rownames(top1) <- paths[,1] #paste(1:dim(paths)[1],": ",paths[,1],sep="" )
colnames(top1)= c("Pathway", "NES","Genes","adj.Pval")
if ( length( which( top1[,4] <= input_pathwayPvalCutoff ) ) == 0 )
return( noSig)
top1 <- top1[which(top1[,4] <= input_pathwayPvalCutoff ) ,,drop=FALSE]
if(dim(top1)[1] > input_nPathwayShow )
top1 <- top1[1:input_nPathwayShow,,drop=FALSE]
#top1 <- cbind(row.names(top1), top1); colnames(top1)[1] <-input_selectContrast1
top1 <- as.data.frame(top1)
top1 <- cbind(rep( input_selectContrast1, dim(top1)[1]), top1);
top1[,4] = as.character( round(as.numeric(top1[,4]),4));
top1$adj.Pval <- sprintf("%-2.1e",as.numeric(top1$adj.Pval) )
top1[,1] <- as.character(top1[,1])
colnames(top1)[1] <- "Direction"
if(input_pathwayMethod == 1 ) p.m <- "GAGE"
else if(input_pathwayMethod == 2 ) p.m <- "PGSEA"
else if(input_pathwayMethod == 3 ) p.m <- "GSEA"
else if(input_pathwayMethod == 4 ) p.m <- "PGSEA_All"
else if(input_pathwayMethod == 5 ) p.m <- "ReactomePA"
colnames(top1)[2] <- paste(p.m," analysis:", gsub("-"," vs ",input_selectContrast1 ) )
top1[ which( as.numeric( top1[,3]) >0),1 ] <- "Up" #gsub("-"," > ",input_selectContrast1 )
top1[ which( as.numeric( top1[,3]) <0),1 ] <- "Down" #gsub("-"," < ",input_selectContrast1 )
top1 <- top1[order( top1[,1], -abs(as.numeric( top1[,3]) ) ) ,]
top1[ duplicated (top1[,1] ),1 ] <- ""
top1[,3] = as.character( round(as.numeric(top1[,3]),4));
return( top1)
}
selectedPathwayData <- function( ){
if(input_sigPathways == "All") return (NULL)
ix <- which(names(GeneSets.out ) == input_sigPathways ) # find the gene set
if(length(ix) == 0 ) return(NULL)
genes <- GeneSets.out[[ix]] # retrieve genes
# find related samples
iz = findContrastSamples(input_selectContrast1, colnames(convertedData.out),readSampleInfo.out,
input_selectFactorsModel,input_selectModelComprions,
factorReferenceLevels.out,input_CountsDEGMethod ,
input_dataFileFormat )
x <- convertedData.out[which(rownames(convertedData.out) %in% genes), iz ]
if( input_selectOrg != "NEW") {
ix = match( rownames(x), allGeneInfo.out[,1])
if( sum( is.na(allGeneInfo.out$symbol )) != dim(allGeneInfo.out )[1] ) # symbol really exists?
rownames(x) <- paste(rownames(x),":", as.character( allGeneInfo.out$symbol)[ix])
}
return( x )
}
selectedPathwayHeatmap <- function() {
x = selectedPathwayData.out
if(dim(x)[1]<=2 | dim(x)[2]<=2 ) return(NULL)
groups = detectGroups(colnames(x))
# this will cutoff very large values, which could skew the color
x=as.matrix(x)-apply(as.matrix(x),1,mean)
cutoff = median(unlist(x)) + 3*sd (unlist(x))
x[x>cutoff] <- cutoff
cutoff = median(unlist(x)) - 3*sd (unlist(x))
x[x< cutoff] <- cutoff
# sometimes, a gene can be all zero in selected samples.
x <- x[which(apply(x,1,sd)>0) ,]
lmat = rbind(c(5,4),c(0,1),c(3,2))
lwid = c(1.5,6)
lhei = c(.5,.2,8)
if( dim(x)[1]>200)
heatmap.2(x, distfun = dist2,hclustfun=hclust2,
col=heatColors[as.integer(input_heatColors1),], density.info="none", trace="none", scale="none", keysize=.5
,key=T, symkey=F
,dendrogram = "row"
,ColSideColors=mycolors[ groups]
,labRow=""
,margins=c(10,15)
,srtCol=45
,lmat = lmat, lwid = lwid, lhei = lhei
#,main ="Title"
)
if( dim(x)[1]<=200)
heatmap.2(x, distfun = dist2,hclustfun=hclust2,
col=heatColors[as.integer(input_heatColors1),], density.info="none", trace="none", scale="none", keysize=.5
,key=T, symkey=F,
dendrogram = "row",
,labRow=gsub(".*:","",rownames(x))
,ColSideColors=mycolors[ groups]
,margins=c(10,15)
,cexRow=1.2
,srtCol=45
,lmat = lmat, lwid = lwid, lhei = lhei
#,main ="Title"
)
}
convertEnsembl2KEGG <- function (query,Species) { # not working
querySet <- cleanGeneSet( unlist( strsplit( toupper(names( query)),'\t| |\n|\\,' ) ) )
speciesID <- orgInfo$id[ which(orgInfo$ensembl_dataset == Species)] # note uses species Identifying
# idType 6 for entrez gene ID
result <- dbGetQuery( convert,
paste( " select id,ens,species from mapping where ens IN ('", paste(querySet,collapse="', '"),
"') AND idType ='",idType_KEGG,"'",sep="") ) # slow
if( dim(result)[1] == 0 ) return(NULL)
result <- subset(result, species==speciesID, select = -species)
ix = match(result$ens,names(query) )
tem <- query[ix]; names(tem) = result$id
return(tem)
}
keggPathwayID <- function (pathwayDescription, Species, GO,selectOrg) {
ix = grep(Species,gmtFiles)
if (length(ix) == 0 ) {return(NULL)}
# If selected species is not the default "bestMatch", use that species directly
if(selectOrg != speciesChoice[[1]]) {
ix = grep(findSpeciesById(selectOrg)[1,1], gmtFiles )
if (length(ix) == 0 ) {return(NULL )}
totalGenes <- orgInfo[which(orgInfo$id == as.numeric(selectOrg)),7]
}
pathway <- dbConnect(sqlite,gmtFiles[ix],flags=SQLITE_RO)
# change Parkinson's disease to Parkinson\'s disease otherwise SQL
pathwayDescription <- gsub("\'","\'\'",pathwayDescription)
pathwayInfo <- dbGetQuery( pathway, paste( " select * from pathwayInfo where description = '",
pathwayDescription, "' AND name LIKE '",GO,"%'",sep="") )
pathwayInfo <- dbGetQuery( pathway, paste( " select * from pathwayInfo where description = '",
pathwayDescription, "'",sep="") )
dbDisconnect(pathway);
if(dim(pathwayInfo)[1] != 1 ) {return(NULL) }
tem = gsub(".*:","",pathwayInfo[1,2])
return( gsub("_.*","",tem) )
}
KeggImage <- function(){
# First generate a blank image. Otherwise return(NULL) gives us errors.
outfile <- tempfile(fileext='.png')
png(outfile, width=400, height=300)
frame()
dev.off()
blank <- list(src = outfile,
contentType = 'image/png',
width = 400,
height = 300,
alt = " ")
if(is.null( input_selectGO ) ) return(outfile)
if(input_selectGO != "KEGG") return(outfile)
if(is.null(gagePathwayData.out ) ) return(outfile)
if(is.null( input_sigPathways)) return (outfile)
# if( is.null(selectedPathwayData()) ) return(blank)
library(pathview)
if (is.null(input_selectContrast1 ) ) return(outfile)
if(input_sigPathways == "All") return (outfile)
if( length(limma.out$topGenes) == 0 ) return(outfile)
# get fold change
if(length( limma.out$comparisons) ==1 )
{ top1=limma.out$topGenes[[1]]
} else {
top = limma.out$topGenes
ix = match(input_selectContrast1, names(top))
if( is.na(ix)) return (outfile)
top1 <- top[[ix]];
}
if(dim(top1)[1] == 0 ) return (outfile)
# cat("here5")
colnames(top1)= c("Fold","FDR")
Species <- converted.out$species
fold = top1[,1]; names(fold) <- rownames(top1)
fold <- convertEnsembl2Entrez(fold,Species)
keggSpecies <- as.character( keggSpeciesID[which(keggSpeciesID[,1] == Species),3] )
if(nchar( keggSpecies) <=2 ) return(outfile) # not in KEGG
# kegg pathway id
pathID = keggPathwayID(input_sigPathways, Species, "KEGG",input_selectOrg)
#cat("\nhere5 ",keggSpecies, " ",Species," ",input_sigPathways, "pathID:",pathID,"End", fold[1:5],names(fold)[1:5],"\n")
#cat("\npathway:",is.na(input_sigPathways))
#cat("\n",fold[1:5],"\n",keggSpecies,"\n",pathID)
if(is.null(pathID) ) return(blank) # kegg pathway id not found.
if(nchar(pathID)<3 ) return(blank)
randomString <- gsub(".*file","",tempfile())
outfile <- paste(pathID,".",randomString,".png",sep="")
try(
pv.out <- pathview(gene.data = fold, pathway.id = pathID, out.suffix = randomString, species = keggSpecies, kegg.native=TRUE)
)
return( outfile )
}
# list of pathways with details
pathwayListData <- function(){
pathways = NULL
if( input_pathwayMethod == 1)
if(!is.null(gagePathwayData.out))
if(dim(gagePathwayData.out)[2] >1) {
pathways <- gagePathwayData.out
colnames(pathways)[2] ="Pathways";
colnames(pathways)[4] ="nGenes";
}
if( input_pathwayMethod == 3)
if(!is.null(fgseaPathwayData.out))
if(dim(fgseaPathwayData.out)[2] >1) {
pathways <- fgseaPathwayData.out
colnames(pathways)[2] ="Pathways";
colnames(pathways)[4] ="nGenes";
}
if( input_pathwayMethod == 2)
if(!is.null(PGSEAplot.data.out))
if(dim(PGSEAplot.data.out)[2] >1) {
pathways <- as.data.frame( PGSEAplot.data.out)
pathways$Pathways = substr(rownames(pathways),10, nchar( rownames(pathways)) )
pathways$adj.Pval = gsub(" .*","", rownames(pathways))
pathways$Direction ="Diff"
}
if( input_pathwayMethod == 4)
if(!is.null(PGSEAplotAllSamples.data.out))
if(dim(PGSEAplotAllSamples.data.out)[2] >1) {
pathways <- as.data.frame( PGSEAplotAllSamples.data.out)
pathways$Pathways = substr(rownames(pathways),10, nchar( rownames(pathways)) )
pathways$adj.Pval = gsub(" .*","", rownames(pathways))
pathways$Direction ="Diff"
}
if( is.null( pathways) ) return(NULL)
# if no gene set data, return pathway list
if(is.null(GeneSets.out ) ) return(pathways)
pathways$adj.Pval = as.numeric(pathways$adj.Pval)
if(nrow(pathways)>1)
for( i in 2:nrow(pathways) )
if(nchar(pathways$Direction[i]) <=1)
pathways$Direction[i] = pathways$Direction[i-1]
# gene symbol matching symbols
probeToGene = NULL
if( input_selectGO != "ID not recognized!" & input_selectOrg != "NEW")
if(sum(is.na( allGeneInfo.out$symbol ) )/ dim( allGeneInfo.out )[1] <.5 ) { # if more than 50% genes has symbol
probeToGene = allGeneInfo.out[,c("ensembl_gene_id","symbol")]
probeToGene$symbol = gsub(" ","",probeToGene$symbol)
ix = which( is.na(probeToGene$symbol) |
nchar(probeToGene$symbol)<2 |
toupper(probeToGene$symbol)=="NA" |
toupper(probeToGene$symbol)=="0" )
probeToGene[ix,2] = probeToGene[ix,1] # use gene ID
}
pathways$Genes =""
# looking up genes for each pathway
for(i in 1:nrow(pathways) ){
ix <- which(names(GeneSets.out ) == pathways$Pathways[i] ) # find the gene set
if(length(ix) != 0 ) {
genes <- GeneSets.out[[ix]] # retrieve genes
if(!is.null(probeToGene) ) {
iy = match(genes,probeToGene[,1])
genes = probeToGene[iy,2]
}
pathways$Genes[i] = paste(genes, collapse=" ")
}
}
return(pathways)
}6.10 Genome-wide view
################################################################
# Genome-wide view
################################################################
genomePlotly <- function( ){
# default plot
fake = data.frame(a=1:3,b=1:3)
p <- ggplot(fake, aes(x = a, y = b)) +
geom_blank() + ggtitle("No genes with position info.") +
theme(axis.title.x=element_blank(),axis.title.y=element_blank())
if(length( limma.out$comparisons) ==1 ) {
top1=limma.out$topGenes[[1]]
} else {
top = limma.out$topGenes
ix = match(input_selectContrast2, names(top))
if( is.na(ix)) return (ggplotly(p))
top1 <- top[[ix]];
}
if(dim(top1)[1] == 0 ) return (ggplotly(p))
colnames(top1)= c("Fold","FDR")
# write.csv(merge(top1,allGeneInfo(), by.x="row.names",by.y="ensembl_gene_id" ),"tem.csv" )
x <- merge(top1,allGeneInfo.out, by.x="row.names",by.y="ensembl_gene_id" )
# if no chromosomes found. For example if user do not convert gene IDs.
if( dim(x)[1] >5 ) {
x <- x[order(x$chromosome_name,x$start_position),]
tem = sort( table( x$chromosome_name), decreasing=T)
chromosomes <- names( tem[tem >= 5 ] ) # chromosomes with less than 100 genes are excluded
if(length(chromosomes) > 50) chromosomes <- chromosomes[1:50] # at most 50 chromosomes
chromosomes <- chromosomes[ nchar(chromosomes)<=12] # chr. name less than 10 characters
chromosomes = chromosomes[order(as.numeric(chromosomes) ) ]
# chromosomes = chromosomes[!is.na(as.numeric(chromosomes) ) ]
chromosomesNumbers = as.numeric(chromosomes)
# convert chr.x to numbers
j = max( chromosomesNumbers,na.rm=T)
for( i in 1:length( chromosomes)) {
if ( is.na(chromosomesNumbers[i]) )
{ chromosomesNumbers[i] <- j+1; j <- j+1; }
}
x <- x[which(x$chromosome_name %in% chromosomes ),]
x <- droplevels(x)
# find the number coding for chromosome
getChrNumber <- function (chrName){
return( chromosomesNumbers[ which( chromosomes == chrName)] )
}
x$chrNum = 1 # numeric coding
x$chrNum <- unlist( lapply( x$chromosome_name, getChrNumber) )
x$Row.names <- as.character( x$Row.names)
# if symbol is missing use Ensembl id
x$symbol = as.character(x$symbol)
ix = which(is.na(x$symbol))
ix2 = which(nchar(as.character(x$symbol))<= 2 )
ix3 = which( duplicated(x$symbol))
ix = unique( c(ix,ix2,ix3))
x$symbol[ix] <- x$Row.names[ix]
##################################
# plotting
# x = read.csv("tem_genome.csv")
x = x[!is.na(x$chromosome_name),]
x = x[!is.na(x$start_position),]
# only keep significant genes
#x = x[which(x$FDR<input_limmaPval),]
# x = x[which(abs(x$Fold) > log2( input_limmaFC)),]
ix = which( (x$FDR< as.numeric(input_limmaPvalViz)) & (abs(x$Fold) > as.numeric(input_limmaFCViz) ) )
if (length(ix) > 5) {
x = x[ix,]
x$start_position = x$start_position/1000000 # Mbp
chrD = 20 # distance between chrs.
foldCutoff = 3 # max log2 fold
x$Fold = x$Fold / sd(x$Fold) # standardize fold change
x$Fold[which(x$Fold > foldCutoff )] = foldCutoff # log2fold within -5 to 5
x$Fold[which(x$Fold < -1*foldCutoff )] = -1*foldCutoff
x$Fold = 4* x$Fold
x$y = x$chrNum*chrD + x$Fold
chrLengthTable = aggregate(start_position~chrNum, data=x,max )
chrTotal = dim(chrLengthTable)[1]
x$R = as.factor(sign(x$Fold))
colnames(x)[ which(colnames(x) == "start_position")] = "x"
p= ggplot(x, aes(x = x, y = y, colour = R, text = symbol ) ) + geom_point(shape = 20, size = .2)
#label y with chr names
p <- p + scale_y_continuous(labels = paste("chr",chromosomes[chrLengthTable$chrNum],sep=""), breaks = chrD* (1:chrTotal), limits = c(0,
chrD*chrTotal + 5) )
# draw horizontal lines for each chr.
for( i in 1:dim(chrLengthTable)[1] )
p = p+ annotate( "segment",x = 0, xend = chrLengthTable$start_position[i],
y = chrLengthTable$chrNum[i]*chrD, yend = chrLengthTable$chrNum[i]*chrD)
# change legend http://ggplot2.tidyverse.org/reference/scale_manual.html
p=p+scale_colour_manual(name="", # customize legend text
values=c("blue", "red"),
breaks=c("1","-1"),
labels=c("Up", "Dn"))
p = p + xlab("Position on chrs. (Mbp)") + theme(axis.title.y=element_blank())
p= p + theme(legend.position="none")
# p <- ggplot(mtcars, aes(x=hp, y=mpg)) + geom_point(shape=20, size=17)
# p=p+ geom_smooth(method = "lm",se=FALSE)
#p+ geom_line(aes(y=rollmean(y,7, na.pad=TRUE)))
# Customize hover text https://cran.r-project.org/web/packages/plotly/plotly.pdf
# style(ggplotly(p),hoverinfo="text") # not working
} # if no genes else
}
ggplotly(p)
}
# results from PREDA
genomePlotData <- function( ){
if (is.null(input_selectContrast2 ) ) return(NULL)
if( length(limma.out$topGenes) == 0 ) return(NULL)
if(length( limma.out$comparisons) ==1 )
{ top1=limma.out$topGenes[[1]]
} else {
top = limma.out$topGenes
ix = match(input_selectContrast2, names(top))
if( is.na(ix)) return (NULL)
top1 <- top[[ix]];
}
if(dim(top1)[1] == 0 ) return (NULL)
colnames(top1)= c("Fold","FDR")
# write.csv(merge(top1,allGeneInfo(), by.x="row.names",by.y="ensembl_gene_id" ),"tem.csv" )
x <- merge(top1,allGeneInfo.out, by.x="row.names",by.y="ensembl_gene_id" )
x <- x[order(x$chromosome_name,x$start_position),]
tem = sort( table( x$chromosome_name), decreasing=T)
chromosomes <- names( tem[tem > 100 ] ) # chromosomes with less than 100 genes are excluded
if(length(chromosomes) > 50) chromosomes <- chromosomes[1:50] # at most 50 chromosomes
chromosomes = chromosomes[order(as.numeric(chromosomes) ) ]
# chromosomes = chromosomes[!is.na(as.numeric(chromosomes) ) ]
chromosomesNumbers = as.numeric(chromosomes)
# convert chr.x to numbers
j = max( chromosomesNumbers,na.rm=T)
for( i in 1:length( chromosomes)) {
if ( is.na(chromosomesNumbers[i]) )
{ chromosomesNumbers[i] <- j+1; j <- j+1; }
}
x <- x[which(x$chromosome_name %in% chromosomes ),]
x <- droplevels(x)
# find the number coding for chromosome
getChrNumber <- function (chrName){
return( chromosomesNumbers[ which( chromosomes == chrName)] )
}
x$chrNum = 1 # numeric coding
x$chrNum <- unlist( lapply( x$chromosome_name, getChrNumber) )
x$Row.names <- as.character( x$Row.names)
fold = x$Fold; names(fold) = x$Row.names
# write.csv(x,"tem.csv")
x <- x[!duplicated(x$Row.names),]
#rownames(x) = x$Row.names
GEanalysisResults <- genomePlotDataPre.out;
genomic_regions_UP<-PREDAResults2GenomicRegions(GEanalysisResults, qval.threshold=input_RegionsPvalCutoff, smoothStatistic.tail="upper", smoothStatistic.threshold= input_StatisticCutoff)
genomic_regions_DOWN<-PREDAResults2GenomicRegions(GEanalysisResults, qval.threshold=input_RegionsPvalCutoff, smoothStatistic.tail="lower", smoothStatistic.threshold= -1 *input_StatisticCutoff)
if( is.null(genomic_regions_UP$Test) && is.null(genomic_regions_UP$Test) ) return(-1) # no significant regions
regions <- 0
if( !is.null(genomic_regions_UP$Test)) {
dataframe_UPregions<-GenomicRegions2dataframe(genomic_regions_UP[[1]])
dataframe_UPregions$Regulation <- "Up"
regions = dataframe_UPregions
}
if( !is.null(genomic_regions_DOWN$Test) ){
dataframe_DOWNregions<-GenomicRegions2dataframe(genomic_regions_DOWN[[1]])
dataframe_DOWNregions$Regulation <- "Down"
if ( class(regions) != "data.frame" ) regions <- dataframe_DOWNregions else # if UP regions is NULL
regions = rbind(regions,dataframe_DOWNregions )
}
if( class(regions) != "data.frame") return(-1)
Regions <- regions[,c(4,1:3)]
Regions <- Regions[which(Regions$end != Regions$start),]
Regions$size = round((Regions$end - Regions$start )/1000000,3)
Regions$chr <- chromosomes[ Regions$chr ] # convert from chr. number to names
# find the gene indices in the up or down regulated regions
regulatedGenes <- function (i) {
ix = which( Regions$chr == x$chromosome_name[i] &
Regions$start < x$start_position[i] #&(Regions$end > x$start_position[i] + x$genomeSpan[i])
&(Regions$end > x$start_position[i] )) # if the start position is within the region
if( length(ix) == 0 | length(ix) >1 ) return(NA) else return( ix )
}
regionID = unlist( lapply(1:dim(x)[1], regulatedGenes) )
x1 = x[which(!is.na(regionID)),]
regionID = regionID[!is.na(regionID)]
x1 = cbind(regionID,Regions[regionID, ], x1[,c('symbol', 'Row.names', 'Fold','FDR', 'band','start_position')])
x1 = x1[order(x1$regionID,x1$start_position ), ]
colnames(x1)[8] = "Ensembl"; colnames(x1)[4] = "Region.Start";colnames(x1)[5] = "Region.End"; colnames(x1)[12] = "Gene.Start"
# number of genes
tem = table(x1$regionID)
Regions$Ngenes = 0
Regions$Ngenes[as.integer(names(tem) ) ] <- tem
# cytoband per region
tem = unique(x1[,c('regionID','band')])
tem$band = gsub("\\..*","",tem$band)
tem = unique(tem)
Regions$band =""
Regions$ID <- 1:dim(Regions)[1]
for ( i in 1:dim(Regions)[1] )
Regions$band[i] <- paste( tem[ which( tem[,1] == Regions$ID[i]),2], collapse=";" )
# Genes
Regions$Genes <- ""
for ( i in 1:dim(Regions)[1] )
Regions$Genes[i] <- paste( x1$symbol[ which( x1[,1] == Regions$ID[i])], collapse=" " )
return( list(mainResult = GEanalysisResults, Regions = Regions, Genes = x1,legend.x=max(x$start_position)*.6, legend.y=max(chromosomesNumbers)-3 ) )
}
genomePlot <- function(){
library(PREDA,verbose=FALSE) # showing expression on genome
library(PREDAsampledata,verbose=FALSE)
library(hgu133plus2.db,verbose=FALSE)
if( class(genomePlotData.out) != "list" ) { plot.new(); text(0.2,1, "No significant regions found!")} else {
GEanalysisResults <- genomePlotData.out$mainResult;
if( is.null( GEanalysisResults ) ) return(NULL)
genomic_regions_UP<-PREDAResults2GenomicRegions(GEanalysisResults, qval.threshold=input_RegionsPvalCutoff, smoothStatistic.tail="upper", smoothStatistic.threshold= -1 *input_StatisticCutoff)
genomic_regions_DOWN<-PREDAResults2GenomicRegions(GEanalysisResults, qval.threshold=input_RegionsPvalCutoff, smoothStatistic.tail="lower", smoothStatistic.threshold= -1 *input_StatisticCutoff)
checkplot<-genomePlot(GEanalysisResults, genomicRegions=c(genomic_regions_UP, genomic_regions_DOWN), grouping=c(1, 1), scale.positions="Mb", region.colors=c("red","blue"))
legend(x=genomePlotData.out$legend.x, y=genomePlotData.out$legend.y , legend=c("UP", "DOWN"), fill=c("red","blue"))
} #if else
}
# pre-calculating PREDA, so that changing FDR cutoffs does not trigger entire calculation
genomePlotDataPre <- function( ){
library(PREDA,verbose=FALSE) # showing expression on genome
library(PREDAsampledata,verbose=FALSE)
library(hgu133plus2.db,verbose=FALSE)
if (is.null(input_selectContrast2 ) ) return(NULL)
if( length(limma.out$topGenes) == 0 ) return(NULL)
if(length( limma.out$comparisons) ==1 )
{ top1=limma.out$topGenes[[1]]
} else {
top = limma.out$topGenes
ix = match(input_selectContrast2, names(top))
if( is.na(ix)) return (NULL)
top1 <- top[[ix]];
}
if(dim(top1)[1] == 0 ) return (NULL)
colnames(top1)= c("Fold","FDR")
# write.csv(merge(top1,allGeneInfo(), by.x="row.names",by.y="ensembl_gene_id" ),"tem.csv" )
x <- merge(top1,allGeneInfo.out, by.x="row.names",by.y="ensembl_gene_id" )
#hist(top1[,1])
########## PREDA
infofile <- system.file("sampledata", "GeneExpression", "sampleinfoGE_PREDA.txt", package = "PREDAsampledata")
sampleinfo<-read.table(infofile, sep="\t", header=TRUE)
head(sampleinfo)
data(ExpressionSetRCC)
GEstatisticsForPREDA<-statisticsForPREDAfromEset(ExpressionSetRCC, statisticType="tstatistic", referenceGroupLabel="normal", classVector=sampleinfo[,"Class"])
analysesNames(GEstatisticsForPREDA)
GEGenomicAnnotations<-eset2GenomicAnnotations(ExpressionSetRCC, retain.chrs=1:22) # needs hgu133plus2.db package
GEGenomicAnnotationsForPREDA<-GenomicAnnotations2GenomicAnnotationsForPREDA(GEGenomicAnnotations, reference_position_type="median")
GEDataForPREDA<-MergeStatisticAnnotations2DataForPREDA(GEstatisticsForPREDA, GEGenomicAnnotationsForPREDA, sortAndCleanNA=TRUE)
#x = read.csv("PREDA_test.csv")
#x <- x[,-1] # remove the row index, not needed in Shiny
x <- x[order(x$chromosome_name,x$start_position),]
tem = sort( table( x$chromosome_name), decreasing=T)
chromosomes <- names( tem[tem > 100 ] ) # chromosomes with less than 100 genes are excluded
if(length(chromosomes) > 50) chromosomes <- chromosomes[1:50] # at most 50 chromosomes
chromosomes = chromosomes[order(as.numeric(chromosomes) ) ]
# chromosomes = chromosomes[!is.na(as.numeric(chromosomes) ) ]
chromosomesNumbers = as.numeric(chromosomes)
# convert chr.x to numbers
j = max( chromosomesNumbers,na.rm=T)
for( i in 1:length( chromosomes)) {
if ( is.na(chromosomesNumbers[i]) )
{ chromosomesNumbers[i] <- j+1; j <- j+1; }
}
x <- x[which(x$chromosome_name %in% chromosomes ),]
x <- droplevels(x)
# find the number coding for chromosome
getChrNumber <- function (chrName){
return( chromosomesNumbers[ which( chromosomes == chrName)] )
}
x$chrNum = 1 # numeric coding
x$chrNum <- unlist( lapply( x$chromosome_name, getChrNumber) )
x$Row.names <- as.character( x$Row.names)
fold = x$Fold; names(fold) = x$Row.names
# write.csv(x,"tem.csv")
x <- x[!duplicated(x$Row.names),]
#rownames(x) = x$Row.names
myData <- GEDataForPREDA
myData@position <- as.integer( x$start_position+ x$genomeSpan/2 )
myData@ids <- as.character( x$Row.names)
myData@chr <- as.integer( x$chrNum )
myData@start <- x$start_position
myData@end <- x$start_position + x$genomeSpan
myData@strand <- rep(1,dim(x)[1])
myData@chromosomesLabels <- chromosomes
myData@chromosomesNumbers <- as.integer( chromosomesNumbers )
myData@statistic <- as.matrix( x[,2,drop=FALSE] )
myData@analysesNames <- "Test"
myData@testedTail <- "both"
myData@optionalAnnotations <- as.matrix( x[,1:2,drop=FALSE] )
myData@optionalAnnotationsHeaders <- c("a","b")
set.seed(2)
GEanalysisResults<-PREDA_main(myData,nperms = PREDA_Permutations)
return( GEanalysisResults )
}6.11 Biclustering
################################################################
# Biclustering
################################################################
biclustering <- function( ){
library(biclust,verbose=FALSE)
if(input_biclustMethod == "BCQU()" )
library(QUBIC,verbose=FALSE) # have trouble installing on Linux
if(input_biclustMethod == "BCUnibic()" )
library(runibic,verbose=FALSE) #
x <- convertedData.out
n=input_nGenesBiclust
if(n>dim(x)[1]) n = dim(x)[1] # max as data
if(n<10) n = 10
if(n> 2000 ) n = 2000
x=as.matrix(x[1:n,])-apply(x[1:n,],1,mean)
if( input_biclustMethod == "BCXmotifs()" )
x<-discretize(x)
#res <- biclust::biclust(as.matrix( x), method = BCQU())
runR = paste( "res <- biclust::biclust(as.matrix( x), method =", input_biclustMethod ,")" )
eval(parse(text = runR ) )
return(list( x=x, res=res) )
}
biclustHeatmap <- function ( ){
res = biclustering()$res
if( res@Number == 0 ) { plot.new(); text(0.5,0.5, "No cluster found!")} else {
x = biclust::bicluster(biclustering.out$x, res, as.numeric( input_selectBicluster) )[[1]]
par(mar = c(5, 4, 1.4, 0.2))
if( dim(x)[1] <=30 )
heatmap.2(x, Rowv =T,Colv=F, dendrogram ="none",
col=heatColors[as.integer(input_heatColors1),], density.info="none", trace="none", scale="none", keysize=.2
,key=F, #labRow = T,
,margins = c(8, 24)
,cexRow=1
#,srtCol=45
,cexCol=1. # size of font for sample names
) else
heatmap.2(x, Rowv =T,Colv=F, dendrogram ="none",
col=heatColors[as.integer(input_heatColors1),], density.info="none", trace="none", scale="none", keysize=.2
,key=F, labRow = F,
,margins = c(8, 4)
,cexRow=1
#,srtCol=45
,cexCol=1. # size of font for sample names
)
}
}
geneListBclustGO <- function( ){
res = biclustering.out$res
if( res@Number == 0 ) return(as.data.frame("No clusters found!") )
x = biclust::bicluster(biclustering.out$x, res, as.numeric( input_selectBicluster) )[[1]]
if( dim(x)[1] <= minGenesEnrichment) return(NoSig)
query = rownames(x)
result <- findOverlapGMT( query, GeneSets.out,1)
result$Genes = "Up regulated"
results1 = result
if ( is.null( results1) ) return (NoSig)
if( dim(results1)[2] <5 ) return(NoSig) # Returns a data frame: "No significant results found!"
results1= results1[,c(5,1,2,4)]
colnames(results1)= c("List","FDR","Genes","GO terms or pathways")
minFDR = 0.01
if(min(results1$FDR) > minFDR ) results1 = as.data.frame("No signficant enrichment found.") else
results1 = results1[which(results1$FDR < minFDR),]
if(dim(results1)[2] != 4) return(NoSig)
colnames(results1)= c("Direction","adj.Pval","Genes","Pathways")
results1$adj.Pval <- sprintf("%-2.1e",as.numeric(results1$adj.Pval) )
results1[,1] <- as.character(results1[,1])
tem <- results1[,1]
results1[ duplicated (results1[,1] ),1 ] <- ""
results1[,-1]
}6.12 Co-expression network
################################################################
# Co-expression network
################################################################
wgcna <- function ( ){
#http://pklab.med.harvard.edu/scw2014/WGCNA.html
library(WGCNA)
x <- convertedData.out
n=input_nGenesNetwork
if(n>dim(x)[1]) n = dim(x)[1] # max as data
if(n<50) return(NULL)
if(dim(x)[2] <4) return(NULL)
if(n> maxGeneWGCNA ) n = maxGeneWGCNA
#x=as.matrix(x[1:n,])-apply(x[1:n,],1,mean)
datExpr=t(x[1:n,])
subGeneNames=colnames(datExpr)
#Choosing a soft-threshold to fit a scale-free topology to the network
powers = c(c(1:10), seq(from = 12, to=20, by=2));
sft=pickSoftThreshold(datExpr,dataIsExpr = TRUE,powerVector = powers,corFnc = cor,corOptions = list(use = 'p'),networkType = "unsigned")
softPower = input_mySoftPower;
#calclute the adjacency matrix
adj= adjacency(datExpr,type = "unsigned", power = softPower);
#turn adjacency matrix into topological overlap to minimize the effects of noise and spurious associations
TOM=TOMsimilarityFromExpr(datExpr,networkType = "unsigned", TOMType = "unsigned", power = softPower);
colnames(TOM) =subGeneNames
rownames(TOM) =subGeneNames
##########################################
# module detection
library(flashClust,verbose=FALSE)
geneTree = flashClust(as.dist(1-TOM),method="average");
# Set the minimum module size
minModuleSize = input_minModuleSize;
# Module identification using dynamic tree cut
dynamicMods = cutreeDynamic(dendro = geneTree, method="tree", minClusterSize = minModuleSize);
#dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM, method="hybrid", deepSplit = 2, pamRespectsDendro = FALSE, minClusterSize = minModuleSize);
# table(dynamicMods)
dynamicColors = labels2colors(dynamicMods)
#table(dynamicColors)
#discard the unassigned genes, and focus on the rest # this causes problems
#restGenes= (dynamicColors != "grey")
#diss1=1-TOMsimilarityFromExpr(datExpr[,restGenes], power = softPower)
#colnames(diss1) =rownames(diss1) =subGeneNames[restGenes]
#colorCode = as.character(dynamicColors[restGenes])
moduleInfo = cbind( subGeneNames, dynamicColors, dynamicMods)
moduleInfo = moduleInfo[which(moduleInfo[,2] != "grey") ,] # remove genes not in any modules
moduleInfo = moduleInfo[order(moduleInfo[,3]),] # sort
n.modules = length(unique(dynamicColors) ) -1 ; nGenes = dim(moduleInfo)[1]
return(list(x = t(datExpr),powers=powers,sft=sft, TOM = TOM, dynamicColors = dynamicColors, moduleInfo = moduleInfo,n.modules=n.modules, nGenes =nGenes) )
}
softPower <- function(){
####################################
sft = wgcna.out$sft;
powers=wgcna.out$powers;
# Plot the results
#sizeGrWindow(9, 5)
par(mfrow = c(1,2));
cex1 = 0.9;
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit, signed R^2",type="n", main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],labels=powers,cex=cex1,col="red");
# Red line corresponds to using an R^2 cut-off
abline(h=0.80,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
}
modulePlot <- function(){
diss1 = 1-wgcna.out$TOM;
dynamicColors = wgcna.out$dynamicColors
hier1=flashClust(as.dist(diss1), method="average" )
#set the diagonal of the dissimilarity to NA
diag(diss1) = NA;
plotDendroAndColors(hier1, dynamicColors, "Dynamic Tree Cut", dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05, main = "Gene dendrogram and module colors")
#Visualize the Tom plot. Raise the dissimilarity matrix to the power of 4 to bring out the module structure
#TOMplot(diss1, hier1, colorCode)
}
listWGCNA.Modules <- function(){
if (is.null(wgcna.out ) ){ # if sample info is uploaded and correctly parsed.
return(NULL)
} else {
if( dim(wgcna.out$moduleInfo)[1] == 0 ) { # if no module
return(NULL)
} else {
modules = unique(wgcna.out$moduleInfo[, c("dynamicMods","dynamicColors")] )
moduleList = apply(modules,1,paste,collapse=". ")
moduleList = paste0( moduleList, " (", table(wgcna.out$moduleInfo[,"dynamicMods"] )," genes)" )
moduleList = c(moduleList,"Entire network")
return(moduleList)
}
}
}
moduleNetwork <- function(){
outfile <- tempfile(fileext='.txt')
#module = gsub(".* ","",input_selectWGCNA.Module)
module = unlist(strsplit(input_selectWGCNA.Module," " ) )[2]
moduleColors = wgcna.out$dynamicColors
inModule = (moduleColors==module);
if( input_selectWGCNA.Module == "Entire network")
inModule = rep(TRUE, length(inModule))
datExpr = t(wgcna.out$x )
probes = colnames(datExpr)
modProbes = probes[inModule];
modTOM = wgcna.out$TOM[inModule, inModule];
dimnames(modTOM) = list(modProbes, modProbes)
nTop = input_topGenesNetwork;
if( nTop > 1000) nTop = 1000;
IMConn = softConnectivity(datExpr[, modProbes]);
top = (rank(-IMConn) <= nTop)
# adding symbols
probeToGene = NULL
if( input_selectGO5 != "ID not recognized!" & input_selectOrg != "NEW")
if(sum(is.na( allGeneInfo.out$symbol ) )/ dim( allGeneInfo.out )[1] <.5 ) { # if more than 50% genes has symbol
probeToGene = allGeneInfo.out[,c("ensembl_gene_id","symbol")]
probeToGene$symbol = gsub(" ","",probeToGene$symbol)
ix = which( is.na(probeToGene$symbol) |
nchar(probeToGene$symbol)<2 |
toupper(probeToGene$symbol)=="NA" |
toupper(probeToGene$symbol)=="0" )
probeToGene[ix,2] = probeToGene[ix,1] # use gene ID
}
net <- modTOM[top,top] >input_edgeThreshold
for( i in 1:dim(net)[1]) # remove self connection
net[i,i] = FALSE
if(!is.null(probeToGene) & !input_noIDConversion) { # if gene symbol exist
ix = match( colnames(net), probeToGene[,1])
colnames(net) = probeToGene[ix,2]
ix = match( rownames(net), probeToGene[,1])
rownames(net) = probeToGene[ix,2]
}
library(igraph,verbose=FALSE)
#plot(graph_from_data_frame(d=data.frame(1:10,ncol=2) ,directed=F) )
# http://www.kateto.net/wp-content/uploads/2016/01/NetSciX_2016_Workshop.pdf
plot( graph_from_adjacency_matrix( net, mod ="undirected" ),
vertex.label.color="black", vertex.label.dist=3,vertex.size=7)
}
networkModuleGO <- function(){
#module = gsub(".* ","",input_selectWGCNA.Module)
module = unlist(strsplit(input_selectWGCNA.Module," " ) )[2]
moduleColors = wgcna.out$dynamicColors
inModule = (moduleColors==module);
if( input_selectWGCNA.Module == "Entire network")
inModule = rep(TRUE, length(inModule))
probes = rownames(wgcna.out$x )
query = probes[inModule];
if( length(query) <= minGenesEnrichment) return(NoSig)
result <- findOverlapGMT( query, GeneSets.out,1)
result$Genes = "Up regulated"
results1 = result
if ( is.null( results1) ) return (NoSig)
if( dim(results1)[2] <5 ) return(NoSig) # Returns a data frame: "No significant results found!"
results1= results1[,c(5,1,2,4)]
colnames(results1)= c("List","FDR","Genes","GO terms or pathways")
minFDR = 0.01
if(min(results1$FDR) > minFDR ) results1 = as.data.frame("No signficant enrichment found.") else
results1 = results1[which(results1$FDR < minFDR),]
if(dim(results1)[2] != 4) return(NoSig)
colnames(results1)= c("Direction","adj.Pval","Genes","Pathways")
results1$adj.Pval <- sprintf("%-2.1e",as.numeric(results1$adj.Pval) )
results1[,1] <- as.character(results1[,1])
tem <- results1[,1]
results1[ duplicated (results1[,1] ),1 ] <- ""
results1[,-1]
}