rm(list = ls())
library(WGCNA)
library(data.table)
library(stringr)
library(openxlsx)
allowWGCNAThreads()
####step1 數(shù)據(jù)預處理
if(T){
#read expression data and pre-processing
exprset0 <- read.delim('LiverFemale3600.csv',header = T,sep = ',')
exprset <- exprset0[,c(2,9:ncol(exprset0))]
dim(exprset)
exprset <- exprset[exprset$gene_symbol != 0,]
dim(exprset)
table(duplicated(exprset[,1]))
exprset <- exprset[!duplicated(exprset$gene_symbol),]
dim(exprset)
table(duplicated(exprset[,1]))
rownames(exprset) <- exprset[,1];exprset <- exprset[,-1]
#read phenotype data and pre-processing
datTraits0 <- read.delim('ClinicalTraits.csv',header = T,sep = ',')
datTraits <- datTraits0[,c(2,9:15)]
datTraits <- datTraits[,c(1,4:8)]
dim(datTraits)
rownames(datTraits) <- datTraits[,1];datTraits <- datTraits[,-1]
#match expression data and phenotype data
sample <- colnames(exprset) #sample 是表達矩陣的樣品名排序
datTraits <- datTraits[rownames(datTraits) %in% sample,] #刪除臨床表型中多余的sample
datTraits <- datTraits[match(sample,rownames(datTraits)),] #將數(shù)量匹配的sample進行排序
identical(colnames(exprset),rownames(datTraits)) ##validation:數(shù)量及排序是否一致
#轉置expression
exprset <- t(exprset)
save(exprset,datTraits,file = 'WGCNA_HUALIN_DATApreprocessing.RData')
}
#判斷sample里面是否有離群值
if(F){
#sample clust 其實我覺得這里需要放到最前面去判斷樣本是否有離群值
#明確樣本數(shù)和基因
datExpr <- exprset
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
#首先針對樣本做個系統(tǒng)聚類
datExpr_tree<-hclust(dist(datExpr), method = "average")
sampleTree <-datExpr_tree
# 畫出樣本聚類圖(上)與樣本性狀熱圖(下):
traitColors = numbers2colors(datTraits, signed = TRUE,centered=TRUE);
plotDendroAndColors(sampleTree,
traitColors,
groupLabels = names(datTraits),
rowTextAlignment = "right-justified",
addTextGuide = TRUE ,
hang = 0.03,
dendroLabels = NULL, # 是否顯示樹labels
addGuide = FALSE, # 顯示虛線
guideHang = 0.05,
main = "Sample dendrogram and trait heatmap")
}
#####step2 選擇一個β值建立臨近矩陣根據(jù)連接度使我們的基因分布符合無尺度網(wǎng)絡
if(T){
powers = c(c(1:10), seq(from = 12, to=20, by=2))
# 獲得各個閾值下的 R方 和平均連接度
sft = pickSoftThreshold(exprset, powerVector = powers, verbose = 5)
# 作圖:
sizeGrWindow(9, 5)
#sizeGrWindow(width, height) size in inches
#sizeGrWindow: Opens a graphics window with specified dimensions
par(mfrow = c(1,2)); #par設置作圖參數(shù):tag = value
cex1 = 0.9;
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red");
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
}
####step2.1 根據(jù)連接度和R^2判斷softPower是否合理
if(F){
ADJ1_cor <- abs(WGCNA::cor(exprset,use = "p" ))^softPower
#ADJ1_cor是一個nGene*nGene的matrix,無向網(wǎng)絡的邊屬性計算
# 基因少(<5000)的時候使用下面的代碼:
k <- as.vector(apply(ADJ1_cor,2,sum,na.rm=T))#k表示把同一個基因與其他所有基因相關性的sum
# 基因多的時候使用下面的代碼:
# k <- softConnectivity(datE=exprset,power=softPower)
sizeGrWindow(10, 5)
par(mfrow=c(1,2))
hist(k)
scaleFreePlot(k,main="Check Scale free topology\n")
## figure legends :
#圖一可以看出只有較少的點具有較高的連接性,而大多數(shù)的點具有較低的連接性。
#圖二可以看出k與p(k)成負相關(相關性系數(shù)0.91,0.9以上最好),說明選擇的β值是沒有問題的,(相關系數(shù)r的絕對值一般在0.8以上,認為A和B有強的相關性。0.3到0.8之間,可以認為有弱的相關性。0.3以下,認為沒有相關性)
}
#### step3 根據(jù)β值獲得臨近矩陣adjacency和拓撲矩陣TOM
####分步法構建網(wǎng)絡
if(T){
# 獲得臨近矩陣:
adjacency = adjacency(exprset, power = softPower);
# 將臨近矩陣轉為 Tom 矩陣
TOM = TOMsimilarity(adjacency);
# 計算基因之間的相異度dissTOM,然后根據(jù)dissTOM層次聚類得到geneTree,然后動態(tài)剪切法對樹進行剪切成不同的模塊
dissTOM = 1-TOM
geneTree = hclust(as.dist(dissTOM),method="average")
sizeGrWindow(12,9)
par(mfrow = c(2,1))
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
labels = FALSE, hang = 0.04);
# 使用動態(tài)剪切樹挖掘模塊:
minModuleSize = 30;
# 動態(tài)切割樹:
dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM,
deepSplit = 2, pamRespectsDendro = FALSE,
minClusterSize = minModuleSize);
table(dynamicMods)#這個時候還是數(shù)字代表module,下一步將數(shù)字轉為color
dynamicColors=labels2colors(dynamicMods)
table(dynamicColors)
sizeGrWindow(8,6)
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05,
main = "Gene dendrogram and module colors")
}
####step3.1 隨機選擇400個基因畫拓撲重疊熱圖
if(F){
# 拓撲熱圖:
nSelect = 400
# For reproducibility, we set the random seed
set.seed(10);
nGenes <- ncol(exprset)
select = sample(nGenes, size = nSelect);
selectTOM = dissTOM[select, select];
# There's no simple way of restricting a clustering tree to a subset of genes, so we must re-cluster.
selectTree = hclust(as.dist(selectTOM), method = "average")
selectColors = dynamicColors[select];
# Open a graphical window
sizeGrWindow(9,9)
# Taking the dissimilarity to a power, say 10, makes the plot more informative by effectively changing
# the color palette; setting the diagonal to NA also improves the clarity of the plot
plotDiss = selectTOM^softPower;
diag(plotDiss) = NA;
TOMplot(plotDiss,
selectTree, #selected geneTree
selectColors,
main = "Network heatmap plot, selected genes")
}
####step4 MEs and merged_MEs
if(T){
MEList = moduleEigengenes(exprset, colors = dynamicColors)
MEs = MEList$eigengenes ##MEs是一個列為模塊特征基因,行為sample的dataframe
# 計算根據(jù)模塊特征向量基因計算模塊相異度:
MEDiss = 1-cor(MEs);
# Cluster module eigengenes
METree = hclust(as.dist(MEDiss), method = "average");
# Plot the result
plotEigengeneNetworks(MEs,
"Eigengene adjacency heatmap",
marHeatmap = c(3,4,2,2),
plotDendrograms = T,
xLabelsAngle = 90)
plot(METree,
main = "Clustering of module eigengenes",
xlab = "",
sub = "")
# 在聚類圖中畫出剪切線,紅線以下的模塊表示相關性>0.8,將被合并
MEDissThres = 0.2
abline(h=MEDissThres, col = "red")
# 合并模塊:
merge_modules = mergeCloseModules(exprset, dynamicColors, cutHeight = MEDissThres, verbose = 3)
# 合并后的顏色:
mergedColors = merge_modules$colors;
# 新模塊的特征向量基因:
mergedMEs = merge_modules$newMEs;
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
c("Dynamic Tree Cut", "Merged dynamic"),
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
}
##step5 模塊與樣本性狀相關性熱圖
if(T){
mergedMEs[1:4,1:4];datTraits[1:4,1:4]
moduleTraitCor_noFP <- cor(mergedMEs, datTraits, use = "p");
moduleTraitPvalue_noFP = corPvalueStudent(moduleTraitCor_noFP, nSamples);
textMatrix_noFP <- paste(signif(moduleTraitCor_noFP, 2), "\n(", signif(moduleTraitPvalue_noFP, 1), ")", sep = ""); #signif保留幾位有效的digital
sizeGrWindow(12,8)
par(mar = c(5, 8.5, 3, 3));
labeledHeatmap(Matrix = moduleTraitCor_noFP,
xLabels = names(datTraits),
yLabels = names(mergedMEs),
ySymbols = names(mergedMEs),
colorLabels = FALSE,
colors = blueWhiteRed(50),
textMatrix = textMatrix_noFP,
setStdMargins = FALSE,
cex.text = 0.65,
zlim = c(-1,1),
main = paste("Module-trait relationships"))
}
##step5.1 單個形狀進行畫GS
if(F){
ph <- 'weight_g'
#Gene significance and module significance
#geneSignificance表示基因的表達量與臨床形狀Pearson相關系數(shù)
GS1 <- as.numeric(WGCNA::cor(datTraits[,ph],exprset,use="p",method="pearson"))
# 顯著性是絕對值:
GeneSignificance <- abs(GS1)
# 獲得該性狀在每個模塊中的顯著性:
ModuleSignificance <- tapply(GeneSignificance,mergedColors,mean,na.rm=T)
which.max(ModuleSignificance[names(ModuleSignificance != 'MEgrey')])
plotModuleSignificance(GeneSignificance,mergedColors)
##批量出圖《代碼不完美》
if(F){for (i in 1:ncol(datTraits)){
file = paste('GS',colnames(datTraits)[i],sep = '_')
tiff(filename = paste(file,'.tiff',sep = ''))
GS1 <- as.numeric(WGCNA::cor(datTraits[,i],exprset,use="p",method="pearson"))
# 顯著性是絕對值:
GeneSignificance <- abs(GS1)
# 獲得該性狀在每個模塊中的顯著性:
ModuleSignificance <- tapply(GeneSignificance,mergedColors,mean,na.rm=T)
#MS_max = which.max(ModuleSignificance[names(ModuleSignificance != 'MEgrey')])
plotModuleSignificance(GeneSignificance,mergedColors)
dev.off()
}}
}
#####step6 挑選自己感興趣的臨床表型畫:內部連接度VS MM
if(T){
ph <- 'weight_g'
cor_ADR <- signif(WGCNA::cor(datTraits,mergedMEs,use="p",method="pearson"),5)
p.values <- corPvalueStudent(cor_ADR,nSamples=nrow(datTraits))
Max_cor <- which.max(abs(cor_ADR[ph,-which(colnames(cor_ADR) == "MEgrey")])) # which 返回滿足條件(true值)的下標
Min_p_value <- which.min(p.values[ph,-which(colnames(p.values) == "MEgrey")])
Max_cor == Min_p_value
#尋找與該性狀相關的樞紐基因(hub genes)
#計算基因的inter connectivity和module membership,內部連接度衡量的是基因在模塊內部的地位,而模塊身份表明基因屬于哪個模塊。
# 計算每個基因模塊內部連接度,也就是基因直接兩兩加權相關性。
load('softPower.RData')
ADJ1=abs(cor(exprset,use="p"))^softPower
# 根據(jù)上面結果和基因所屬模塊信息獲得連接度:
# 整體連接度 kTotal,模塊內部連接度:kWithin,kOut=kTotal-kWithin, kDiff=kIn-kOut=2*kIN-kTotal
colorh1=mergedColors ###教程中沒有給出這個color的賦值,但是通過幫助文檔可以知道:
#color: module labels. A vector of length ncol(ADJ1) giving a module label for each gene (node) of the network.
Alldegrees1=intramodularConnectivity(ADJ1, colorh1)
# 注意模塊內基于特征向量基因連接度評估模塊內其他基因: de ne a module eigengene-based connectivity measure for each gene as the correlation between a the gene expression and the module eigengene
# 如 brown 模塊內:kM Ebrown(i) = cor(xi, MEbrown) , xi is the gene expression pro le of gene i and M Ebrown is the module eigengene of the brown module
# 而 module membership 與內部連接度不同。MM 衡量了基因在全局網(wǎng)絡中的位置。
datKME=signedKME(exprset, mergedMEs, outputColumnName="MM.")
which.color=substring(names(Max_cor == Min_p_value),3);
which.color ##which.color就是返回感興趣的形狀所對應的最有意義的module顏色
restrictGenes=colorh1==which.color
# pdf(file = 'MM VS connec.pdf')
verboseScatterplot(Alldegrees1$kWithin[ restrictGenes],
(datKME[restrictGenes, paste("MM.", which.color, sep="")])^4,
col=which.color,
xlab="Intramodular Connectivity",
ylab="(Module Membership)^4")
# dev.off()
}
####step7 挑選module中的hub gene
if(T){
ph <- 'weight_g'
GS_spe=as.numeric(cor(datTraits[,ph],exprset, use="p")) #選擇的樣品性狀datTraits$weight_g與基因之間的相關性
GeneSignificance_spe <- abs(GS_spe)
#基于顯著性和MM計算每個基因與 指定trait 的關聯(lián),結果包括p, q, cor, z,
NS1=networkScreening(y=datTraits[,ph],
datME=mergedMEs,
datExpr=exprset,
oddPower=3,
blockSize=1000,
minimumSampleSize=4,
addMEy=TRUE,
removeDiag=FALSE,
weightESy=0.5)
rownames(NS1) <- colnames(exprset)
#根據(jù)基因與指定性狀的直接相關性(biserial.cor),模塊身份,和加權相關性篩選基因:
#網(wǎng)上的教程推薦使用的是使用基因與臨床形狀的直接相關性函數(shù):biserial.cor;但是我這里用的是Pearson相關系數(shù)算的
FilterGenes_spe = ((GeneSignificance_spe > 0.2) & (abs(datKME[paste("MM.",which.color,sep="")])>0.8) & (NS1$q.Weighted < 0.01) )
table(FilterGenes_spe)
# 找到滿足上面條件的基因:
trait_hubGenes_spe <- colnames(exprset)[FilterGenes_spe]
# hub 基因熱圖:
plotNetworkHeatmap(exprset,
plotGenes = trait_hubGenes_spe,
networkType = "unsigned",
useTOM = TRUE,
power=softPower,
main="unsigned correlations")
}
####step8 enrichment analysis
if(T){
#hub genes GO and KEGG analysis.
library(clusterProfiler)
library(org.Hs.eg.db)
library(ggplot2)
# GO 分析:
ego <- enrichGO(gene = trait_hubGenes_spe,
# universe = names(geneList), #background genes. If missing, the all genes listed in the database (eg TERM2GENE table) will be used as background.
OrgDb = org.Hs.eg.db,
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.01,
qvalueCutoff = 0.05,
readable = TRUE)
GO_BP <- as.data.frame(ego)
GO_BP$point_shape<-"0"
GO_BP$point_size<-"15"
# write.xlsx(GO_BP,"./results/392_genes_GO_BP.xlsx")
ggplot(data=GO_BP)+
geom_bar(aes(x=reorder(Description,Count),y=Count, fill=-log10(qvalue)), stat='identity') +
coord_flip() +
scale_fill_gradient(expression(-log["10"]("q value")),low="red", high = "blue") +
xlab("") +
ylab("Gene count") +
scale_y_continuous(expand=c(0, 0))+
theme_bw()+
theme(
axis.text.x=element_text(color="black",size=rel(1.5)),
axis.text.y=element_text(color="black", size=rel(1.6)),
axis.title.x = element_text(color="black", size=rel(1.6)),
legend.text=element_text(color="black",size=rel(1.0)),
legend.title = element_text(color="black",size=rel(1.1))
# legend.position=c(0,1),legend.justification=c(-1,0)
# legend.position="top",
)
# KEGG:
kk <- enrichKEGG(gene = trait_hubGenes_spe,
organism = 'hsa',
pvalueCutoff = 0.05)
kegg_DF <- as.data.frame(kk)
}
####step9 export
if(T){
# 導出整個模塊基因到 VisANT
modTOM <- TOM[mergedColors==which.color,mergedColors==which.color]
dimnames(modTOM) = list(colnames(exprset)[mergedColors==which.color], colnames(exprset)[which.color])
vis = exportNetworkToVisANT(modTOM,
file = paste("./WGCNA/ADR_drug_exp_new/VisANTInput-Mod-", which.color, ".txt", sep=""),
weighted = TRUE,
threshold = 0
# probeToGene = data.frame(annot$substanceBXH, annot$gene_symbol)
)
# 導出樞紐基因到 Cytoscape
hubGene_TOM <- TOM[FilterGenes_spe,FilterGenes_spe]
dimnames(hubGene_TOM) = list(colnames(exprset)[FilterGenes_spe], colnames(exprset)[FilterGenes_spe])
cyt = exportNetworkToCytoscape(hubGene_TOM,
edgeFile = paste("./WGCNA/ADR_drug_exp_new/CytoscapeInput-edges-", paste(which.color, collapse="-"), ".txt", sep=""),
nodeFile = paste("./WGCNA/ADR_drug_exp_new/CytoscapeInput-nodes-", paste(which.color, collapse="-"), ".txt", sep=""),
weighted = TRUE,
threshold = 0.02,
nodeNames = trait_hubGenes_spe,
altNodeNames = trait_hubGenes_spe,
nodeAttr = mergedColors[FilterGenes_spe]
)
}
WGCNA
?著作權歸作者所有,轉載或內容合作請聯(lián)系作者
【社區(qū)內容提示】社區(qū)部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發(fā)布,文章內容僅代表作者本人觀點,簡書系信息發(fā)布平臺,僅提供信息存儲服務。
【社區(qū)內容提示】社區(qū)部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發(fā)布,文章內容僅代表作者本人觀點,簡書系信息發(fā)布平臺,僅提供信息存儲服務。
相關閱讀更多精彩內容
- 本教程根據(jù)PlantTech的WGCNA課程編寫,課程還是不錯的,所以將該課程給大家分享一下。 WGCNA筆記第一...
- 致知 2019-10-06一起為你的基因找個朋友吧~ 上一次學習WGCNA還是7月份做小組作業(yè)的時候,本來想做WG...
- WGCNA筆記第三彈 WGCNA分析原理 WGCNA應用場景 WGCNA分析實踐 本代碼借鑒了生信技能樹的WGCN...
- WGCNA筆記第二彈 WGCNA分析原理 WGCNA應用場景 WGCNA分析實踐 1.WGCNA應用場景 不同組織...
- 鏈接:http://genek.tv/,本文是該課程的學習記錄。 1.共表達 兩條基因的表達模式相似,即在某些樣本...