WGCNA

rm(list = ls())
library(WGCNA)
library(data.table)
library(stringr)
library(openxlsx)
allowWGCNAThreads()
####step1 數(shù)據(jù)預處理
if(T){
  #read expression data and pre-processing 
  exprset0 <- read.delim('LiverFemale3600.csv',header = T,sep = ',')
  exprset <- exprset0[,c(2,9:ncol(exprset0))]
  dim(exprset)
  exprset <- exprset[exprset$gene_symbol != 0,]
  dim(exprset)
  table(duplicated(exprset[,1]))
  exprset <- exprset[!duplicated(exprset$gene_symbol),]
  dim(exprset)
  table(duplicated(exprset[,1]))
  rownames(exprset) <- exprset[,1];exprset <- exprset[,-1]
  #read phenotype data and pre-processing 
  datTraits0 <- read.delim('ClinicalTraits.csv',header = T,sep = ',')
  datTraits <- datTraits0[,c(2,9:15)]
  datTraits <- datTraits[,c(1,4:8)]
  dim(datTraits)
  rownames(datTraits) <- datTraits[,1];datTraits <- datTraits[,-1]
  #match expression data and phenotype data
  sample <- colnames(exprset) #sample 是表達矩陣的樣品名排序
  datTraits <- datTraits[rownames(datTraits) %in% sample,] #刪除臨床表型中多余的sample
  datTraits <- datTraits[match(sample,rownames(datTraits)),] #將數(shù)量匹配的sample進行排序
  identical(colnames(exprset),rownames(datTraits)) ##validation:數(shù)量及排序是否一致
  #轉置expression 
  exprset <- t(exprset)
  save(exprset,datTraits,file = 'WGCNA_HUALIN_DATApreprocessing.RData')
}
#判斷sample里面是否有離群值
if(F){
  #sample clust 其實我覺得這里需要放到最前面去判斷樣本是否有離群值
  #明確樣本數(shù)和基因
  datExpr <- exprset
  nGenes = ncol(datExpr)
  nSamples = nrow(datExpr)
  #首先針對樣本做個系統(tǒng)聚類
  datExpr_tree<-hclust(dist(datExpr), method = "average")
  sampleTree <-datExpr_tree
  # 畫出樣本聚類圖(上)與樣本性狀熱圖(下):
  traitColors = numbers2colors(datTraits, signed = TRUE,centered=TRUE);
  plotDendroAndColors(sampleTree, 
                      traitColors, 
                      groupLabels = names(datTraits), 
                      rowTextAlignment = "right-justified",
                      addTextGuide = TRUE ,
                      hang = 0.03,
                      dendroLabels = NULL, # 是否顯示樹labels
                      addGuide = FALSE,  # 顯示虛線
                      guideHang = 0.05,
                      main = "Sample dendrogram and trait heatmap") 
}

#####step2 選擇一個β值建立臨近矩陣根據(jù)連接度使我們的基因分布符合無尺度網(wǎng)絡
if(T){
  powers = c(c(1:10), seq(from = 12, to=20, by=2))
  # 獲得各個閾值下的 R方 和平均連接度
  sft = pickSoftThreshold(exprset, powerVector = powers, verbose = 5)
  # 作圖:
  sizeGrWindow(9, 5) 
  #sizeGrWindow(width, height) size in inches
  #sizeGrWindow: Opens a graphics window with specified dimensions
  par(mfrow = c(1,2)); #par設置作圖參數(shù):tag = value
  cex1 = 0.9;
  # Scale-free topology fit index as a function of the soft-thresholding power
  plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
       xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
       main = paste("Scale independence"));
  text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
       labels=powers,cex=cex1,col="red");
  # this line corresponds to using an R^2 cut-off of h
  abline(h=0.90,col="red")
  # Mean connectivity as a function of the soft-thresholding power
  plot(sft$fitIndices[,1], sft$fitIndices[,5],
       xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
       main = paste("Mean connectivity"))
  text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
}

####step2.1 根據(jù)連接度和R^2判斷softPower是否合理
if(F){
  ADJ1_cor <- abs(WGCNA::cor(exprset,use = "p" ))^softPower
  #ADJ1_cor是一個nGene*nGene的matrix,無向網(wǎng)絡的邊屬性計算
  # 基因少(<5000)的時候使用下面的代碼:
  k <- as.vector(apply(ADJ1_cor,2,sum,na.rm=T))#k表示把同一個基因與其他所有基因相關性的sum
  # 基因多的時候使用下面的代碼:
  # k <- softConnectivity(datE=exprset,power=softPower) 
  sizeGrWindow(10, 5)
  par(mfrow=c(1,2))
  hist(k)
  scaleFreePlot(k,main="Check Scale free topology\n")
  ## figure legends :
  #圖一可以看出只有較少的點具有較高的連接性,而大多數(shù)的點具有較低的連接性。
  #圖二可以看出k與p(k)成負相關(相關性系數(shù)0.91,0.9以上最好),說明選擇的β值是沒有問題的,(相關系數(shù)r的絕對值一般在0.8以上,認為A和B有強的相關性。0.3到0.8之間,可以認為有弱的相關性。0.3以下,認為沒有相關性)
  
}

#### step3 根據(jù)β值獲得臨近矩陣adjacency和拓撲矩陣TOM
####分步法構建網(wǎng)絡
if(T){
  # 獲得臨近矩陣:
  adjacency = adjacency(exprset, power = softPower);
  # 將臨近矩陣轉為 Tom 矩陣
  TOM = TOMsimilarity(adjacency);
  # 計算基因之間的相異度dissTOM,然后根據(jù)dissTOM層次聚類得到geneTree,然后動態(tài)剪切法對樹進行剪切成不同的模塊
  dissTOM = 1-TOM
  geneTree = hclust(as.dist(dissTOM),method="average")
  sizeGrWindow(12,9)
  par(mfrow = c(2,1))
  plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
       labels = FALSE, hang = 0.04);
  # 使用動態(tài)剪切樹挖掘模塊:
  minModuleSize = 30;
  # 動態(tài)切割樹:
  dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM,
                              deepSplit = 2, pamRespectsDendro = FALSE,
                              minClusterSize = minModuleSize);
  table(dynamicMods)#這個時候還是數(shù)字代表module,下一步將數(shù)字轉為color
  dynamicColors=labels2colors(dynamicMods)
  table(dynamicColors)
  sizeGrWindow(8,6)
  plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
                      dendroLabels = FALSE, hang = 0.03,
                      addGuide = TRUE, guideHang = 0.05,
                      main = "Gene dendrogram and module colors")
}

####step3.1 隨機選擇400個基因畫拓撲重疊熱圖
if(F){
  # 拓撲熱圖:
  nSelect = 400 
  # For reproducibility, we set the random seed 
  set.seed(10); 
  nGenes <- ncol(exprset)
  select = sample(nGenes, size = nSelect); 
  selectTOM = dissTOM[select, select]; 
  # There's no simple way of restricting a clustering tree to a subset of genes, so we must re-cluster. 
  selectTree = hclust(as.dist(selectTOM), method = "average") 
  selectColors = dynamicColors[select]; 
  # Open a graphical window 
  sizeGrWindow(9,9) 
  # Taking the dissimilarity to a power, say 10, makes the plot more informative by effectively changing 
  # the color palette; setting the diagonal to NA also improves the clarity of the plot 
  plotDiss = selectTOM^softPower; 
  diag(plotDiss) = NA; 
  TOMplot(plotDiss, 
          selectTree, #selected geneTree
          selectColors, 
          main = "Network heatmap plot, selected genes") 
  
}

####step4 MEs and merged_MEs
if(T){
  MEList = moduleEigengenes(exprset, colors = dynamicColors)
  MEs = MEList$eigengenes ##MEs是一個列為模塊特征基因,行為sample的dataframe
  # 計算根據(jù)模塊特征向量基因計算模塊相異度:
  MEDiss = 1-cor(MEs);
  # Cluster module eigengenes
  METree = hclust(as.dist(MEDiss), method = "average");
  # Plot the result
  plotEigengeneNetworks(MEs, 
                        "Eigengene adjacency heatmap", 
                        marHeatmap = c(3,4,2,2), 
                        plotDendrograms = T, 
                        xLabelsAngle = 90) 
  
  plot(METree, 
       main = "Clustering of module eigengenes",
       xlab = "", 
       sub = "")
  # 在聚類圖中畫出剪切線,紅線以下的模塊表示相關性>0.8,將被合并
  MEDissThres = 0.2
  abline(h=MEDissThres, col = "red")
  # 合并模塊:
  merge_modules = mergeCloseModules(exprset, dynamicColors, cutHeight = MEDissThres, verbose = 3)
  # 合并后的顏色:
  mergedColors = merge_modules$colors;
  # 新模塊的特征向量基因:
  mergedMEs = merge_modules$newMEs;
  plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
                      c("Dynamic Tree Cut", "Merged dynamic"),
                      dendroLabels = FALSE, hang = 0.03,
                      addGuide = TRUE, guideHang = 0.05)
  
}


##step5 模塊與樣本性狀相關性熱圖
if(T){
  mergedMEs[1:4,1:4];datTraits[1:4,1:4]
  moduleTraitCor_noFP <- cor(mergedMEs, datTraits, use = "p");
  moduleTraitPvalue_noFP = corPvalueStudent(moduleTraitCor_noFP, nSamples); 
  textMatrix_noFP <- paste(signif(moduleTraitCor_noFP, 2), "\n(", signif(moduleTraitPvalue_noFP, 1), ")", sep = ""); #signif保留幾位有效的digital 
  sizeGrWindow(12,8)
  par(mar = c(5, 8.5, 3, 3)); 
  labeledHeatmap(Matrix = moduleTraitCor_noFP, 
                 xLabels = names(datTraits), 
                 yLabels = names(mergedMEs), 
                 ySymbols = names(mergedMEs), 
                 colorLabels = FALSE, 
                 colors = blueWhiteRed(50), 
                 textMatrix = textMatrix_noFP,
                 setStdMargins = FALSE, 
                 cex.text = 0.65, 
                 zlim = c(-1,1), 
                 main = paste("Module-trait relationships")) 
}

##step5.1 單個形狀進行畫GS
if(F){
    ph <- 'weight_g'
    #Gene significance and module significance
    #geneSignificance表示基因的表達量與臨床形狀Pearson相關系數(shù)
    GS1 <- as.numeric(WGCNA::cor(datTraits[,ph],exprset,use="p",method="pearson"))
    # 顯著性是絕對值:
    GeneSignificance <- abs(GS1)
    # 獲得該性狀在每個模塊中的顯著性:
    ModuleSignificance <- tapply(GeneSignificance,mergedColors,mean,na.rm=T)
    which.max(ModuleSignificance[names(ModuleSignificance != 'MEgrey')])
    plotModuleSignificance(GeneSignificance,mergedColors)
  ##批量出圖《代碼不完美》
  if(F){for (i in 1:ncol(datTraits)){
    file = paste('GS',colnames(datTraits)[i],sep = '_')
    tiff(filename = paste(file,'.tiff',sep = ''))
    GS1 <- as.numeric(WGCNA::cor(datTraits[,i],exprset,use="p",method="pearson"))
    # 顯著性是絕對值:
    GeneSignificance <- abs(GS1)
    # 獲得該性狀在每個模塊中的顯著性:
    ModuleSignificance <- tapply(GeneSignificance,mergedColors,mean,na.rm=T)
    #MS_max = which.max(ModuleSignificance[names(ModuleSignificance != 'MEgrey')])
    plotModuleSignificance(GeneSignificance,mergedColors)
    dev.off()
  }}
}

#####step6 挑選自己感興趣的臨床表型畫:內部連接度VS MM
if(T){
  ph <- 'weight_g'
  cor_ADR <- signif(WGCNA::cor(datTraits,mergedMEs,use="p",method="pearson"),5)
  p.values <- corPvalueStudent(cor_ADR,nSamples=nrow(datTraits))
  Max_cor <- which.max(abs(cor_ADR[ph,-which(colnames(cor_ADR) == "MEgrey")])) # which 返回滿足條件(true值)的下標
  Min_p_value <- which.min(p.values[ph,-which(colnames(p.values) == "MEgrey")])
  Max_cor == Min_p_value
  #尋找與該性狀相關的樞紐基因(hub genes)
  #計算基因的inter connectivity和module membership,內部連接度衡量的是基因在模塊內部的地位,而模塊身份表明基因屬于哪個模塊。
  # 計算每個基因模塊內部連接度,也就是基因直接兩兩加權相關性。
  load('softPower.RData')
  ADJ1=abs(cor(exprset,use="p"))^softPower 
  # 根據(jù)上面結果和基因所屬模塊信息獲得連接度:
  # 整體連接度 kTotal,模塊內部連接度:kWithin,kOut=kTotal-kWithin, kDiff=kIn-kOut=2*kIN-kTotal 
  colorh1=mergedColors ###教程中沒有給出這個color的賦值,但是通過幫助文檔可以知道:
  #color: module labels. A vector of length ncol(ADJ1) giving a module label for each gene (node) of the network.
  Alldegrees1=intramodularConnectivity(ADJ1, colorh1) 
  
  # 注意模塊內基于特征向量基因連接度評估模塊內其他基因: de ne a module eigengene-based connectivity measure for each gene as the correlation between a the gene expression and the module eigengene
  # 如 brown 模塊內:kM Ebrown(i) = cor(xi, MEbrown) , xi is the gene expression pro le of gene i and M Ebrown is the module eigengene of the brown module
  # 而 module membership 與內部連接度不同。MM 衡量了基因在全局網(wǎng)絡中的位置。
  datKME=signedKME(exprset, mergedMEs, outputColumnName="MM.")
  
  which.color=substring(names(Max_cor == Min_p_value),3); 
  which.color ##which.color就是返回感興趣的形狀所對應的最有意義的module顏色
  restrictGenes=colorh1==which.color 
  # pdf(file = 'MM VS connec.pdf')
  verboseScatterplot(Alldegrees1$kWithin[ restrictGenes], 
                     (datKME[restrictGenes, paste("MM.", which.color, sep="")])^4, 
                     col=which.color, 
                     xlab="Intramodular Connectivity", 
                     ylab="(Module Membership)^4")
  # dev.off()
  
}

####step7 挑選module中的hub gene
if(T){
  ph <- 'weight_g'
  GS_spe=as.numeric(cor(datTraits[,ph],exprset, use="p")) #選擇的樣品性狀datTraits$weight_g與基因之間的相關性
  GeneSignificance_spe <- abs(GS_spe)
  #基于顯著性和MM計算每個基因與 指定trait 的關聯(lián),結果包括p, q, cor, z, 
  NS1=networkScreening(y=datTraits[,ph], 
                       datME=mergedMEs, 
                       datExpr=exprset, 
                       oddPower=3, 
                       blockSize=1000, 
                       minimumSampleSize=4, 
                       addMEy=TRUE, 
                       removeDiag=FALSE, 
                       weightESy=0.5) 
  rownames(NS1) <- colnames(exprset)
  
  #根據(jù)基因與指定性狀的直接相關性(biserial.cor),模塊身份,和加權相關性篩選基因:
  #網(wǎng)上的教程推薦使用的是使用基因與臨床形狀的直接相關性函數(shù):biserial.cor;但是我這里用的是Pearson相關系數(shù)算的
  FilterGenes_spe = ((GeneSignificance_spe > 0.2) & (abs(datKME[paste("MM.",which.color,sep="")])>0.8) & (NS1$q.Weighted < 0.01) ) 
  table(FilterGenes_spe)
  # 找到滿足上面條件的基因:
  trait_hubGenes_spe <- colnames(exprset)[FilterGenes_spe] 
  # hub 基因熱圖:
  plotNetworkHeatmap(exprset,
                     plotGenes = trait_hubGenes_spe,
                     networkType = "unsigned",
                     useTOM = TRUE,
                     power=softPower,
                     main="unsigned correlations")
}

####step8 enrichment analysis 
if(T){
  #hub genes GO and KEGG analysis.
  library(clusterProfiler)
  library(org.Hs.eg.db)
  library(ggplot2)
  # GO 分析:
  ego <- enrichGO(gene          = trait_hubGenes_spe,
                  # universe      = names(geneList),  #background genes. If missing, the all genes listed in the database (eg TERM2GENE table) will be used as background.
                  OrgDb         = org.Hs.eg.db,
                  ont           = "BP",
                  pAdjustMethod = "BH",
                  pvalueCutoff  = 0.01,
                  qvalueCutoff  = 0.05,
                  readable      = TRUE)
  
  GO_BP <- as.data.frame(ego)
  GO_BP$point_shape<-"0"
  GO_BP$point_size<-"15"
  # write.xlsx(GO_BP,"./results/392_genes_GO_BP.xlsx")
  
  ggplot(data=GO_BP)+
    geom_bar(aes(x=reorder(Description,Count),y=Count, fill=-log10(qvalue)), stat='identity') + 
    coord_flip() +
    scale_fill_gradient(expression(-log["10"]("q value")),low="red", high = "blue") +
    xlab("") +
    ylab("Gene count") +
    scale_y_continuous(expand=c(0, 0))+
    theme_bw()+
    theme(
      axis.text.x=element_text(color="black",size=rel(1.5)),
      axis.text.y=element_text(color="black", size=rel(1.6)),
      axis.title.x = element_text(color="black", size=rel(1.6)),
      legend.text=element_text(color="black",size=rel(1.0)),
      legend.title = element_text(color="black",size=rel(1.1))
      # legend.position=c(0,1),legend.justification=c(-1,0)
      # legend.position="top",
    )
  
  # KEGG:
  kk <- enrichKEGG(gene         = trait_hubGenes_spe,
                   organism     = 'hsa',
                   pvalueCutoff = 0.05)
  kegg_DF <- as.data.frame(kk)
  
}

####step9 export
if(T){
  # 導出整個模塊基因到 VisANT 
  modTOM <- TOM[mergedColors==which.color,mergedColors==which.color]
  dimnames(modTOM) = list(colnames(exprset)[mergedColors==which.color], colnames(exprset)[which.color]) 
  vis = exportNetworkToVisANT(modTOM, 
                              file = paste("./WGCNA/ADR_drug_exp_new/VisANTInput-Mod-", which.color, ".txt", sep=""),
                              weighted = TRUE, 
                              threshold = 0
                              # probeToGene = data.frame(annot$substanceBXH, annot$gene_symbol) 
  ) 
  
  # 導出樞紐基因到 Cytoscape 
  hubGene_TOM <- TOM[FilterGenes_spe,FilterGenes_spe]
  dimnames(hubGene_TOM) = list(colnames(exprset)[FilterGenes_spe], colnames(exprset)[FilterGenes_spe]) 
  cyt = exportNetworkToCytoscape(hubGene_TOM, 
                                 edgeFile = paste("./WGCNA/ADR_drug_exp_new/CytoscapeInput-edges-", paste(which.color, collapse="-"), ".txt", sep=""), 
                                 nodeFile = paste("./WGCNA/ADR_drug_exp_new/CytoscapeInput-nodes-", paste(which.color, collapse="-"), ".txt", sep=""), 
                                 weighted = TRUE, 
                                 threshold = 0.02, 
                                 nodeNames = trait_hubGenes_spe, 
                                 altNodeNames = trait_hubGenes_spe, 
                                 nodeAttr = mergedColors[FilterGenes_spe]
  )
  
}

?著作權歸作者所有,轉載或內容合作請聯(lián)系作者
【社區(qū)內容提示】社區(qū)部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發(fā)布,文章內容僅代表作者本人觀點,簡書系信息發(fā)布平臺,僅提供信息存儲服務。

相關閱讀更多精彩內容

友情鏈接更多精彩內容