GEO——代碼復(fù)現(xiàn)

文章題目:Identification of the interaction network of hub genes for melanoma treated with vemurafenib based on microarray data####

GEO42872 GPL6244

B站學(xué)習(xí)了生信技能樹jimmy老師的代碼后,自己嘗試運(yùn)行復(fù)現(xiàn)一遍,從下載數(shù)據(jù)(GEOquery/GEOmirror)到檢查數(shù)據(jù)(boxplot/PCA),分組的構(gòu)建,接著是標(biāo)準(zhǔn)化流程:差異分析(可視化展示:熱圖、火山圖),功能富集分析(GO、KEGG,這里用的是y叔的clusterProfiler包),GSEA(gene set enrichment analysis)。其中,火山圖畫的感覺還不錯(cuò),哈哈!


rm(list = ls())  #一鍵清空
options(stringsAsFactors = F) #在調(diào)用as.data.frame的時(shí),將stringsAsFactors設(shè)置為FALSE可以避免character類型自動轉(zhuǎn)化為factor類型
##加載包
library(GEOmirror)
suppressMessages(library(GEOquery))
#GSE42872數(shù)據(jù)下載
f <- 'GSE42872_eSet.Rdata'
if (T) {
gse <- geoChina(gse = "GSE42872")
save(gse,file=f)
}
load(f)
exprSet <- exprs(gse[[1]])  #提取表達(dá)矩陣賦值給exprSet
#查看數(shù)據(jù)
dim(exprSet)  #行33297     列6
head(exprSet)
range(exprSet)   # [1]  2.66517 14.56410  初步判斷數(shù)據(jù)可能經(jīng)過了log轉(zhuǎn)換
boxplot(exprSet,las=0)  #已經(jīng)過標(biāo)準(zhǔn)化,las參數(shù)調(diào)整橫軸標(biāo)簽方向0橫,1橫,2縱,3縱

#提取臨床信息
pdat <- pData(gse[[1]])
View(pdat)
library(stringr)  #字符串處理包
group_list <- str_split(pdat$title," ",simplify = T)[,4]  #提取分組信息
table(group_list)  #查看分組
if (F) {
##注釋包/GEO注釋平臺 GPL6244
BiocManager::install("hugene10sttranscriptcluster.db")  #下載對應(yīng)注釋包
library(hugene10sttranscriptcluster.db)  #加載包
ls("package:hugene10sttranscriptcluster.db") #查看包得內(nèi)容
ids <- toTable(hugene10sttranscriptclusterSYMBOL)  #toTable函數(shù)提取探針id,symbol對應(yīng)關(guān)系
head(ids)  #查看前六行 
exprs
dim(ids) # 19869     2
ids<- ids[ids$symbol!=" ",] # 19869     2 提取symbol 不為空得行
ids <- ids[ids$probe_id %in%  rownames(exprSet),] #19869 
exprSet[1:4,1:4]
head(ids)
exprSet <- exprSet[rownames(exprSet) %in%ids$probe_id,  ]
dim(exprSet) # 19869     6
table(sort(table(ids$symbol))) #查看基因探針對應(yīng)關(guān)系
# 1     2     3     4     5     6     7     8    10    26 
# 18098   597   131    15     7     5     1     2     1     1 
#由于多個(gè)探針對應(yīng)一個(gè)基因,所以取基因在樣本中的均值并排序取最大得作為唯一探針與基因?qū)?yīng)。
##加載注釋函數(shù)
anno <- function(exprSet,ids){
  probes <- as.character(by(exprSet,ids$symbol,function(x)rownames(x)[which.max(rowMeans(x))]))
  exprSet=exprSet[rownames(exprSet) %in% probes ,]
  rownames(exprSet)=ids[ids$probe_id%in%rownames(exprSet),2]
  return(exprSet)
}
exprSet<- anno(exprSet,ids)
exprSet[1:4,1:4]  #查看表達(dá)矩陣
#保存注釋后的表達(dá)矩陣以及分組信息
}
save(exprSet,group_list,file = "output.Rdata")


##檢查分組是否合適,PCA。
load("output.Rdata")
dat<- exprSet
## 下面是畫PCA的必須操作,需要看說明書。
#畫PCA圖時(shí)要求是行名時(shí)樣本名,列名時(shí)探針名,因此此時(shí)需要轉(zhuǎn)換。將matrix轉(zhuǎn)換為data.frame
dat <- cbind(as.data.frame(t(dat)),group_list) #cbind橫向追加,即將分組信息追加到最后一列
library("FactoMineR")#畫主成分分析圖需要加載這兩個(gè)包
library("factoextra") 
dim(dat)  #[1]     6 18859
dim(exprSet)  #[1] 18858     6
# The variable group_list is removed before PCA analysis
dat.pca <- PCA(dat[,-ncol(dat)], graph = FALSE)#現(xiàn)在dat最后一列是group_list,需要重新賦值給一個(gè)dat.pca,這個(gè)矩陣是不含有分組信息的
fviz_pca_ind(dat.pca,
             geom.ind = "point", # 圖形顯示形式,可以是point,text
             col.ind = dat$group_list, # color by groups
             # palette = c("#00AFBB", "#E7B800"),
             addEllipses = F, # 圖例是否加框
             legend.title = "Groups"
)  #PCA可視化fviz_pca_ind
ggsave('all_samples_PCA.png',width = 600,height = 600,units = "mm")

###DEGs package limma input: 1.expression matrix 2.design matrix 3. contrast matrix
suppressMessages(library(limma))
#1.expression matrix :exprSet
exprSet[1:4,1:4]
#2 design matrix
table(group_list)
design <- model.matrix(~0+factor(group_list,levels = c("Control","Vemurafenib"),ordered = T))
head(design)
colnames(design) <- levels(factor(group_list))
rownames(design) <- colnames(exprSet)
#3. contrast matrix
contrast <- makeContrasts(paste0(rev(levels(factor(group_list))),collapse = "-"),
                          levels =design)
#DEGs
DEGs <- function(exprSet,design,contrast){
# step1
fit <- lmFit(exprSet,design)
# step2
fit1 <- contrasts.fit(fit,contrast)
# step3
fit2 <- eBayes(fit1)
nrDEG  <- na.omit(topTable(fit2,coef = paste0(rev(levels(factor(group_list))),collapse = "-"),number = Inf,adjust.method = "BH"))
return(nrDEG)
}
nrDEG <- DEGs(exprSet,design,contrast)
save(nrDEG,file = 'nrDEG.Rdata')
table(nrDEG$adj.P.Val<0.01&nrDEG$logFC>=2)
table(nrDEG$adj.P.Val<0.01&nrDEG$logFC<=-2)

#valcone plot
if(T){ 
  library(dplyr)
  library(tidyverse)
  library(ggplot2)
  DEG_valvone <- nrDEG
  colnames(DEG_valvone)
  DEG_valvone <- DEG_valvone %>%
    select(logFC,AveExpr, adj.P.Val) %>%
    mutate(v= -log10(adj.P.Val),
           gene_type = case_when(logFC >= 2 & adj.P.Val < 0.01 ~ "up",
                                 logFC <= -2 & adj.P.Val < 0.01 ~ "down",
                                 TRUE ~ "stable")) 
  ##查看x坐標(biāo)軸設(shè)定范圍
  DEG_valvone %>%
    pull(logFC) %>%
    min() %>%
    floor()   #查看x軸最小邊界 -5
  DEG_valvone %>%
    pull(logFC) %>%
    max() %>%
    ceiling()  #最大邊界6
  
  ###設(shè)定上調(diào)下調(diào)穩(wěn)定點(diǎn)顏色大小透明度
  cols <- c("up" = "#ffad73", "down" = "#26b3ff", "stable" = "grey") 
  sizes <- c("up" = 2, "down" = 2, "stable" = 1) 
  alphas <- c("up" = 1, "down" = 1, "stable" = 0.5)
}
final_plot <- DEG_valvone %>%
  ggplot(aes(x = logFC,
             y = v,
             fill = gene_type,    
             size = gene_type,
             alpha = gene_type)) + 
  geom_point(shape = 21, # Specify shape and colour as fixed local parameters    
             colour = "black") + 
  geom_hline(yintercept = -log10(0.01),
             linetype = "dashed") + 
  geom_vline(xintercept = c(-2, 2),
             linetype = "dashed") +
  scale_fill_manual(values = cols) + # Modify point colour
  scale_size_manual(values = sizes) + # Modify point size
  scale_alpha_manual(values = alphas) + # Modify point transparency
  scale_x_continuous(breaks = c(seq(-6, 6, 1)),       
                     limits = c(-6, 6))+
  labs(title = "Gene expression changes in Vemurafenib versus control samples",
       x = "log2(fold change)",
       y = "-log10(adjusted P-value)",
       colour = "Expression \nchange") +
  theme_bw() + # Select theme with a white background  
  theme(panel.border = element_rect(colour = "black", fill = NA, size= 0.5),    
        panel.grid.minor = element_blank(),
        panel.grid.major = element_blank()) 
final_plot
ggsave("valcone_plot.png",plot = final_plot,width = 200,height = 150,units ="mm" )


## Plot MA plots
if(T){DEG_valvone %>%
    pull(logFC) %>%
    min() %>%
    floor()   #查看x軸最小邊界 -5
  DEG_valvone %>%
    pull(logFC) %>%
    max() %>%
    ceiling()  #最大邊界6
  res <- DEG_valvone[with(DEG_valvone, order(logFC)),]  #按照logFC倒序排列
  res$threshold <- as.factor(res$adj.P.Val < 0.01)
  MAplot<- ggplot(data=res, aes(x=AveExpr, y=logFC, colour=threshold)) + 
    geom_point(alpha=0.4, size=1.8) + 
    geom_hline(aes(yintercept = 0), colour = "blue", size = 1.2) +
    ylim(c(-5,6)) + 
    labs(title = "MA plot between Vemurafenib and control samples",
         x = "Mean expression",
         y = "Log2 Fold Change")+
    theme(axis.title.x = element_text(face = "bold", size = 15),
          axis.text.x = element_text(face = "bold", size = 12)) +
    theme(axis.title.y = element_text(face = "bold", size = 15),
          axis.text.y = element_text(face = "bold", size = 12)) +
    scale_colour_discrete(name = "p.adjusted < 0.01") +
    theme(legend.title = element_text(face = "bold", size = 15)) +
    theme(legend.text = element_text(size = 14))
  ggsave("MA_plot.png",plot = MAplot,width = 200,height = 200,units ="mm" )
}


#GO enrichment analysis
load("nrDEG.Rdata")
head(nrDEG)
if(T){ 
  suppressMessages(library(org.Hs.eg.db,
                           ggplot2,
                           dplyr,
                           tidyverse))
  nrDEG <- nrDEG %>%
    mutate(v= -log10(adj.P.Val),
           gene_type = case_when(logFC >= 2 & adj.P.Val < 0.01 ~ "up",
                                 logFC <= -2 & adj.P.Val < 0.01 ~ "down",
                                 TRUE ~ "stable")) 
  save(nrDEG,file="deg.Rdata")
}
load("deg.Rdata")
head(nrDEG)

nrDEG$SYMBOL <- rownames(nrDEG)
df <- bitr(nrDEG$SYMBOL, fromType = "SYMBOL",
           toType = c( "ENTREZID"),
           OrgDb = org.Hs.eg.db) 
nrDEG=merge(nrDEG,df,by.y='SYMBOL',by.x='SYMBOL')
gene_up <- nrDEG[nrDEG$gene_type=="up",'ENTREZID']
gene_down <- nrDEG[nrDEG$gene_type=="down",'ENTREZID']
gene_df <- c(gene_up,gene_down)
gene_all<- as.character(nrDEG[,'ENTREZID'])
#GO
if(F){ego_BP <- enrichGO(gene = gene_up,  #entrez gene id
                   keyType = "ENTREZID", 
                   OrgDb= org.Hs.eg.db,
                   ont = "BP",  #生物學(xué)功能
                   pAdjustMethod = "BH", 
                   universe=gene_all,
                   minGSSize = 10,
                   maxGSSize = 500,
                   pvalueCutoff = 0.05,  #adjust pvalue
                   qvalueCutoff = 0.05) #pvalue test value
ego_MF <- enrichGO(gene = gene_up,  #entrez gene id
                   keyType = "ENTREZID", 
                   OrgDb= org.Hs.eg.db,
                   ont = "MF",  #生物學(xué)功能
                   pAdjustMethod = "BH", 
                   universe=gene_all,
                   minGSSize = 10,
                   maxGSSize = 500,
                   pvalueCutoff = 0.05,  #adjust pvalue
                   qvalueCutoff = 0.05) #pvalue test value
ego_MF <- enrichGO(gene = gene_up,  #entrez gene id
                   keyType = "ENTREZID", 
                   OrgDb= org.Hs.eg.db,
                   ont = "CC",  #生物學(xué)功能
                   pAdjustMethod = "BH", 
                   universe=gene_all,
                   minGSSize = 10,
                   maxGSSize = 500,
                   pvalueCutoff = 0.05,  #adjust pvalue
                   qvalueCutoff = 0.05) 
}
ego_all <- enrichGO(gene = gene_df,  #entrez gene id
                   keyType = "ENTREZID", 
                   OrgDb= org.Hs.eg.db,
                   ont = "CC",  #生物學(xué)功能
                   pAdjustMethod = "BH", 
                   universe=gene_all,
                   minGSSize = 10,
                   maxGSSize = 500,
                   pvalueCutoff = 0.05,  #adjust pvalue
                   qvalueCutoff = 0.05,
                   readable = T)
save(ego_all,file = "GO_result.Rdata")

#可視化
dotplot(ego_all)   #氣泡圖
cnetplot(ego_all)
barplot(ego_all)  #條帶圖
cnetplot(ego_all, categorySize="pvalue",colorEdge = TRUE,circular = TRUE) #簡易弦圖

##KEGG
geo_kegg <- enrichKEGG(gene_df,
                       organism = "hsa",keyType = "kegg",
                       pvalueCutoff = 0.05,
                       pAdjustMethod = "BH",
                       universe=gene_all,
                       minGSSize = 10,
                       maxGSSize = 500,
                       qvalueCutoff =2,
                       use_internal_data = FALSE
)
#查看富集結(jié)果
table(geo_kegg@result$p.adjust<0.05)  #1個(gè)
dotplot(geo_kegg)
?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請聯(lián)系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時(shí)請結(jié)合常識與多方信息審慎甄別。
平臺聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點(diǎn),簡書系信息發(fā)布平臺,僅提供信息存儲服務(wù)。

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容