GEO數(shù)據(jù)表達差異分析

1-1從GEO數(shù)據(jù)庫下載數(shù)據(jù)

rm(list = ls())
options(stringsAsFactors = F)
library(GEOquery)
gse = "GSE4107"
eSet <- getGEO(gse, 
               destdir = '.', 
               getGPL = F)

1-2提取表達矩陣exp

exp <- exprs(eSet[[1]])
head(exp)
exp = log2(exp+1)
dim(exp)  #22個樣本,54975個探針
boxplot(exp)  #對樣本的總體表達水平范圍有一個了解
截屏2020-07-18 19.35.13.png

1-3提取各樣本的分組信息

pd <- pData(eSet[[1]])

1-4調(diào)整exp的行名順序與pd列名完全一致

p = identical(rownames(pd),colnames(exp));p
if(!p) exp <- exp[,match(rownames(pd),colnames(exp))]

1-5提取芯片平臺編號,保存

gpl <- eSet[[1]]@annotation

2-1提取分組信息

rm(list = ls())  
load(file = "step1output.Rdata")
library(stringr)
group_list=ifelse(str_detect(pd$title,"control"),"control","treat")
table(group_list)
group_list = factor(group_list,
                    levels = c("control","treat"))

2-2獲取芯片的探針注釋

gpl 
#http://www.bio-info-trainee.com/1399.html
if(!require(hgu133plus2.db))BiocManager::install("hgu133plus2.db")
library(hgu133plus2.db)
ls("package:hgu133plus2.db")
ids <- toTable(hgu133plus2SYMBOL) #toTable轉(zhuǎn)換為矩陣
head(ids)
save(exp,group_list,ids,file = "step2output.Rdata")

3借助PCA和熱圖,進行數(shù)據(jù)檢查
3-1PCA

rm(list = ls())  
load(file = "step1output.Rdata")
load(file = "step2output.Rdata")
dat=as.data.frame(t(exp))  #轉(zhuǎn)換為行是樣本,列是基因
library(FactoMineR)#畫主成分分析圖需要加載這兩個包
library(factoextra) 
# pca
dat.pca <- PCA(dat, graph = FALSE)
pca_plot <- fviz_pca_ind(dat.pca,
                         geom.ind = "point", # show points only (nbut not "text")
                         col.ind = group_list, # color by groups
                         #palette = c("#00AFBB", "#E7B800"),  #修改顏色
                         addEllipses = TRUE, # Concentration ellipses
                         legend.title = "Groups"
)
pca_plot
ggsave(plot = pca_plot,filename = paste0(gse,"PCA.png"))
save(pca_plot,file = "pca_plot.Rdata")
GSE4107PCA.png

3-2熱圖

cg=names(tail(sort(apply(exp,1,sd)),1000))  #對exp按行計算每個探針的標準差,從小到大排序,取標準差最大的前1000個探針,并提取對應的探針名
n=exp[cg,]
#繪制熱圖
annotation_col=data.frame(group=group_list)
rownames(annotation_col)=colnames(n) 
library(pheatmap)
pheatmap(n,
         show_colnames =F,
         show_rownames = F,
         annotation_col=annotation_col,
         scale = "row")
dev.off()

4差異分析
4-1差異基因分析

rm(list = ls()) 
load(file = "step2output.Rdata")
library(limma)
design=model.matrix(~group_list)
fit=lmFit(exp,design)
fit=eBayes(fit)
deg=topTable(fit,coef=2,number = Inf)

4-2加probe_id列,把行名變成一列

library(dplyr)
deg <- mutate(deg,probe_id=rownames(deg));head(deg)

4-3加symbol列,去重復

deg <- inner_join(deg,ids,by="probe_id");head(deg)
deg <- deg[!duplicated(deg$symbol),]

4-4標記上下調(diào)基因

logFC_t=1  #變化超過2倍的視為差異基因
P.Value_t = 0.01  #P值小于等于0.01視為顯著
k1 = (deg$P.Value < P.Value_t)&(deg$logFC < -logFC_t)
k2 = (deg$P.Value < P.Value_t)&(deg$logFC > logFC_t)
change = ifelse(k1,"down",ifelse(k2,"up","stable"))
deg <- mutate(deg,change);head(deg)

4-5加ENTREZID列,用于富集分析

library(ggplot2)
library(clusterProfiler)
library(org.Hs.eg.db)
s2e <- bitr(deg$symbol, 
            fromType = "SYMBOL",
            toType = "ENTREZID",
            OrgDb = org.Hs.eg.db)#人類
deg <- inner_join(deg,s2e,by=c("symbol"="SYMBOL"));head(deg)
save(group_list,deg,logFC_t,P.Value_t,file = "step4output.Rdata")

5畫火山圖

rm(list = ls()) 
load(file = "step1output.Rdata")
load(file = "step4output.Rdata")
library(dplyr)
library(ggplot2)
dat  = deg
for_label <- dat%>% 
  filter(symbol %in% c("TRPM3","SFRP1")) 
p <- ggplot(data = dat, 
            aes(x = logFC, 
                y = -log10(P.Value))) +
  geom_point(alpha=0.4, size=3.5, 
             aes(color=change)) +
  ylab("-log10(Pvalue)")+
  scale_color_manual(values=c("blue", "grey","red"))+
  geom_vline(xintercept=c(-logFC_t,logFC_t),lty=4,col="black",lwd=0.8) +
  geom_hline(yintercept = -log10(P.Value_t),lty=4,col="black",lwd=0.8) +
  theme_bw()
p
volcano_plot <- p +
  geom_point(size = 3, shape = 1, data = for_label) +
  ggrepel::geom_label_repel(
    aes(label = symbol),
    data = for_label,
    color="black"
  )
volcano_plot
ggsave(plot = volcano_plot,filename = paste0(gse,"volcano.png"))
GSE4107volcano.png

6畫熱圖

load(file = 'step2output.Rdata')
x=deg$logFC 
names(x)=deg$probe_id 
cg = deg$probe_id[deg$change !="stable"]
n=exp[cg,]
dim(n)
library(pheatmap)
annotation_col=data.frame(group=group_list)
rownames(annotation_col)=colnames(n) 
library(ggplotify)
heatmap_plot <- as.ggplot(pheatmap(n,show_colnames =F,
                                   show_rownames = F,
                                   scale = "row",
                                   #cluster_cols = F, 
                                   annotation_col=annotation_col)) 
heatmap_plot
ggsave(heatmap_plot,filename = paste0(gse,"heatmap.png"))
GSE4107heatmap.png
test.png
最后編輯于
?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請聯(lián)系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時請結(jié)合常識與多方信息審慎甄別。
平臺聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點,簡書系信息發(fā)布平臺,僅提供信息存儲服務。

相關閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容